xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 525207cd)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
77 
/*
 * Vector register data is kept in host-endian 64-bit chunks, so indexing
 * units narrower than 64 bits needs a fixup on big-endian hosts: the index
 * is XOR-ed with a constant that mirrors the position inside the chunk.
 * On little-endian hosts every macro is the identity.
 */
#if HOST_BIG_ENDIAN
#define VEXT_HOST_IDX(x, flip) ((x) ^ (flip))
#else
#define VEXT_HOST_IDX(x, flip) (x)
#endif

#define H1(x)   VEXT_HOST_IDX(x, 7)
#define H1_2(x) VEXT_HOST_IDX(x, 6)
#define H1_4(x) VEXT_HOST_IDX(x, 4)
#define H2(x)   VEXT_HOST_IDX(x, 3)
#define H4(x)   VEXT_HOST_IDX(x, 1)
#define H8(x)   (x)
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as following:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements can be operated.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
149      * so vlen in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
157 
158 /*
159  * Get number of total elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks watchpoint before real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in TLB
185  * and page table walk can't fill the TLB entry. Then the guest
186  * software can return here after process the exception or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
204 
/*
 * Set agnostic elements to 1s.
 *
 * base:        start of the destination register data.
 * is_agnostic: 0 keeps the bytes undisturbed, non-zero fills them with 1s.
 * cnt:         logical byte offset of the first byte to fill.
 * tot:         logical byte offset one past the last byte to fill.
 *
 * Nothing is written when the range is empty or reversed.
 *
 * Register data is stored in host-endian 64-bit chunks, so on big-endian
 * hosts a logical byte offset x lives at physical offset H1(x).  Partial
 * chunks at either end of the range are therefore filled separately with
 * fixed-up offsets; whole chunks can be filled linearly on any host.
 */
static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
                              uint32_t tot)
{
    uint8_t *bytes = base;

    if (is_agnostic == 0) {
        /* policy undisturbed */
        return;
    }
    if (tot <= cnt) {
        /* empty range; also avoids the unsigned wrap of tot - cnt */
        return;
    }

#if HOST_BIG_ENDIAN
    if (((tot - 1) ^ cnt) < 8) {
        /*
         * The whole range sits inside one 64-bit chunk (this covers the
         * single masked-off element case): logical [cnt, tot) is the
         * physical run starting at H1(tot - 1).
         */
        memset(bytes + H1(tot - 1), -1, tot - cnt);
        return;
    }
    if (cnt % 8 != 0) {
        /* Leading partial chunk: fill up to the next chunk boundary. */
        uint32_t next = (cnt | 7) + 1;

        memset(bytes + H1(next - 1), -1, next - cnt);
        cnt = next;
    }
    if (tot % 8 != 0) {
        /* Trailing partial chunk: fill from the last chunk boundary. */
        uint32_t last = tot & ~7u;

        memset(bytes + H1(tot - 1), -1, tot - last);
        tot = last;
    }
#endif
    /* Remaining range (chunk-aligned on big-endian hosts). */
    memset(bytes + cnt, -1, tot - cnt);
}
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
/*
 * Read the mask bit for element 'index' from the mask register v0,
 * which is treated as an array of 64-bit words.
 *
 * Earlier designs (pre-0.9) had a varying number of bits
 * per mask value (MLEN). In the 0.9 design, MLEN=1.
 * (Section 4.5)
 */
static inline int vext_elem_mask(void *v0, int index)
{
    const uint64_t *words = v0;
    uint64_t word = words[index / 64];

    return (word >> (index % 64)) & 1;
}
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }                                                          \
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector element from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation*/
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
388 
389 /*
390  * masked unit-stride load and store operation will be a special case of stride,
391  * stride = NF * sizeof (MTYPE)
392  */
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
456 
457 /*
458  *** index: access vector element from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load bytes from guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
573 /*
 *** unit-stride fault-only-first load instructions
575  */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access*/
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <=  offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
/*
 * Binary element operations.  Every argument is parenthesised so that
 * compound expressions passed as N or M keep their own precedence.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Min/max; signedness follows the type of the operands */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/*
 * Unsigned min/max: both operands are first cast to UMTYPE, which must be
 * defined where these macros are expanded.
 */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)(N), (UMTYPE)(M))
#define DO_MINU(N, M) DO_MIN((UMTYPE)(N), (UMTYPE)(M))
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store rest of elements of current segment pointed by vstart */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     for (; k < nf; k++) {
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
/*
 * Invoke 'macro' after its arguments have been macro-expanded, so that a
 * type tuple such as OP_SSS_B is spread into separate arguments first.
 */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)

/*
 * Operand type tuples, in the order (TD, T1, T2, TX1, TX2):
 *   TD  - destination element type
 *   T1  - stored type of the first source element
 *   T2  - stored type of the second source element
 *   TX1 - type the first source is converted to before the operation
 *   TX2 - type the second source is converted to before the operation
 */

/* All operands have the same width. */
#define OP_SSS_B  int8_t,   int8_t,   int8_t,   int8_t,   int8_t
#define OP_SSS_H  int16_t,  int16_t,  int16_t,  int16_t,  int16_t
#define OP_SSS_W  int32_t,  int32_t,  int32_t,  int32_t,  int32_t
#define OP_SSS_D  int64_t,  int64_t,  int64_t,  int64_t,  int64_t
#define OP_UUU_B  uint8_t,  uint8_t,  uint8_t,  uint8_t,  uint8_t
#define OP_UUU_H  uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W  uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D  uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B  int8_t,   uint8_t,  int8_t,   uint8_t,  int8_t
#define OP_SUS_H  int16_t,  uint16_t, int16_t,  uint16_t, int16_t
#define OP_SUS_W  int32_t,  uint32_t, int32_t,  uint32_t, int32_t
#define OP_SUS_D  int64_t,  uint64_t, int64_t,  uint64_t, int64_t

/* Destination and working types are twice as wide as the stored sources. */
#define WOP_UUU_B uint16_t, uint8_t,  uint8_t,  uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t,  int8_t,   int8_t,   int16_t,  int16_t
#define WOP_SSS_H int32_t,  int16_t,  int16_t,  int32_t,  int32_t
#define WOP_SSS_W int64_t,  int32_t,  int32_t,  int64_t,  int64_t
#define WOP_SUS_B int16_t,  uint8_t,  int8_t,   uint16_t, int16_t
#define WOP_SUS_H int32_t,  uint16_t, int16_t,  uint32_t, int32_t
#define WOP_SUS_W int64_t,  uint32_t, int32_t,  uint64_t, int64_t
#define WOP_SSU_B int16_t,  int8_t,   uint8_t,  int16_t,  uint16_t
#define WOP_SSU_H int32_t,  int16_t,  uint16_t, int32_t,  uint32_t
#define WOP_SSU_W int64_t,  int32_t,  uint32_t, int64_t,  uint64_t

/* The second source is twice as wide as the destination. */
#define NOP_SSS_B int8_t,   int8_t,   int16_t,  int8_t,   int16_t
#define NOP_SSS_H int16_t,  int16_t,  int32_t,  int16_t,  int32_t
#define NOP_SSS_W int32_t,  int32_t,  int64_t,  int32_t,  int64_t
#define NOP_UUU_B uint8_t,  uint8_t,  uint16_t, uint8_t,  uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
/* Signature of a per-element operation taking two vector sources. */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

/*
 * Define do_NAME(), the body of one element of a two-vector operation.
 *
 * Element i of vs1 is loaded as T1 and converted to TX1, element i of vs2
 * is loaded as T2 and converted to TX2, and OP(s2, s1) is stored as TD.
 * HS1, HS2 and HD map the logical index i to the array index used for the
 * respective register.
 */
#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)     \
{                                                                \
    TX1 s1 = ((T1 *)vs1)[HS1(i)];                                \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                                \
                                                                 \
    ((TD *)vd)[HD(i)] = OP(s2, s1);                              \
}
/*
 * Subtraction and reverse subtraction.  Every argument is parenthesized so
 * that compound expressions passed by a caller keep their own grouping.
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
815 
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operator type.
872  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t vma = vext_vma(desc);
903     uint32_t i;
904 
905     for (i = env->vstart; i < vl; i++) {
906         if (!vm && !vext_elem_mask(v0, i)) {
907             /* set masked-off elements to 1s */
908             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
909             continue;
910         }
911         fn(vd, s1, vs2, i);
912     }
913     env->vstart = 0;
914     /* set tail elements to 1s */
915     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
916 }
917 
918 /* generate the helpers for OPIVX */
919 #define GEN_VEXT_VX(NAME, ESZ)                            \
920 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
921                   void *vs2, CPURISCVState *env,          \
922                   uint32_t desc)                          \
923 {                                                         \
924     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
925                do_##NAME, ESZ);                           \
926 }
927 
928 GEN_VEXT_VX(vadd_vx_b, 1)
929 GEN_VEXT_VX(vadd_vx_h, 2)
930 GEN_VEXT_VX(vadd_vx_w, 4)
931 GEN_VEXT_VX(vadd_vx_d, 8)
932 GEN_VEXT_VX(vsub_vx_b, 1)
933 GEN_VEXT_VX(vsub_vx_h, 2)
934 GEN_VEXT_VX(vsub_vx_w, 4)
935 GEN_VEXT_VX(vsub_vx_d, 8)
936 GEN_VEXT_VX(vrsub_vx_b, 1)
937 GEN_VEXT_VX(vrsub_vx_h, 2)
938 GEN_VEXT_VX(vrsub_vx_w, 4)
939 GEN_VEXT_VX(vrsub_vx_d, 8)
940 
941 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
942 {
943     intptr_t oprsz = simd_oprsz(desc);
944     intptr_t i;
945 
946     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
947         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
948     }
949 }
950 
951 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
957         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
967         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
977         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
978     }
979 }
980 
/* Vector Widening Integer Add/Subtract */

/*
 * Type tuples, in the order (TD, T1, T2, TX1, TX2), for the forms whose
 * second source is already double-width: T1 is the narrow type while TD,
 * T2, TX1 and TX2 are all the wide type.
 *
 * The WOP_UUU_* and WOP_SSS_* tuples used by the forms with two narrow
 * sources are defined once, with the other type tuples earlier in this
 * file, and are not repeated here.
 */
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
994 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1006 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1018 GEN_VEXT_VV(vwaddu_vv_b, 2)
1019 GEN_VEXT_VV(vwaddu_vv_h, 4)
1020 GEN_VEXT_VV(vwaddu_vv_w, 8)
1021 GEN_VEXT_VV(vwsubu_vv_b, 2)
1022 GEN_VEXT_VV(vwsubu_vv_h, 4)
1023 GEN_VEXT_VV(vwsubu_vv_w, 8)
1024 GEN_VEXT_VV(vwadd_vv_b, 2)
1025 GEN_VEXT_VV(vwadd_vv_h, 4)
1026 GEN_VEXT_VV(vwadd_vv_w, 8)
1027 GEN_VEXT_VV(vwsub_vv_b, 2)
1028 GEN_VEXT_VV(vwsub_vv_h, 4)
1029 GEN_VEXT_VV(vwsub_vv_w, 8)
1030 GEN_VEXT_VV(vwaddu_wv_b, 2)
1031 GEN_VEXT_VV(vwaddu_wv_h, 4)
1032 GEN_VEXT_VV(vwaddu_wv_w, 8)
1033 GEN_VEXT_VV(vwsubu_wv_b, 2)
1034 GEN_VEXT_VV(vwsubu_wv_h, 4)
1035 GEN_VEXT_VV(vwsubu_wv_w, 8)
1036 GEN_VEXT_VV(vwadd_wv_b, 2)
1037 GEN_VEXT_VV(vwadd_wv_h, 4)
1038 GEN_VEXT_VV(vwadd_wv_w, 8)
1039 GEN_VEXT_VV(vwsub_wv_b, 2)
1040 GEN_VEXT_VV(vwsub_wv_h, 4)
1041 GEN_VEXT_VV(vwsub_wv_w, 8)
1042 
1043 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1055 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1067 GEN_VEXT_VX(vwaddu_vx_b, 2)
1068 GEN_VEXT_VX(vwaddu_vx_h, 4)
1069 GEN_VEXT_VX(vwaddu_vx_w, 8)
1070 GEN_VEXT_VX(vwsubu_vx_b, 2)
1071 GEN_VEXT_VX(vwsubu_vx_h, 4)
1072 GEN_VEXT_VX(vwsubu_vx_w, 8)
1073 GEN_VEXT_VX(vwadd_vx_b, 2)
1074 GEN_VEXT_VX(vwadd_vx_h, 4)
1075 GEN_VEXT_VX(vwadd_vx_w, 8)
1076 GEN_VEXT_VX(vwsub_vx_b, 2)
1077 GEN_VEXT_VX(vwsub_vx_h, 4)
1078 GEN_VEXT_VX(vwsub_vx_w, 8)
1079 GEN_VEXT_VX(vwaddu_wx_b, 2)
1080 GEN_VEXT_VX(vwaddu_wx_h, 4)
1081 GEN_VEXT_VX(vwaddu_wx_w, 8)
1082 GEN_VEXT_VX(vwsubu_wx_b, 2)
1083 GEN_VEXT_VX(vwsubu_wx_h, 4)
1084 GEN_VEXT_VX(vwsubu_wx_w, 8)
1085 GEN_VEXT_VX(vwadd_wx_b, 2)
1086 GEN_VEXT_VX(vwadd_wx_h, 4)
1087 GEN_VEXT_VX(vwadd_wx_w, 8)
1088 GEN_VEXT_VX(vwsub_wx_b, 2)
1089 GEN_VEXT_VX(vwsub_wx_h, 4)
1090 GEN_VEXT_VX(vwsub_wx_w, 8)
1091 
/*
 * Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
 *
 * N and M are the operands, C is the carry/borrow in.  Every argument is
 * parenthesized so that compound expressions keep their own grouping.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1095 
1096 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1098                   CPURISCVState *env, uint32_t desc)          \
1099 {                                                             \
1100     uint32_t vl = env->vl;                                    \
1101     uint32_t esz = sizeof(ETYPE);                             \
1102     uint32_t total_elems =                                    \
1103         vext_get_total_elems(env, desc, esz);                 \
1104     uint32_t vta = vext_vta(desc);                            \
1105     uint32_t i;                                               \
1106                                                               \
1107     for (i = env->vstart; i < vl; i++) {                      \
1108         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1109         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1110         ETYPE carry = vext_elem_mask(v0, i);                  \
1111                                                               \
1112         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1113     }                                                         \
1114     env->vstart = 0;                                          \
1115     /* set tail elements to 1s */                             \
1116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1117 }
1118 
1119 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1120 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1123 
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1128 
1129 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1130 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1131                   CPURISCVState *env, uint32_t desc)                     \
1132 {                                                                        \
1133     uint32_t vl = env->vl;                                               \
1134     uint32_t esz = sizeof(ETYPE);                                        \
1135     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1136     uint32_t vta = vext_vta(desc);                                       \
1137     uint32_t i;                                                          \
1138                                                                          \
1139     for (i = env->vstart; i < vl; i++) {                                 \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1141         ETYPE carry = vext_elem_mask(v0, i);                             \
1142                                                                          \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1144     }                                                                    \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                                        \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1148 }
1149 
1150 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1159 
/*
 * Carry-out of N + M + C and borrow-out of N - M - C for unsigned operands.
 *
 * DO_MADC truncates the sum to the type of N and reports a carry when the
 * truncated sum wrapped below N (or down to N when a carry came in).
 * DO_MSBC reports a borrow when N is smaller than M (or equal to it when a
 * borrow came in).  Every argument is parenthesized so that compound
 * expressions keep their own grouping.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :    \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1163 
1164 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1165 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1166                   CPURISCVState *env, uint32_t desc)          \
1167 {                                                             \
1168     uint32_t vl = env->vl;                                    \
1169     uint32_t vm = vext_vm(desc);                              \
1170     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1171     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1172     uint32_t i;                                               \
1173                                                               \
1174     for (i = env->vstart; i < vl; i++) {                      \
1175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1178         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1179     }                                                         \
1180     env->vstart = 0;                                          \
1181     /* mask destination register are always tail-agnostic */  \
1182     /* set tail elements to 1s */                             \
1183     if (vta_all_1s) {                                         \
1184         for (; i < total_elems; i++) {                        \
1185             vext_set_elem_mask(vd, i, 1);                     \
1186         }                                                     \
1187     }                                                         \
1188 }
1189 
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1191 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1192 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1193 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1194 
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1196 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1197 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1198 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1199 
1200 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1202                   void *vs2, CPURISCVState *env, uint32_t desc) \
1203 {                                                               \
1204     uint32_t vl = env->vl;                                      \
1205     uint32_t vm = vext_vm(desc);                                \
1206     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1207     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1208     uint32_t i;                                                 \
1209                                                                 \
1210     for (i = env->vstart; i < vl; i++) {                        \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1213         vext_set_elem_mask(vd, i,                               \
1214                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1215     }                                                           \
1216     env->vstart = 0;                                            \
1217     /* mask destination register are always tail-agnostic */    \
1218     /* set tail elements to 1s */                               \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
1236 /* Vector Bitwise Logical Instructions */
1237 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249 GEN_VEXT_VV(vand_vv_b, 1)
1250 GEN_VEXT_VV(vand_vv_h, 2)
1251 GEN_VEXT_VV(vand_vv_w, 4)
1252 GEN_VEXT_VV(vand_vv_d, 8)
1253 GEN_VEXT_VV(vor_vv_b, 1)
1254 GEN_VEXT_VV(vor_vv_h, 2)
1255 GEN_VEXT_VV(vor_vv_w, 4)
1256 GEN_VEXT_VV(vor_vv_d, 8)
1257 GEN_VEXT_VV(vxor_vv_b, 1)
1258 GEN_VEXT_VV(vxor_vv_h, 2)
1259 GEN_VEXT_VV(vxor_vv_w, 4)
1260 GEN_VEXT_VV(vxor_vv_d, 8)
1261 
1262 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274 GEN_VEXT_VX(vand_vx_b, 1)
1275 GEN_VEXT_VX(vand_vx_h, 2)
1276 GEN_VEXT_VX(vand_vx_w, 4)
1277 GEN_VEXT_VX(vand_vx_d, 8)
1278 GEN_VEXT_VX(vor_vx_b, 1)
1279 GEN_VEXT_VX(vor_vx_h, 2)
1280 GEN_VEXT_VX(vor_vx_w, 4)
1281 GEN_VEXT_VX(vor_vx_d, 8)
1282 GEN_VEXT_VX(vxor_vx_b, 1)
1283 GEN_VEXT_VX(vxor_vx_h, 2)
1284 GEN_VEXT_VX(vxor_vx_w, 4)
1285 GEN_VEXT_VX(vxor_vx_d, 8)
1286 
/*
 * Vector Single-Width Bit Shift Instructions
 *
 * N is the value being shifted and M the shift amount.  Both arguments are
 * parenthesized so that compound expressions keep their own grouping.
 * DO_SRL is a plain C right shift, so whether it is logical or arithmetic
 * follows from the type of N.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1290 
1291 /* generate the helpers for shift instructions with two vector operators */
1292 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1293 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1295 {                                                                         \
1296     uint32_t vm = vext_vm(desc);                                          \
1297     uint32_t vl = env->vl;                                                \
1298     uint32_t esz = sizeof(TS1);                                           \
1299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300     uint32_t vta = vext_vta(desc);                                        \
1301     uint32_t vma = vext_vma(desc);                                        \
1302     uint32_t i;                                                           \
1303                                                                           \
1304     for (i = env->vstart; i < vl; i++) {                                  \
1305         if (!vm && !vext_elem_mask(v0, i)) {                              \
1306             /* set masked-off elements to 1s */                           \
1307             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1308             continue;                                                     \
1309         }                                                                 \
1310         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1311         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1312         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1313     }                                                                     \
1314     env->vstart = 0;                                                      \
1315     /* set tail elements to 1s */                                         \
1316     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1317 }
1318 
1319 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1320 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1321 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1322 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1323 
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1325 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1326 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1327 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1328 
1329 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1330 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1331 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1332 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1333 
/*
 * Generate the helper NAME for a shift with one vector and one scalar
 * operand.
 *
 * TD/HD describe the destination vd; TS2/HS2 describe the shifted source
 * vs2.  For each element from env->vstart up to env->vl that is active
 * (vm set, or its bit in v0 set), vd = OP(vs2, s1 & MASK); inactive
 * elements go through vext_set_elems_1s() with the vma flag.  Afterwards
 * vstart is reset to 0 and the tail bytes go through vext_set_elems_1s()
 * with the vta flag.
 */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK)              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,                   \
        void *vs2, CPURISCVState *env, uint32_t desc)                    \
{                                                                        \
    const uint32_t esz = sizeof(TD);                                     \
    const uint32_t vl = env->vl;                                         \
    const uint32_t vm = vext_vm(desc);                                   \
    const uint32_t vma = vext_vma(desc);                                 \
    const uint32_t vta = vext_vta(desc);                                 \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);   \
    uint32_t idx;                                                        \
                                                                         \
    for (idx = env->vstart; idx < vl; idx++) {                           \
        if (vm || vext_elem_mask(v0, idx)) {                             \
            TS2 value = ((TS2 *)vs2)[HS2(idx)];                          \
                                                                         \
            ((TD *)vd)[HD(idx)] = OP(value, s1 & MASK);                  \
        } else {                                                         \
            /* set masked-off elements to 1s */                          \
            vext_set_elems_1s(vd, vma, idx * esz, (idx + 1) * esz);      \
        }                                                                \
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
}
1362 
1363 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1367 
1368 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1369 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1372 
1373 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1374 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1375 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1376 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1377 
1378 /* Vector Narrowing Integer Right Shift Instructions */
1379 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1380 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1381 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1382 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1383 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1384 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1385 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1386 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1387 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1388 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1389 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1390 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1391 
/*
 * Vector Integer Comparison Instructions
 *
 * Each macro yields 1 when the relation holds between N and M, else 0.
 * Both arguments are parenthesized so that compound expressions keep their
 * own grouping.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1398 
/*
 * Generate the helper NAME for a vector-vector comparison writing a mask.
 *
 * For each element from env->vstart up to env->vl: when the element is
 * active (vm set, or its bit in v0 set) the mask bit of vd is set to
 * DO_OP(vs2, vs1); when it is inactive the mask bit is set to 1 if vma is
 * set and left alone otherwise.  Afterwards vstart is reset to 0.  When
 * vext_vta_all_1s(desc) is set, the remaining mask bits up to the CPU's
 * cfg.vlen are set to 1.
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    const uint32_t vl = env->vl;                                    \
    const uint32_t vm = vext_vm(desc);                              \
    const uint32_t vma = vext_vma(desc);                            \
    const uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    const uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
    uint32_t idx = env->vstart;                                     \
                                                                    \
    while (idx < vl) {                                              \
        ETYPE rhs = ((ETYPE *)vs1)[H(idx)];                         \
        ETYPE lhs = ((ETYPE *)vs2)[H(idx)];                         \
                                                                    \
        if (vm || vext_elem_mask(v0, idx)) {                        \
            vext_set_elem_mask(vd, idx, DO_OP(lhs, rhs));           \
        } else if (vma) {                                           \
            /* set masked-off elements to 1s */                     \
            vext_set_elem_mask(vd, idx, 1);                         \
        }                                                           \
        idx++;                                                      \
    }                                                               \
    env->vstart = 0;                                                \
    /* mask destination registers are always tail-agnostic */       \
    /* set tail elements to 1s */                                   \
    if (vta_all_1s) {                                               \
        while (idx < total_elems) {                                 \
            vext_set_elem_mask(vd, idx, 1);                         \
            idx++;                                                  \
        }                                                           \
    }                                                               \
}
1431 
/*
 * Vector-vector compare helpers, one per element width (b/h/w/d).
 * Arguments: helper name, element type, H index macro, predicate.
 */
/* vmseq.vv: equality, on unsigned element types */
1432 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1433 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1434 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1435 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1436 
/* vmsne.vv: inequality, on unsigned element types */
1437 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1438 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1439 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1440 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1441 
/*
 * vmsltu.vv / vmslt.vv share DO_MSLT; only the element type differs
 * (uintN_t vs intN_t), which makes the C comparison unsigned or signed.
 */
1442 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1443 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1444 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1445 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1446 
1447 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1448 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1449 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1450 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1451 
/* vmsleu.vv / vmsle.vv share DO_MSLE in the same unsigned/signed split */
1452 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1453 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1454 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1455 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1456 
1457 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1458 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1459 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1460 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1461 
/*
 * GEN_VEXT_CMP_VX - emit the helper for a vector-scalar integer compare.
 *
 *   NAME:  helper name
 *   ETYPE: C type of the vs2 elements; the scalar s1 is converted to it
 *          through target_long before comparing
 *   H:     index macro applied to the element number
 *   DO_OP: predicate, invoked as DO_OP(vs2 element, converted scalar)
 *
 * For each element from env->vstart up to env->vl the predicate result is
 * stored in the matching mask bit of vd.  When vext_vm(desc) is zero,
 * elements whose v0 mask bit is clear are left alone, or set to 1 when
 * vext_vma(desc) is non-zero.  Afterwards env->vstart is reset to 0 and,
 * if vext_vta_all_1s(desc) is non-zero, every remaining mask bit up to
 * cfg.vlen is set to 1.
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                       \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,    \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    const uint32_t unmasked = vext_vm(desc);                         \
    const uint32_t active_len = env->vl;                             \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;           \
    const uint32_t fill_tail = vext_vta_all_1s(desc);                \
    const uint32_t fill_inactive = vext_vma(desc);                   \
    const ETYPE scalar = (ETYPE)(target_long)s1;                     \
    const ETYPE *src2 = vs2;                                         \
    uint32_t idx = env->vstart;                                      \
                                                                     \
    while (idx < active_len) {                                       \
        ETYPE lhs = src2[H(idx)];                                    \
                                                                     \
        if (unmasked || vext_elem_mask(v0, idx)) {                   \
            vext_set_elem_mask(vd, idx, DO_OP(lhs, scalar));         \
        } else if (fill_inactive) {                                  \
            /* mask-agnostic: inactive elements become 1 */          \
            vext_set_elem_mask(vd, idx, 1);                          \
        }                                                            \
        idx++;                                                       \
    }                                                                \
    env->vstart = 0;                                                 \
    /* a mask destination is always treated as tail-agnostic */      \
    if (fill_tail) {                                                 \
        while (idx < mask_bits) {                                    \
            vext_set_elem_mask(vd, idx, 1);                          \
            idx++;                                                   \
        }                                                            \
    }                                                                \
}
1494 
/*
 * Vector-scalar compare helpers, one per element width (b/h/w/d).
 * Arguments: helper name, element type, H index macro, predicate.
 * Signed and unsigned variants of an instruction use the same predicate
 * and differ only in the element type (intN_t vs uintN_t).
 */
/* vmseq.vx */
1495 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1496 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1497 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1498 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1499 
/* vmsne.vx */
1500 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1501 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1502 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1503 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1504 
/* vmsltu.vx (unsigned) and vmslt.vx (signed) */
1505 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1506 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1507 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1508 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1509 
1510 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1511 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1512 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1513 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1514 
/* vmsleu.vx (unsigned) and vmsle.vx (signed) */
1515 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1516 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1517 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1518 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1519 
1520 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1521 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1522 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1523 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1524 
/* vmsgtu.vx (unsigned) and vmsgt.vx (signed); DO_MSGT is only used here */
1525 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1526 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1527 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1528 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1529 
1530 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1531 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1532 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1533 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1534 
1535 /* Vector Integer Min/Max Instructions */
/*
 * Vector-vector forms.  Every instruction name appears twice: once in an
 * RVVCALL(OPIVV2, ...) line that binds it to DO_MIN or DO_MAX, and once in
 * a GEN_VEXT_VV line.  Unsigned (vminu/vmaxu) and signed (vmin/vmax)
 * variants use the same operation and differ only in the OP_UUU_* vs
 * OP_SSS_* type list.
 * NOTE(review): OPIVV2, GEN_VEXT_VV, DO_MIN and DO_MAX are defined outside
 * this excerpt; presumably the RVVCALL line creates the per-element
 * function and GEN_VEXT_VV wraps it in the helper -- confirm.
 */
1536 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1537 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1538 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1539 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1540 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1541 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1542 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1543 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1544 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1545 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1546 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1547 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1548 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1549 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1550 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1551 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
/*
 * The second GEN_VEXT_VV argument is 1/2/4/8 for the b/h/w/d variants.
 * NOTE(review): presumably the element size in bytes -- confirm.
 */
1552 GEN_VEXT_VV(vminu_vv_b, 1)
1553 GEN_VEXT_VV(vminu_vv_h, 2)
1554 GEN_VEXT_VV(vminu_vv_w, 4)
1555 GEN_VEXT_VV(vminu_vv_d, 8)
1556 GEN_VEXT_VV(vmin_vv_b, 1)
1557 GEN_VEXT_VV(vmin_vv_h, 2)
1558 GEN_VEXT_VV(vmin_vv_w, 4)
1559 GEN_VEXT_VV(vmin_vv_d, 8)
1560 GEN_VEXT_VV(vmaxu_vv_b, 1)
1561 GEN_VEXT_VV(vmaxu_vv_h, 2)
1562 GEN_VEXT_VV(vmaxu_vv_w, 4)
1563 GEN_VEXT_VV(vmaxu_vv_d, 8)
1564 GEN_VEXT_VV(vmax_vv_b, 1)
1565 GEN_VEXT_VV(vmax_vv_h, 2)
1566 GEN_VEXT_VV(vmax_vv_w, 4)
1567 GEN_VEXT_VV(vmax_vv_d, 8)
1568 
/*
 * Vector-scalar min/max forms: the same instruction set as the .vv forms,
 * bound through OPIVX2 (two H macros instead of three) and GEN_VEXT_VX.
 * NOTE(review): OPIVX2 and GEN_VEXT_VX are defined outside this excerpt;
 * the 1/2/4/8 argument is presumably the element size in bytes -- confirm.
 */
1569 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1570 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1571 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1572 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1573 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1574 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1575 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1576 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1577 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1578 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1579 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1580 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1581 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1582 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1583 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1584 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1585 GEN_VEXT_VX(vminu_vx_b, 1)
1586 GEN_VEXT_VX(vminu_vx_h, 2)
1587 GEN_VEXT_VX(vminu_vx_w, 4)
1588 GEN_VEXT_VX(vminu_vx_d, 8)
1589 GEN_VEXT_VX(vmin_vx_b, 1)
1590 GEN_VEXT_VX(vmin_vx_h, 2)
1591 GEN_VEXT_VX(vmin_vx_w, 4)
1592 GEN_VEXT_VX(vmin_vx_d, 8)
1593 GEN_VEXT_VX(vmaxu_vx_b, 1)
1594 GEN_VEXT_VX(vmaxu_vx_h, 2)
1595 GEN_VEXT_VX(vmaxu_vx_w, 4)
1596 GEN_VEXT_VX(vmaxu_vx_d, 8)
1597 GEN_VEXT_VX(vmax_vx_b, 1)
1598 GEN_VEXT_VX(vmax_vx_h, 2)
1599 GEN_VEXT_VX(vmax_vx_w, 4)
1600 GEN_VEXT_VX(vmax_vx_d, 8)
1601 
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Plain product of the two operands, computed in whatever type C's usual
 * arithmetic conversions give them.  The arguments are parenthesized so a
 * compound expression such as "a + b" multiplies as a unit.
 */
#define DO_MUL(N, M) ((N) * (M))
/*
 * vmul.vv for each element width: the RVVCALL(OPIVV2, ...) line binds the
 * name to DO_MUL with the OP_SSS_* type list, and the GEN_VEXT_VV line
 * instantiates the helper of the same name.
 * NOTE(review): the 1/2/4/8 argument is presumably the element size in
 * bytes -- confirm against GEN_VEXT_VV.
 */
1604 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1605 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1606 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1607 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1608 GEN_VEXT_VV(vmul_vv_b, 1)
1609 GEN_VEXT_VV(vmul_vv_h, 2)
1610 GEN_VEXT_VV(vmul_vv_w, 4)
1611 GEN_VEXT_VV(vmul_vv_d, 8)
1612 
/*
 * Upper 8 bits of the signed 8x8-bit product s2 * s1.
 * The operands are widened first, so the full 16-bit product is available
 * before the low byte is shifted out.
 */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    int product = (int)s2 * (int)s1;

    return (int8_t)(product >> 8);
}
1617 
/*
 * Upper 16 bits of the signed 16x16-bit product s2 * s1, obtained from the
 * full product computed in 32 bits.
 */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return (int16_t)(product >> 16);
}
1622 
/*
 * Upper 32 bits of the signed 32x32-bit product s2 * s1, obtained from the
 * full product computed in 64 bits.
 */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return (int32_t)(product >> 32);
}
1627 
/*
 * Upper 64 bits of the signed 64x64-bit product of s2 and s1.
 * The multiply is delegated to muls64(), which stores its two result words
 * through the pointers it is given; only the word stored through the
 * second pointer is returned.
 * NOTE(review): muls64() is declared outside this file excerpt; presumably
 * it yields the low and high halves of the 128-bit signed product, in that
 * order -- confirm against its declaration.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return (int64_t)high;
}
1635 
/*
 * Upper 8 bits of the unsigned 8x8-bit product s2 * s1.
 * The product of two bytes is at most 65025, so it fits a plain int.
 */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    int product = (int)s2 * (int)s1;

    return (uint8_t)(product >> 8);
}
1640 
/*
 * Upper 16 bits of the unsigned 16x16-bit product s2 * s1, obtained from
 * the full product computed in 32 bits.
 */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return (uint16_t)(product >> 16);
}
1645 
/*
 * Upper 32 bits of the unsigned 32x32-bit product s2 * s1, obtained from
 * the full product computed in 64 bits.
 */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return (uint32_t)(product >> 32);
}
1650 
/*
 * Upper 64 bits of the unsigned 64x64-bit product of s2 and s1.
 * The multiply is delegated to mulu64(), which stores its two result words
 * through the pointers it is given; only the word stored through the
 * second pointer is returned.
 * NOTE(review): mulu64() is declared outside this file excerpt; presumably
 * it yields the low and high halves of the 128-bit unsigned product, in
 * that order -- confirm against its declaration.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1658 
/*
 * Upper 8 bits of the product of a signed byte (s2) and an unsigned byte
 * (s1).  Both operands fit a plain int, so the product is exact and the
 * shift discards the low byte.
 */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    int product = (int)s2 * (int)s1;

    return (int8_t)(product >> 8);
}
1663 
/*
 * Upper 16 bits of the product of a signed halfword (s2) and an unsigned
 * halfword (s1).  Mixing int32_t with uint32_t makes the multiply an
 * unsigned 32-bit one, i.e. the product modulo 2 ** 32; its top 16 bits
 * are the same bit pattern as the high half of the exact signed product.
 */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    uint32_t product = (int32_t)s2 * (uint32_t)s1;

    return (int16_t)(product >> 16);
}
1668 
/*
 * Upper 32 bits of the product of a signed word (s2) and an unsigned word
 * (s1).  Mixing int64_t with uint64_t makes the multiply an unsigned
 * 64-bit one, i.e. the product modulo 2 ** 64; its top 32 bits are the
 * same bit pattern as the high half of the exact signed product.
 */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    uint64_t product = (int64_t)s2 * (uint64_t)s1;

    return (int32_t)(product >> 32);
}
1673 
/*
 * Upper 64 bits of the product of a signed (s2) and an unsigned (s1)
 * 64-bit operand.
 *
 * Let  A = signed operand,
 *      B = unsigned operand,
 *      U = the bit pattern of A read as an unsigned 64-bit number,
 *      P = mulu64(A, B) = U * B, the unsigned product.
 *
 *      IF A >= 0
 *          U = A,            so A * B = P
 *      ELSE
 *          U = A + 2 ** 64,  so A * B = P - 2 ** 64 * B
 *
 * Subtracting 2 ** 64 * B only changes the high word, THEN
 *      HI_P -= (A < 0 ? B : 0)
 */

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    /* correct the unsigned high word when the signed operand is negative */
    if (s2 < 0) {
        high -= s1;
    }
    return (int64_t)high;
}
1702 
/*
 * Vector-vector multiply-high forms.  Each name is bound to the matching
 * do_mulh_*, do_mulhu_* or do_mulhsu_* function of its element width:
 *   vmulh   -> OP_SSS_* type list, do_mulh_*
 *   vmulhu  -> OP_UUU_* type list, do_mulhu_*
 *   vmulhsu -> OP_SUS_* type list, do_mulhsu_*
 * followed by one GEN_VEXT_VV instantiation per name.
 * NOTE(review): OPIVV2 and GEN_VEXT_VV are defined outside this excerpt;
 * the 1/2/4/8 argument is presumably the element size in bytes -- confirm.
 */
1703 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1704 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1705 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1706 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1707 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1708 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1709 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1710 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1711 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1712 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1713 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1714 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1715 GEN_VEXT_VV(vmulh_vv_b, 1)
1716 GEN_VEXT_VV(vmulh_vv_h, 2)
1717 GEN_VEXT_VV(vmulh_vv_w, 4)
1718 GEN_VEXT_VV(vmulh_vv_d, 8)
1719 GEN_VEXT_VV(vmulhu_vv_b, 1)
1720 GEN_VEXT_VV(vmulhu_vv_h, 2)
1721 GEN_VEXT_VV(vmulhu_vv_w, 4)
1722 GEN_VEXT_VV(vmulhu_vv_d, 8)
1723 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1724 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1725 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1726 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1727 
/*
 * Vector-scalar multiply and multiply-high forms: vmul uses DO_MUL, the
 * others use the do_mulh_*, do_mulhu_* and do_mulhsu_* functions, with the
 * same OP_SSS / OP_UUU / OP_SUS type lists as the .vv forms.  One
 * GEN_VEXT_VX instantiation per name follows.
 * NOTE(review): OPIVX2 and GEN_VEXT_VX are defined outside this excerpt;
 * the 1/2/4/8 argument is presumably the element size in bytes -- confirm.
 */
1728 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1729 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1730 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1731 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1732 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1733 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1734 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1735 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1736 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1737 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1738 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1739 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1740 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1741 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1742 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1743 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1744 GEN_VEXT_VX(vmul_vx_b, 1)
1745 GEN_VEXT_VX(vmul_vx_h, 2)
1746 GEN_VEXT_VX(vmul_vx_w, 4)
1747 GEN_VEXT_VX(vmul_vx_d, 8)
1748 GEN_VEXT_VX(vmulh_vx_b, 1)
1749 GEN_VEXT_VX(vmulh_vx_h, 2)
1750 GEN_VEXT_VX(vmulh_vx_w, 4)
1751 GEN_VEXT_VX(vmulh_vx_d, 8)
1752 GEN_VEXT_VX(vmulhu_vx_b, 1)
1753 GEN_VEXT_VX(vmulhu_vx_h, 2)
1754 GEN_VEXT_VX(vmulhu_vx_w, 4)
1755 GEN_VEXT_VX(vmulhu_vx_d, 8)
1756 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1757 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1758 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1759 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1760 
/* Vector Integer Divide Instructions */
/*
 * Division and remainder with the special cases handled explicitly, where
 * N is the dividend and M the divisor:
 *   - divisor 0:             quotient is all ones (-1 cast to N's type),
 *                            remainder is the dividend;
 *   - N == -N and M == -1:   quotient is the dividend, remainder is 0, so
 *                            the native "/" and "%" are never reached in
 *                            the signed-overflow case.
 * All arguments are parenthesized so compound expressions keep their
 * grouping.  Arguments are evaluated more than once, so they must be free
 * of side effects.
 * NOTE(review): "-(N)" for the most negative value overflows in ISO C;
 * presumably the build uses wrapping signed arithmetic -- confirm.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) :          \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?          \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) :                        \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?          \
        0 : (N) % (M))
1768 
/*
 * Vector-vector divide/remainder forms.  Unsigned names (vdivu, vremu) use
 * DO_DIVU / DO_REMU with the OP_UUU_* type lists; signed names (vdiv,
 * vrem) use DO_DIV / DO_REM with the OP_SSS_* type lists.  One GEN_VEXT_VV
 * instantiation per name follows.
 * NOTE(review): OPIVV2 and GEN_VEXT_VV are defined outside this excerpt;
 * the 1/2/4/8 argument is presumably the element size in bytes -- confirm.
 */
1769 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1770 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1771 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1772 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1773 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1774 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1775 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1776 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1777 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1778 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1779 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1780 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1781 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1782 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1783 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1784 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1785 GEN_VEXT_VV(vdivu_vv_b, 1)
1786 GEN_VEXT_VV(vdivu_vv_h, 2)
1787 GEN_VEXT_VV(vdivu_vv_w, 4)
1788 GEN_VEXT_VV(vdivu_vv_d, 8)
1789 GEN_VEXT_VV(vdiv_vv_b, 1)
1790 GEN_VEXT_VV(vdiv_vv_h, 2)
1791 GEN_VEXT_VV(vdiv_vv_w, 4)
1792 GEN_VEXT_VV(vdiv_vv_d, 8)
1793 GEN_VEXT_VV(vremu_vv_b, 1)
1794 GEN_VEXT_VV(vremu_vv_h, 2)
1795 GEN_VEXT_VV(vremu_vv_w, 4)
1796 GEN_VEXT_VV(vremu_vv_d, 8)
1797 GEN_VEXT_VV(vrem_vv_b, 1)
1798 GEN_VEXT_VV(vrem_vv_h, 2)
1799 GEN_VEXT_VV(vrem_vv_w, 4)
1800 GEN_VEXT_VV(vrem_vv_d, 8)
1801 
/*
 * Vector-scalar divide/remainder forms, using the same operation and type
 * list per name as the .vv forms, bound through OPIVX2 and GEN_VEXT_VX.
 * NOTE(review): OPIVX2 and GEN_VEXT_VX are defined outside this excerpt;
 * the 1/2/4/8 argument is presumably the element size in bytes -- confirm.
 */
1802 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1803 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1804 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1805 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1806 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1807 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1808 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1809 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1810 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1811 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1812 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1813 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1814 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1815 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1816 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1817 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1818 GEN_VEXT_VX(vdivu_vx_b, 1)
1819 GEN_VEXT_VX(vdivu_vx_h, 2)
1820 GEN_VEXT_VX(vdivu_vx_w, 4)
1821 GEN_VEXT_VX(vdivu_vx_d, 8)
1822 GEN_VEXT_VX(vdiv_vx_b, 1)
1823 GEN_VEXT_VX(vdiv_vx_h, 2)
1824 GEN_VEXT_VX(vdiv_vx_w, 4)
1825 GEN_VEXT_VX(vdiv_vx_d, 8)
1826 GEN_VEXT_VX(vremu_vx_b, 1)
1827 GEN_VEXT_VX(vremu_vx_h, 2)
1828 GEN_VEXT_VX(vremu_vx_w, 4)
1829 GEN_VEXT_VX(vremu_vx_d, 8)
1830 GEN_VEXT_VX(vrem_vx_b, 1)
1831 GEN_VEXT_VX(vrem_vx_h, 2)
1832 GEN_VEXT_VX(vrem_vx_w, 4)
1833 GEN_VEXT_VX(vrem_vx_d, 8)
1834 
1835 /* Vector Widening Integer Multiply Instructions */
/*
 * Vector-vector widening multiplies.  All three families use DO_MUL and
 * differ in the WOP_SSS / WOP_UUU / WOP_SUS type list.  The first H macro
 * is one step wider than the other two (H2 vs H1, H4 vs H2, H8 vs H4), and
 * GEN_VEXT_VV is given 2/4/8 for the b/h/w variants; there is no _d
 * variant.
 * NOTE(review): the WOP_* lists are defined outside this excerpt;
 * presumably they widen the operands to the double-width destination type
 * before DO_MUL is applied, and 2/4/8 is the destination element size in
 * bytes -- confirm.
 */
1836 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1837 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1838 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1839 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1840 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1841 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1842 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1843 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1844 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1845 GEN_VEXT_VV(vwmul_vv_b, 2)
1846 GEN_VEXT_VV(vwmul_vv_h, 4)
1847 GEN_VEXT_VV(vwmul_vv_w, 8)
1848 GEN_VEXT_VV(vwmulu_vv_b, 2)
1849 GEN_VEXT_VV(vwmulu_vv_h, 4)
1850 GEN_VEXT_VV(vwmulu_vv_w, 8)
1851 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1852 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1853 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1854 
/*
 * Vector-scalar widening multiplies: same operation (DO_MUL) and WOP_*
 * type lists as the .vv forms, bound through OPIVX2 and GEN_VEXT_VX.
 * NOTE(review): the 2/4/8 argument is presumably the destination element
 * size in bytes -- confirm against GEN_VEXT_VX.
 */
1855 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1856 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1857 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1858 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1859 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1860 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1861 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1862 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1863 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1864 GEN_VEXT_VX(vwmul_vx_b, 2)
1865 GEN_VEXT_VX(vwmul_vx_h, 4)
1866 GEN_VEXT_VX(vwmul_vx_w, 8)
1867 GEN_VEXT_VX(vwmulu_vx_b, 2)
1868 GEN_VEXT_VX(vwmulu_vx_h, 4)
1869 GEN_VEXT_VX(vwmulu_vx_w, 8)
1870 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1871 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1872 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1873 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-vector operation.
 *
 * Element i of vs1 is read as T1 and held as TX1, element i of vs2 is read
 * as T2 and held as TX2, and the current destination element is read as
 * TD.  The destination element is then overwritten with
 * OP(vs2 element, vs1 element, old destination).  HD, HS1 and HS2 are the
 * index macros applied to i for vd, vs1 and vs2.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 src1 = ((T1 *)vs1)[HS1(i)];                                \
    TX2 src2 = ((T2 *)vs2)[HS2(i)];                                \
    TD acc = *dest;                                                \
                                                                   \
    *dest = OP(src2, src1, acc);                                   \
}
1883 
/*
 * Multiply-add operations for OPIVV3 / OPIVX3, which invoke them as
 * OP(vs2 element, vs1 element or scalar, old destination) = OP(N, M, D):
 *   DO_MACC:   M * N + D
 *   DO_NMSAC:  -(M * N) + D
 *   DO_MADD:   M * D + N
 *   DO_NMSUB:  -(M * D) + N
 * Every argument is parenthesized so compound expressions keep their
 * grouping; each argument is evaluated exactly once.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
/*
 * Vector-vector multiply-add forms.  Each RVVCALL(OPIVV3, ...) line
 * defines do_<name>() from the OP_SSS_* type list and one of DO_MACC,
 * DO_NMSAC, DO_MADD or DO_NMSUB; the GEN_VEXT_VV lines instantiate the
 * helper for each name.
 * NOTE(review): GEN_VEXT_VV is defined outside this excerpt; the 1/2/4/8
 * argument is presumably the element size in bytes -- confirm.
 */
1888 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1889 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1890 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1891 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1892 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1897 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1898 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1899 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1900 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VV(vmacc_vv_b, 1)
1905 GEN_VEXT_VV(vmacc_vv_h, 2)
1906 GEN_VEXT_VV(vmacc_vv_w, 4)
1907 GEN_VEXT_VV(vmacc_vv_d, 8)
1908 GEN_VEXT_VV(vnmsac_vv_b, 1)
1909 GEN_VEXT_VV(vnmsac_vv_h, 2)
1910 GEN_VEXT_VV(vnmsac_vv_w, 4)
1911 GEN_VEXT_VV(vnmsac_vv_d, 8)
1912 GEN_VEXT_VV(vmadd_vv_b, 1)
1913 GEN_VEXT_VV(vmadd_vv_h, 2)
1914 GEN_VEXT_VV(vmadd_vv_w, 4)
1915 GEN_VEXT_VV(vmadd_vv_d, 8)
1916 GEN_VEXT_VV(vnmsub_vv_b, 1)
1917 GEN_VEXT_VV(vnmsub_vv_h, 2)
1918 GEN_VEXT_VV(vnmsub_vv_w, 4)
1919 GEN_VEXT_VV(vnmsub_vv_d, 8)
1920 
/*
 * OPIVX3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar operation.
 *
 * The scalar s1 is converted to T1 and then to TX1; element i of vs2 is
 * read as T2 and held as TX2; the current destination element is read as
 * TD.  The destination element is then overwritten with
 * OP(vs2 element, converted scalar, old destination).  HD and HS2 are the
 * index macros applied to i for vd and vs2.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TX2 src2 = ((T2 *)vs2)[HS2(i)];                                 \
    TD acc = *dest;                                                 \
                                                                    \
    *dest = OP(src2, scalar, acc);                                  \
}
1928 
/*
 * Vector-scalar multiply-add forms.  Each RVVCALL(OPIVX3, ...) line
 * defines do_<name>() from the OP_SSS_* type list and one of DO_MACC,
 * DO_NMSAC, DO_MADD or DO_NMSUB; the GEN_VEXT_VX lines instantiate the
 * helper for each name.
 * NOTE(review): GEN_VEXT_VX is defined outside this excerpt; the 1/2/4/8
 * argument is presumably the element size in bytes -- confirm.
 */
1929 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1930 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1931 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1932 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1933 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1934 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1935 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1936 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1937 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1938 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1939 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1940 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1941 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1942 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1943 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1944 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1945 GEN_VEXT_VX(vmacc_vx_b, 1)
1946 GEN_VEXT_VX(vmacc_vx_h, 2)
1947 GEN_VEXT_VX(vmacc_vx_w, 4)
1948 GEN_VEXT_VX(vmacc_vx_d, 8)
1949 GEN_VEXT_VX(vnmsac_vx_b, 1)
1950 GEN_VEXT_VX(vnmsac_vx_h, 2)
1951 GEN_VEXT_VX(vnmsac_vx_w, 4)
1952 GEN_VEXT_VX(vnmsac_vx_d, 8)
1953 GEN_VEXT_VX(vmadd_vx_b, 1)
1954 GEN_VEXT_VX(vmadd_vx_h, 2)
1955 GEN_VEXT_VX(vmadd_vx_w, 4)
1956 GEN_VEXT_VX(vmadd_vx_d, 8)
1957 GEN_VEXT_VX(vnmsub_vx_b, 1)
1958 GEN_VEXT_VX(vnmsub_vx_h, 2)
1959 GEN_VEXT_VX(vnmsub_vx_w, 4)
1960 GEN_VEXT_VX(vnmsub_vx_d, 8)
1961 
1962 /* Vector Widening Integer Multiply-Add Instructions */
/*
 * Vector-vector widening multiply-accumulate.  All three families use
 * DO_MACC and differ in the type list: WOP_UUU (vwmaccu), WOP_SSS (vwmacc)
 * and WOP_SSU (vwmaccsu).  The destination H macro is one step wider than
 * the source ones, and GEN_VEXT_VV is given 2/4/8 for b/h/w.
 * NOTE(review): the WOP_* lists are defined outside this excerpt;
 * presumably they select the signedness of each source and widen both to
 * the double-width accumulator type -- confirm.
 */
1963 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1964 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1965 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1966 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1967 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1968 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1969 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1970 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1971 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1972 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1973 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1974 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1975 GEN_VEXT_VV(vwmacc_vv_b, 2)
1976 GEN_VEXT_VV(vwmacc_vv_h, 4)
1977 GEN_VEXT_VV(vwmacc_vv_w, 8)
1978 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1979 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1980 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1981 
/*
 * Vector-scalar widening multiply-accumulate.  Same DO_MACC operation as
 * the .vv forms, plus vwmaccus, which exists only in this .vx form and
 * uses the WOP_SUS type list (vwmaccsu uses WOP_SSU).
 * NOTE(review): the WOP_* lists and GEN_VEXT_VX are defined outside this
 * excerpt; the 2/4/8 argument is presumably the destination element size
 * in bytes -- confirm.
 */
1982 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1983 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1984 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1985 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1986 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1987 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1988 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1989 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1990 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1991 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1992 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1993 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1994 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1995 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1996 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1997 GEN_VEXT_VX(vwmacc_vx_b, 2)
1998 GEN_VEXT_VX(vwmacc_vx_h, 4)
1999 GEN_VEXT_VX(vwmacc_vx_w, 8)
2000 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2001 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2002 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2003 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2004 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2005 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2006 
/* Vector Integer Merge and Move Instructions */
/*
 * GEN_VEXT_VMV_VV - emit the helper that copies a vector register.
 *
 *   NAME:  helper name
 *   ETYPE: element type (its size is the element size in bytes)
 *   H:     index macro applied to the element number
 *
 * Elements from env->vstart up to env->vl are copied from vs1 to vd, with
 * no mask involved.  env->vstart is then reset to 0 and the bytes from
 * vl * esz up to total_elems * esz are handed to vext_set_elems_1s()
 * together with vext_vta(desc).
 */
#define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
                  uint32_t desc)                                     \
{                                                                    \
    const uint32_t n_active = env->vl;                               \
    const uint32_t elem_bytes = sizeof(ETYPE);                       \
    const uint32_t n_total =                                         \
        vext_get_total_elems(env, desc, elem_bytes);                 \
    const uint32_t tail_agnostic = vext_vta(desc);                   \
    const ETYPE *src = vs1;                                          \
    ETYPE *dest = vd;                                                \
    uint32_t idx;                                                    \
                                                                     \
    for (idx = env->vstart; idx < n_active; idx++) {                 \
        dest[H(idx)] = src[H(idx)];                                  \
    }                                                                \
    env->vstart = 0;                                                 \
    /* fill the tail with 1s when the tail policy asks for it */     \
    vext_set_elems_1s(vd, tail_agnostic, n_active * elem_bytes,      \
                      n_total * elem_bytes);                         \
}
2026 
/* vmv.v.v helpers for 8-, 16-, 32- and 64-bit elements */
2027 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2028 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2029 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2030 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2031 
/*
 * GEN_VEXT_VMV_VX - emit the helper that broadcasts a scalar into a vector
 * register.
 *
 *   NAME:  helper name
 *   ETYPE: element type (its size is the element size in bytes)
 *   H:     index macro applied to the element number
 *
 * The 64-bit scalar s1 is converted to ETYPE and stored into every element
 * from env->vstart up to env->vl, with no mask involved.  env->vstart is
 * then reset to 0 and the bytes from vl * esz up to total_elems * esz are
 * handed to vext_set_elems_1s() together with vext_vta(desc).
 */
#define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
                  uint32_t desc)                                     \
{                                                                    \
    const uint32_t n_active = env->vl;                               \
    const uint32_t elem_bytes = sizeof(ETYPE);                       \
    const uint32_t n_total =                                         \
        vext_get_total_elems(env, desc, elem_bytes);                 \
    const uint32_t tail_agnostic = vext_vta(desc);                   \
    const ETYPE fill = (ETYPE)s1;                                    \
    ETYPE *dest = vd;                                                \
    uint32_t idx;                                                    \
                                                                     \
    for (idx = env->vstart; idx < n_active; idx++) {                 \
        dest[H(idx)] = fill;                                         \
    }                                                                \
    env->vstart = 0;                                                 \
    /* fill the tail with 1s when the tail policy asks for it */     \
    vext_set_elems_1s(vd, tail_agnostic, n_active * elem_bytes,      \
                      n_total * elem_bytes);                         \
}
2049 
/* vmv.v.x helpers for 8-, 16-, 32- and 64-bit elements */
2050 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2051 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2052 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2053 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2054 
/*
 * GEN_VEXT_VMERGE_VV - emit the helper that merges two vector registers
 * under the v0 mask.
 *
 *   NAME:  helper name
 *   ETYPE: element type (its size is the element size in bytes)
 *   H:     index macro applied to the element number
 *
 * For each element from env->vstart up to env->vl, vd receives the vs1
 * element when the v0 mask bit is set and the vs2 element otherwise.
 * env->vstart is then reset to 0 and the bytes from vl * esz up to
 * total_elems * esz are handed to vext_set_elems_1s() together with
 * vext_vta(desc).
 */
#define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
                  CPURISCVState *env, uint32_t desc)                 \
{                                                                    \
    const uint32_t n_active = env->vl;                               \
    const uint32_t elem_bytes = sizeof(ETYPE);                       \
    const uint32_t n_total =                                         \
        vext_get_total_elems(env, desc, elem_bytes);                 \
    const uint32_t tail_agnostic = vext_vta(desc);                   \
    ETYPE *dest = vd;                                                \
    uint32_t idx;                                                    \
                                                                     \
    for (idx = env->vstart; idx < n_active; idx++) {                 \
        const ETYPE *chosen = vext_elem_mask(v0, idx) ? vs1 : vs2;   \
                                                                     \
        dest[H(idx)] = chosen[H(idx)];                               \
    }                                                                \
    env->vstart = 0;                                                 \
    /* fill the tail with 1s when the tail policy asks for it */     \
    vext_set_elems_1s(vd, tail_agnostic, n_active * elem_bytes,      \
                      n_total * elem_bytes);                         \
}
2073 
/* vmerge.vvm helpers for 8-, 16-, 32- and 64-bit elements */
2074 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2075 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2078 
/*
 * GEN_VEXT_VMERGE_VX - emit the helper that merges a scalar with a vector
 * register under the v0 mask.
 *
 *   NAME:  helper name
 *   ETYPE: element type (its size is the element size in bytes)
 *   H:     index macro applied to the element number
 *
 * For each element from env->vstart up to env->vl, vd receives the scalar
 * s1 (converted to ETYPE through target_long) when the v0 mask bit is set
 * and the vs2 element otherwise.  env->vstart is then reset to 0 and the
 * bytes from vl * esz up to total_elems * esz are handed to
 * vext_set_elems_1s() together with vext_vta(desc).
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    const uint32_t n_active = env->vl;                               \
    const uint32_t elem_bytes = sizeof(ETYPE);                       \
    const uint32_t n_total =                                         \
        vext_get_total_elems(env, desc, elem_bytes);                 \
    const uint32_t tail_agnostic = vext_vta(desc);                   \
    const ETYPE scalar = (ETYPE)(target_long)s1;                     \
    const ETYPE *src2 = vs2;                                         \
    ETYPE *dest = vd;                                                \
    uint32_t idx;                                                    \
                                                                     \
    for (idx = env->vstart; idx < n_active; idx++) {                 \
        ETYPE from_vs2 = src2[H(idx)];                               \
                                                                     \
        if (vext_elem_mask(v0, idx)) {                               \
            dest[H(idx)] = scalar;                                   \
        } else {                                                     \
            dest[H(idx)] = from_vs2;                                 \
        }                                                            \
    }                                                                \
    env->vstart = 0;                                                 \
    /* fill the tail with 1s when the tail policy asks for it */     \
    vext_set_elems_1s(vd, tail_agnostic, n_active * elem_bytes,      \
                      n_total * elem_bytes);                         \
}
2099 
/*
 * Emit the vmerge_vxm_* helpers, one per signed element type from int8_t
 * to int64_t, each with the H1/H2/H4/H8 index macro passed alongside it.
 */
GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2104 
2105 /*
2106  *** Vector Fixed-Point Arithmetic Instructions
2107  */
2108 
2109 /* Vector Single-Width Saturating Add and Subtract */
2110 
2111 /*
 * Fixed-point instructions may take a rounding mode and may saturate,
 * so the macros they share are defined here.
2114  */
/*
 * Per-element worker for vector-vector fixed-point operations: processes
 * element i of vs1/vs2 into vd, given the CPU state and a rounding-mode
 * value.  The do_<NAME> functions produced by OPIVV2_RM have this type.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2117 
/*
 * Define do_<NAME>: load element i of vs1 (as T1, widened to TX1) and of
 * vs2 (as T2, widened to TX2), apply OP(env, vxrm, s2, s1) and store the
 * result as TD into element i of vd.  HD/HS1/HS2 adjust the element index
 * for the destination and the two sources respectively.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dst = (TD *)vd + HD(i);                                     \
    TX1 src1 = ((T1 *)vs1)[HS1(i)];                                 \
    TX2 src2 = ((T2 *)vs2)[HS2(i)];                                 \
                                                                    \
    *dst = OP(env, vxrm, src2, src1);                               \
}
2127 
2128 static inline void
2129 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2130              CPURISCVState *env,
2131              uint32_t vl, uint32_t vm, int vxrm,
2132              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2133 {
2134     for (uint32_t i = env->vstart; i < vl; i++) {
2135         if (!vm && !vext_elem_mask(v0, i)) {
2136             /* set masked-off elements to 1s */
2137             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2138             continue;
2139         }
2140         fn(vd, vs1, vs2, i, env, vxrm);
2141     }
2142     env->vstart = 0;
2143 }
2144 
2145 static inline void
2146 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2147              CPURISCVState *env,
2148              uint32_t desc,
2149              opivv2_rm_fn *fn, uint32_t esz)
2150 {
2151     uint32_t vm = vext_vm(desc);
2152     uint32_t vl = env->vl;
2153     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2154     uint32_t vta = vext_vta(desc);
2155     uint32_t vma = vext_vma(desc);
2156 
2157     switch (env->vxrm) {
2158     case 0: /* rnu */
2159         vext_vv_rm_1(vd, v0, vs1, vs2,
2160                      env, vl, vm, 0, fn, vma, esz);
2161         break;
2162     case 1: /* rne */
2163         vext_vv_rm_1(vd, v0, vs1, vs2,
2164                      env, vl, vm, 1, fn, vma, esz);
2165         break;
2166     case 2: /* rdn */
2167         vext_vv_rm_1(vd, v0, vs1, vs2,
2168                      env, vl, vm, 2, fn, vma, esz);
2169         break;
2170     default: /* rod */
2171         vext_vv_rm_1(vd, v0, vs1, vs2,
2172                      env, vl, vm, 3, fn, vma, esz);
2173         break;
2174     }
2175     /* set tail elements to 1s */
2176     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2177 }
2178 
/*
 * Generate the HELPER(NAME) entry point for an OPIVV-format fixed-point
 * instruction.  It forwards to vext_vv_rm_2() with do_<NAME> as the
 * per-element worker and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    vext_vv_rm_2(vd, v0, vs1, vs2,                              \
                 env, desc, do_##NAME, ESZ);                    \
}
2187 
2188 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2189 {
2190     uint8_t res = a + b;
2191     if (res < a) {
2192         res = UINT8_MAX;
2193         env->vxsat = 0x1;
2194     }
2195     return res;
2196 }
2197 
2198 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2199                                uint16_t b)
2200 {
2201     uint16_t res = a + b;
2202     if (res < a) {
2203         res = UINT16_MAX;
2204         env->vxsat = 0x1;
2205     }
2206     return res;
2207 }
2208 
2209 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2210                                uint32_t b)
2211 {
2212     uint32_t res = a + b;
2213     if (res < a) {
2214         res = UINT32_MAX;
2215         env->vxsat = 0x1;
2216     }
2217     return res;
2218 }
2219 
2220 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2221                                uint64_t b)
2222 {
2223     uint64_t res = a + b;
2224     if (res < a) {
2225         res = UINT64_MAX;
2226         env->vxsat = 0x1;
2227     }
2228     return res;
2229 }
2230 
/*
 * vsaddu_vv_*: per-element workers built on saddu8..saddu64 via
 * OPIVV2_RM, followed by the HELPER entry points for element sizes of
 * 1, 2, 4 and 8 bytes.  RVVCALL and the OP_UUU_* type lists are defined
 * outside this section.
 */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2239 
/*
 * Per-element worker for vector-scalar fixed-point operations: combines
 * the scalar s1 with element i of vs2 into vd, given the CPU state and a
 * rounding-mode value.  The do_<NAME> functions produced by OPIVX2_RM
 * have this type.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2242 
/*
 * Define do_<NAME>: load element i of vs2 (as T2, widened to TX2),
 * convert the scalar s1 through T1 to TX1, apply OP(env, vxrm, s2, s1)
 * and store the result as TD into element i of vd.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dst = (TD *)vd + HD(i);                                     \
    TX2 src2 = ((T2 *)vs2)[HS2(i)];                                 \
    TX1 scalar = (TX1)(T1)s1;                                       \
                                                                    \
    *dst = OP(env, vxrm, src2, scalar);                             \
}
2251 
2252 static inline void
2253 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2254              CPURISCVState *env,
2255              uint32_t vl, uint32_t vm, int vxrm,
2256              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2257 {
2258     for (uint32_t i = env->vstart; i < vl; i++) {
2259         if (!vm && !vext_elem_mask(v0, i)) {
2260             /* set masked-off elements to 1s */
2261             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2262             continue;
2263         }
2264         fn(vd, s1, vs2, i, env, vxrm);
2265     }
2266     env->vstart = 0;
2267 }
2268 
2269 static inline void
2270 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2271              CPURISCVState *env,
2272              uint32_t desc,
2273              opivx2_rm_fn *fn, uint32_t esz)
2274 {
2275     uint32_t vm = vext_vm(desc);
2276     uint32_t vl = env->vl;
2277     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2278     uint32_t vta = vext_vta(desc);
2279     uint32_t vma = vext_vma(desc);
2280 
2281     switch (env->vxrm) {
2282     case 0: /* rnu */
2283         vext_vx_rm_1(vd, v0, s1, vs2,
2284                      env, vl, vm, 0, fn, vma, esz);
2285         break;
2286     case 1: /* rne */
2287         vext_vx_rm_1(vd, v0, s1, vs2,
2288                      env, vl, vm, 1, fn, vma, esz);
2289         break;
2290     case 2: /* rdn */
2291         vext_vx_rm_1(vd, v0, s1, vs2,
2292                      env, vl, vm, 2, fn, vma, esz);
2293         break;
2294     default: /* rod */
2295         vext_vx_rm_1(vd, v0, s1, vs2,
2296                      env, vl, vm, 3, fn, vma, esz);
2297         break;
2298     }
2299     /* set tail elements to 1s */
2300     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2301 }
2302 
/*
 * Generate the HELPER(NAME) entry point for an OPIVX-format fixed-point
 * instruction.  It forwards to vext_vx_rm_2() with do_<NAME> as the
 * per-element worker and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2,                         \
                 env, desc, do_##NAME, ESZ);              \
}
2311 
/*
 * vsaddu_vx_*: per-element workers built on saddu8..saddu64 via
 * OPIVX2_RM, followed by the HELPER entry points for element sizes of
 * 1, 2, 4 and 8 bytes.  RVVCALL and the OP_UUU_* type lists are defined
 * outside this section.
 */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2320 
2321 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2322 {
2323     int8_t res = a + b;
2324     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2325         res = a > 0 ? INT8_MAX : INT8_MIN;
2326         env->vxsat = 0x1;
2327     }
2328     return res;
2329 }
2330 
2331 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2332 {
2333     int16_t res = a + b;
2334     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2335         res = a > 0 ? INT16_MAX : INT16_MIN;
2336         env->vxsat = 0x1;
2337     }
2338     return res;
2339 }
2340 
2341 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2342 {
2343     int32_t res = a + b;
2344     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2345         res = a > 0 ? INT32_MAX : INT32_MIN;
2346         env->vxsat = 0x1;
2347     }
2348     return res;
2349 }
2350 
2351 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2352 {
2353     int64_t res = a + b;
2354     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2355         res = a > 0 ? INT64_MAX : INT64_MIN;
2356         env->vxsat = 0x1;
2357     }
2358     return res;
2359 }
2360 
/*
 * vsadd_vv_* and vsadd_vx_*: per-element workers built on sadd8..sadd64
 * (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for the vector-scalar
 * forms), each followed by its HELPER entry points for element sizes of
 * 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2378 
2379 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2380 {
2381     uint8_t res = a - b;
2382     if (res > a) {
2383         res = 0;
2384         env->vxsat = 0x1;
2385     }
2386     return res;
2387 }
2388 
2389 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2390                                uint16_t b)
2391 {
2392     uint16_t res = a - b;
2393     if (res > a) {
2394         res = 0;
2395         env->vxsat = 0x1;
2396     }
2397     return res;
2398 }
2399 
2400 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2401                                uint32_t b)
2402 {
2403     uint32_t res = a - b;
2404     if (res > a) {
2405         res = 0;
2406         env->vxsat = 0x1;
2407     }
2408     return res;
2409 }
2410 
2411 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2412                                uint64_t b)
2413 {
2414     uint64_t res = a - b;
2415     if (res > a) {
2416         res = 0;
2417         env->vxsat = 0x1;
2418     }
2419     return res;
2420 }
2421 
/*
 * vssubu_vv_* and vssubu_vx_*: per-element workers built on
 * ssubu8..ssubu64 (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for
 * the vector-scalar forms), each followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2439 
2440 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2441 {
2442     int8_t res = a - b;
2443     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2444         res = a >= 0 ? INT8_MAX : INT8_MIN;
2445         env->vxsat = 0x1;
2446     }
2447     return res;
2448 }
2449 
2450 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2451 {
2452     int16_t res = a - b;
2453     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2454         res = a >= 0 ? INT16_MAX : INT16_MIN;
2455         env->vxsat = 0x1;
2456     }
2457     return res;
2458 }
2459 
2460 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2461 {
2462     int32_t res = a - b;
2463     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2464         res = a >= 0 ? INT32_MAX : INT32_MIN;
2465         env->vxsat = 0x1;
2466     }
2467     return res;
2468 }
2469 
2470 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2471 {
2472     int64_t res = a - b;
2473     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2474         res = a >= 0 ? INT64_MAX : INT64_MIN;
2475         env->vxsat = 0x1;
2476     }
2477     return res;
2478 }
2479 
/*
 * vssub_vv_* and vssub_vx_*: per-element workers built on ssub8..ssub64
 * (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for the vector-scalar
 * forms), each followed by its HELPER entry points for element sizes of
 * 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2497 
2498 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) to add to (v >> shift).
 *
 * @vxrm:  rounding mode: 0 = round-to-nearest-up, 1 = round-to-nearest-
 *         even, 3 = round-to-odd; any other value truncates (round-down).
 * @v:     the value that is about to be shifted right.
 * @shift: number of low bits that will be discarded.
 *
 * Returns 0 when nothing is discarded (shift == 0) or when shift is not a
 * valid bit position of a 64-bit value (shift >= 64).  The range check is
 * done before any bit of v is looked at, so an out-of-range shift can
 * never reach the bit selection below.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint64_t discarded;   /* all bits that the shift drops */
    uint64_t below_half;  /* dropped bits beneath the most significant one */
    uint8_t lsb;          /* least significant bit that survives the shift */
    uint8_t half;         /* most significant dropped bit */

    if (shift == 0 || shift >= 64) {
        return 0;
    }

    lsb = (v >> shift) & 1;
    half = (v >> (shift - 1)) & 1;
    discarded = v & ((UINT64_C(1) << shift) - 1);

    switch (vxrm) {
    case 0: /* round-to-nearest-up (add +0.5 LSB) */
        return half;
    case 1: /* round-to-nearest-even */
        /* For shift == 1 the mask is empty, leaving just half & lsb. */
        below_half = discarded & ((UINT64_C(1) << (shift - 1)) - 1);
        return half & ((below_half != 0) | lsb);
    case 3: /* round-to-odd (OR bits into LSB, aka "jam") */
        return !lsb & (discarded != 0);
    default: /* round-down (truncate) */
        return 0;
    }
}
2525 
2526 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2527 {
2528     int64_t res = (int64_t)a + b;
2529     uint8_t round = get_round(vxrm, res, 1);
2530 
2531     return (res >> 1) + round;
2532 }
2533 
2534 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2535 {
2536     int64_t res = a + b;
2537     uint8_t round = get_round(vxrm, res, 1);
2538     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2539 
2540     /* With signed overflow, bit 64 is inverse of bit 63. */
2541     return ((res >> 1) ^ over) + round;
2542 }
2543 
/*
 * vaadd_vv_* and vaadd_vx_*: the 8-, 16- and 32-bit element variants all
 * use aadd32 as their operation, the 64-bit variant uses aadd64.  Each
 * group of workers is followed by its HELPER entry points for element
 * sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2561 
2562 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2563                                uint32_t a, uint32_t b)
2564 {
2565     uint64_t res = (uint64_t)a + b;
2566     uint8_t round = get_round(vxrm, res, 1);
2567 
2568     return (res >> 1) + round;
2569 }
2570 
2571 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2572                                uint64_t a, uint64_t b)
2573 {
2574     uint64_t res = a + b;
2575     uint8_t round = get_round(vxrm, res, 1);
2576     uint64_t over = (uint64_t)(res < a) << 63;
2577 
2578     return ((res >> 1) | over) + round;
2579 }
2580 
/*
 * vaaddu_vv_* and vaaddu_vx_*: the 8-, 16- and 32-bit element variants
 * all use aaddu32 as their operation, the 64-bit variant uses aaddu64.
 * Each group of workers is followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2598 
2599 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2600 {
2601     int64_t res = (int64_t)a - b;
2602     uint8_t round = get_round(vxrm, res, 1);
2603 
2604     return (res >> 1) + round;
2605 }
2606 
2607 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2608 {
2609     int64_t res = (int64_t)a - b;
2610     uint8_t round = get_round(vxrm, res, 1);
2611     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2612 
2613     /* With signed overflow, bit 64 is inverse of bit 63. */
2614     return ((res >> 1) ^ over) + round;
2615 }
2616 
/*
 * vasub_vv_* and vasub_vx_*: the 8-, 16- and 32-bit element variants all
 * use asub32 as their operation, the 64-bit variant uses asub64.  Each
 * group of workers is followed by its HELPER entry points for element
 * sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2634 
2635 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2636                                uint32_t a, uint32_t b)
2637 {
2638     int64_t res = (int64_t)a - b;
2639     uint8_t round = get_round(vxrm, res, 1);
2640 
2641     return (res >> 1) + round;
2642 }
2643 
2644 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2645                                uint64_t a, uint64_t b)
2646 {
2647     uint64_t res = (uint64_t)a - b;
2648     uint8_t round = get_round(vxrm, res, 1);
2649     uint64_t over = (uint64_t)(res > a) << 63;
2650 
2651     return ((res >> 1) | over) + round;
2652 }
2653 
/*
 * vasubu_vv_* and vasubu_vx_*: the 8-, 16- and 32-bit element variants
 * all use asubu32 as their operation, the 64-bit variant uses asubu64.
 * Each group of workers is followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2671 
2672 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2673 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2674 {
2675     uint8_t round;
2676     int16_t res;
2677 
2678     res = (int16_t)a * (int16_t)b;
2679     round = get_round(vxrm, res, 7);
2680     res   = (res >> 7) + round;
2681 
2682     if (res > INT8_MAX) {
2683         env->vxsat = 0x1;
2684         return INT8_MAX;
2685     } else if (res < INT8_MIN) {
2686         env->vxsat = 0x1;
2687         return INT8_MIN;
2688     } else {
2689         return res;
2690     }
2691 }
2692 
2693 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2694 {
2695     uint8_t round;
2696     int32_t res;
2697 
2698     res = (int32_t)a * (int32_t)b;
2699     round = get_round(vxrm, res, 15);
2700     res   = (res >> 15) + round;
2701 
2702     if (res > INT16_MAX) {
2703         env->vxsat = 0x1;
2704         return INT16_MAX;
2705     } else if (res < INT16_MIN) {
2706         env->vxsat = 0x1;
2707         return INT16_MIN;
2708     } else {
2709         return res;
2710     }
2711 }
2712 
2713 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2714 {
2715     uint8_t round;
2716     int64_t res;
2717 
2718     res = (int64_t)a * (int64_t)b;
2719     round = get_round(vxrm, res, 31);
2720     res   = (res >> 31) + round;
2721 
2722     if (res > INT32_MAX) {
2723         env->vxsat = 0x1;
2724         return INT32_MAX;
2725     } else if (res < INT32_MIN) {
2726         env->vxsat = 0x1;
2727         return INT32_MIN;
2728     } else {
2729         return res;
2730     }
2731 }
2732 
2733 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2734 {
2735     uint8_t round;
2736     uint64_t hi_64, lo_64;
2737     int64_t res;
2738 
2739     if (a == INT64_MIN && b == INT64_MIN) {
2740         env->vxsat = 1;
2741         return INT64_MAX;
2742     }
2743 
2744     muls64(&lo_64, &hi_64, a, b);
2745     round = get_round(vxrm, lo_64, 63);
2746     /*
2747      * Cannot overflow, as there are always
2748      * 2 sign bits after multiply.
2749      */
2750     res = (hi_64 << 1) | (lo_64 >> 63);
2751     if (round) {
2752         if (res == INT64_MAX) {
2753             env->vxsat = 1;
2754         } else {
2755             res += 1;
2756         }
2757     }
2758     return res;
2759 }
2760 
/*
 * vsmul_vv_* and vsmul_vx_*: per-element workers built on
 * vsmul8..vsmul64 (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for
 * the vector-scalar forms), each followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2778 
2779 /* Vector Single-Width Scaling Shift Instructions */
2780 static inline uint8_t
2781 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2782 {
2783     uint8_t round, shift = b & 0x7;
2784     uint8_t res;
2785 
2786     round = get_round(vxrm, a, shift);
2787     res   = (a >> shift)  + round;
2788     return res;
2789 }
2790 static inline uint16_t
2791 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2792 {
2793     uint8_t round, shift = b & 0xf;
2794     uint16_t res;
2795 
2796     round = get_round(vxrm, a, shift);
2797     res   = (a >> shift)  + round;
2798     return res;
2799 }
2800 static inline uint32_t
2801 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2802 {
2803     uint8_t round, shift = b & 0x1f;
2804     uint32_t res;
2805 
2806     round = get_round(vxrm, a, shift);
2807     res   = (a >> shift)  + round;
2808     return res;
2809 }
2810 static inline uint64_t
2811 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2812 {
2813     uint8_t round, shift = b & 0x3f;
2814     uint64_t res;
2815 
2816     round = get_round(vxrm, a, shift);
2817     res   = (a >> shift)  + round;
2818     return res;
2819 }
/*
 * vssrl_vv_* and vssrl_vx_*: per-element workers built on
 * vssrl8..vssrl64 (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for
 * the vector-scalar forms), each followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2837 
2838 static inline int8_t
2839 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2840 {
2841     uint8_t round, shift = b & 0x7;
2842     int8_t res;
2843 
2844     round = get_round(vxrm, a, shift);
2845     res   = (a >> shift)  + round;
2846     return res;
2847 }
2848 static inline int16_t
2849 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2850 {
2851     uint8_t round, shift = b & 0xf;
2852     int16_t res;
2853 
2854     round = get_round(vxrm, a, shift);
2855     res   = (a >> shift)  + round;
2856     return res;
2857 }
2858 static inline int32_t
2859 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2860 {
2861     uint8_t round, shift = b & 0x1f;
2862     int32_t res;
2863 
2864     round = get_round(vxrm, a, shift);
2865     res   = (a >> shift)  + round;
2866     return res;
2867 }
2868 static inline int64_t
2869 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2870 {
2871     uint8_t round, shift = b & 0x3f;
2872     int64_t res;
2873 
2874     round = get_round(vxrm, a, shift);
2875     res   = (a >> shift)  + round;
2876     return res;
2877 }
2878 
/*
 * vssra_vv_* and vssra_vx_*: per-element workers built on
 * vssra8..vssra64 (OPIVV2_RM for the vector-vector forms, OPIVX2_RM for
 * the vector-scalar forms), each followed by its HELPER entry points for
 * element sizes of 1, 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2896 
2897 /* Vector Narrowing Fixed-Point Clip Instructions */
2898 static inline int8_t
2899 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2900 {
2901     uint8_t round, shift = b & 0xf;
2902     int16_t res;
2903 
2904     round = get_round(vxrm, a, shift);
2905     res   = (a >> shift)  + round;
2906     if (res > INT8_MAX) {
2907         env->vxsat = 0x1;
2908         return INT8_MAX;
2909     } else if (res < INT8_MIN) {
2910         env->vxsat = 0x1;
2911         return INT8_MIN;
2912     } else {
2913         return res;
2914     }
2915 }
2916 
2917 static inline int16_t
2918 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2919 {
2920     uint8_t round, shift = b & 0x1f;
2921     int32_t res;
2922 
2923     round = get_round(vxrm, a, shift);
2924     res   = (a >> shift)  + round;
2925     if (res > INT16_MAX) {
2926         env->vxsat = 0x1;
2927         return INT16_MAX;
2928     } else if (res < INT16_MIN) {
2929         env->vxsat = 0x1;
2930         return INT16_MIN;
2931     } else {
2932         return res;
2933     }
2934 }
2935 
2936 static inline int32_t
2937 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2938 {
2939     uint8_t round, shift = b & 0x3f;
2940     int64_t res;
2941 
2942     round = get_round(vxrm, a, shift);
2943     res   = (a >> shift)  + round;
2944     if (res > INT32_MAX) {
2945         env->vxsat = 0x1;
2946         return INT32_MAX;
2947     } else if (res < INT32_MIN) {
2948         env->vxsat = 0x1;
2949         return INT32_MIN;
2950     } else {
2951         return res;
2952     }
2953 }
2954 
/*
 * vnclip_wv_* and vnclip_wx_*: per-element workers built on
 * vnclip8..vnclip32, followed by the HELPER entry points for destination
 * element sizes of 1, 2 and 4 bytes.  The vs2 index macro (H2/H4/H8) is
 * one step wider than the destination's (H1/H2/H4).  The NOP_SSS_* type
 * lists are defined outside this section.
 */
RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
GEN_VEXT_VV_RM(vnclip_wv_b, 1)
GEN_VEXT_VV_RM(vnclip_wv_h, 2)
GEN_VEXT_VV_RM(vnclip_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
GEN_VEXT_VX_RM(vnclip_wx_b, 1)
GEN_VEXT_VX_RM(vnclip_wx_h, 2)
GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2968 
2969 static inline uint8_t
2970 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2971 {
2972     uint8_t round, shift = b & 0xf;
2973     uint16_t res;
2974 
2975     round = get_round(vxrm, a, shift);
2976     res   = (a >> shift)  + round;
2977     if (res > UINT8_MAX) {
2978         env->vxsat = 0x1;
2979         return UINT8_MAX;
2980     } else {
2981         return res;
2982     }
2983 }
2984 
2985 static inline uint16_t
2986 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2987 {
2988     uint8_t round, shift = b & 0x1f;
2989     uint32_t res;
2990 
2991     round = get_round(vxrm, a, shift);
2992     res   = (a >> shift)  + round;
2993     if (res > UINT16_MAX) {
2994         env->vxsat = 0x1;
2995         return UINT16_MAX;
2996     } else {
2997         return res;
2998     }
2999 }
3000 
3001 static inline uint32_t
3002 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3003 {
3004     uint8_t round, shift = b & 0x3f;
3005     uint64_t res;
3006 
3007     round = get_round(vxrm, a, shift);
3008     res   = (a >> shift)  + round;
3009     if (res > UINT32_MAX) {
3010         env->vxsat = 0x1;
3011         return UINT32_MAX;
3012     } else {
3013         return res;
3014     }
3015 }
3016 
3017 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3018 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3019 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3020 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3021 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3022 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3023 
3024 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3025 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3026 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3027 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3028 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3029 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3030 
/*
 *** Vector Floating-Point Arithmetic Instructions
 */
3034 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * Define do_<NAME>(): per-element worker for a two-operand floating-point
 * vector-vector operation.  Element i of vs1 (type T1) and of vs2 (type T2)
 * is loaded, OP is applied with env->fp_status, and the result is stored
 * into element i of vd (type TD).  OP receives the vs2 element first.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dst = (TD *)vd + HD(i);                                \
    TX1 rhs = *((T1 *)vs1 + HS1(i));                           \
    TX2 lhs = *((T2 *)vs2 + HS2(i));                           \
                                                               \
    *dst = OP(lhs, rhs, &env->fp_status);                      \
}
3043 
/*
 * Emit HELPER(NAME) for a vector-vector operation whose per-element worker
 * do_<NAME>() takes env.  Elements from env->vstart up to env->vl are
 * visited: an element is computed when vext_vm(desc) is non-zero or its
 * bit in v0 is set, otherwise its ESZ bytes are passed to
 * vext_set_elems_1s() with the vma setting.  Afterwards env->vstart is
 * cleared and the tail bytes (vl * ESZ up to total_elems * ESZ) are passed
 * to vext_set_elems_1s() with the vta setting.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t unmasked = vext_vm(desc);                    \
    uint32_t len = env->vl;                               \
    uint32_t max_elems =                                  \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t tail_policy = vext_vta(desc);                \
    uint32_t mask_policy = vext_vma(desc);                \
    uint32_t idx;                                         \
                                                          \
    for (idx = env->vstart; idx < len; idx++) {           \
        if (unmasked || vext_elem_mask(v0, idx)) {        \
            do_##NAME(vd, vs1, vs2, idx, env);            \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, mask_policy,            \
                              idx * ESZ,                  \
                              (idx + 1) * ESZ);           \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, tail_policy, len * ESZ,         \
                      max_elems * ESZ);                   \
}
3071 
3072 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3073 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3074 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3075 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3076 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3077 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3078 
/*
 * Define do_<NAME>(): per-element worker for a two-operand floating-point
 * vector-scalar operation.  The scalar s1 is narrowed to T1 (then held as
 * TX1), element i of vs2 is loaded, and OP(vs2 element, scalar) computed
 * with env->fp_status is stored into element i of vd.
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dst = (TD *)vd + HD(i);                                \
    TX2 lhs = *((T2 *)vs2 + HS2(i));                           \
    TX1 scalar = (TX1)(T1)s1;                                  \
                                                               \
    *dst = OP(lhs, scalar, &env->fp_status);                   \
}
3086 
/*
 * Emit HELPER(NAME) for a vector-scalar operation whose per-element worker
 * do_<NAME>() takes env.  Elements from env->vstart up to env->vl are
 * visited: an element is computed when vext_vm(desc) is non-zero or its
 * bit in v0 is set, otherwise its ESZ bytes are passed to
 * vext_set_elems_1s() with the vma setting.  Afterwards env->vstart is
 * cleared and the tail bytes are passed to vext_set_elems_1s() with the
 * vta setting.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t unmasked = vext_vm(desc);                    \
    uint32_t len = env->vl;                               \
    uint32_t max_elems =                                  \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t tail_policy = vext_vta(desc);                \
    uint32_t mask_policy = vext_vma(desc);                \
    uint32_t idx;                                         \
                                                          \
    for (idx = env->vstart; idx < len; idx++) {           \
        if (unmasked || vext_elem_mask(v0, idx)) {        \
            do_##NAME(vd, s1, vs2, idx, env);             \
        } else {                                          \
            /* set masked-off elements to 1s */           \
            vext_set_elems_1s(vd, mask_policy,            \
                              idx * ESZ,                  \
                              (idx + 1) * ESZ);           \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, tail_policy, len * ESZ,         \
                      max_elems * ESZ);                   \
}
3114 
3115 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3116 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3117 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3118 GEN_VEXT_VF(vfadd_vf_h, 2)
3119 GEN_VEXT_VF(vfadd_vf_w, 4)
3120 GEN_VEXT_VF(vfadd_vf_d, 8)
3121 
3122 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3123 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3124 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3125 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3126 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3127 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3128 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3129 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3130 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3131 GEN_VEXT_VF(vfsub_vf_h, 2)
3132 GEN_VEXT_VF(vfsub_vf_w, 4)
3133 GEN_VEXT_VF(vfsub_vf_d, 8)
3134 
3135 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3136 {
3137     return float16_sub(b, a, s);
3138 }
3139 
3140 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3141 {
3142     return float32_sub(b, a, s);
3143 }
3144 
3145 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3146 {
3147     return float64_sub(b, a, s);
3148 }
3149 
3150 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3151 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3152 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3153 GEN_VEXT_VF(vfrsub_vf_h, 2)
3154 GEN_VEXT_VF(vfrsub_vf_w, 4)
3155 GEN_VEXT_VF(vfrsub_vf_d, 8)
3156 
3157 /* Vector Widening Floating-Point Add/Subtract Instructions */
3158 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3159 {
3160     return float32_add(float16_to_float32(a, true, s),
3161             float16_to_float32(b, true, s), s);
3162 }
3163 
3164 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3165 {
3166     return float64_add(float32_to_float64(a, s),
3167             float32_to_float64(b, s), s);
3168 
3169 }
3170 
3171 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3172 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3173 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3174 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3175 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3176 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3177 GEN_VEXT_VF(vfwadd_vf_h, 4)
3178 GEN_VEXT_VF(vfwadd_vf_w, 8)
3179 
3180 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3181 {
3182     return float32_sub(float16_to_float32(a, true, s),
3183             float16_to_float32(b, true, s), s);
3184 }
3185 
3186 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3187 {
3188     return float64_sub(float32_to_float64(a, s),
3189             float32_to_float64(b, s), s);
3190 
3191 }
3192 
3193 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3194 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3195 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3196 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3197 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3198 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3199 GEN_VEXT_VF(vfwsub_vf_h, 4)
3200 GEN_VEXT_VF(vfwsub_vf_w, 8)
3201 
3202 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3203 {
3204     return float32_add(a, float16_to_float32(b, true, s), s);
3205 }
3206 
3207 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3208 {
3209     return float64_add(a, float32_to_float64(b, s), s);
3210 }
3211 
3212 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3213 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3214 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3215 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3216 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3217 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3218 GEN_VEXT_VF(vfwadd_wf_h, 4)
3219 GEN_VEXT_VF(vfwadd_wf_w, 8)
3220 
3221 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3222 {
3223     return float32_sub(a, float16_to_float32(b, true, s), s);
3224 }
3225 
3226 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3227 {
3228     return float64_sub(a, float32_to_float64(b, s), s);
3229 }
3230 
3231 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3232 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3233 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3234 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3235 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3236 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3237 GEN_VEXT_VF(vfwsub_wf_h, 4)
3238 GEN_VEXT_VF(vfwsub_wf_w, 8)
3239 
3240 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3241 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3242 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3243 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3244 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3245 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3246 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3247 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3248 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3249 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3250 GEN_VEXT_VF(vfmul_vf_h, 2)
3251 GEN_VEXT_VF(vfmul_vf_w, 4)
3252 GEN_VEXT_VF(vfmul_vf_d, 8)
3253 
3254 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3255 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3256 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3257 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3258 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3259 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3260 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3261 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3262 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3263 GEN_VEXT_VF(vfdiv_vf_h, 2)
3264 GEN_VEXT_VF(vfdiv_vf_w, 4)
3265 GEN_VEXT_VF(vfdiv_vf_d, 8)
3266 
3267 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3268 {
3269     return float16_div(b, a, s);
3270 }
3271 
3272 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3273 {
3274     return float32_div(b, a, s);
3275 }
3276 
3277 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3278 {
3279     return float64_div(b, a, s);
3280 }
3281 
3282 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3283 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3284 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3285 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3286 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3287 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3288 
3289 /* Vector Widening Floating-Point Multiply */
3290 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3291 {
3292     return float32_mul(float16_to_float32(a, true, s),
3293             float16_to_float32(b, true, s), s);
3294 }
3295 
3296 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3297 {
3298     return float64_mul(float32_to_float64(a, s),
3299             float32_to_float64(b, s), s);
3300 
3301 }
3302 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3303 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3304 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3305 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3306 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3307 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3308 GEN_VEXT_VF(vfwmul_vf_h, 4)
3309 GEN_VEXT_VF(vfwmul_vf_w, 8)
3310 
3311 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * Define do_<NAME>(): per-element worker for a three-operand floating-point
 * vector-vector operation.  Element i of vs1, vs2 and the current value of
 * vd are loaded, and OP(vs2 element, vs1 element, vd element) computed
 * with env->fp_status overwrites element i of vd.
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
        CPURISCVState *env)                                        \
{                                                                  \
    TD *acc = (TD *)vd + HD(i);                                    \
    TX1 rhs = *((T1 *)vs1 + HS1(i));                               \
    TX2 lhs = *((T2 *)vs2 + HS2(i));                               \
    TD prev = *acc;                                                \
                                                                   \
    *acc = OP(lhs, rhs, prev, &env->fp_status);                    \
}
3321 
3322 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3323 {
3324     return float16_muladd(a, b, d, 0, s);
3325 }
3326 
3327 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3328 {
3329     return float32_muladd(a, b, d, 0, s);
3330 }
3331 
3332 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3333 {
3334     return float64_muladd(a, b, d, 0, s);
3335 }
3336 
3337 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3338 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3339 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3340 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3341 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3342 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3343 
/*
 * Define do_<NAME>(): per-element worker for a three-operand floating-point
 * vector-scalar operation.  The scalar s1 is narrowed to T1 (then held as
 * TX1); element i of vs2 and the current value of vd are loaded, and
 * OP(vs2 element, scalar, vd element) computed with env->fp_status
 * overwrites element i of vd.
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
        CPURISCVState *env)                                       \
{                                                                 \
    TD *acc = (TD *)vd + HD(i);                                   \
    TX2 lhs = *((T2 *)vs2 + HS2(i));                              \
    TD prev = *acc;                                               \
    TX1 scalar = (TX1)(T1)s1;                                     \
                                                                  \
    *acc = OP(lhs, scalar, prev, &env->fp_status);                \
}
3352 
3353 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3354 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3355 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3356 GEN_VEXT_VF(vfmacc_vf_h, 2)
3357 GEN_VEXT_VF(vfmacc_vf_w, 4)
3358 GEN_VEXT_VF(vfmacc_vf_d, 8)
3359 
3360 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3361 {
3362     return float16_muladd(a, b, d,
3363             float_muladd_negate_c | float_muladd_negate_product, s);
3364 }
3365 
3366 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3367 {
3368     return float32_muladd(a, b, d,
3369             float_muladd_negate_c | float_muladd_negate_product, s);
3370 }
3371 
3372 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3373 {
3374     return float64_muladd(a, b, d,
3375             float_muladd_negate_c | float_muladd_negate_product, s);
3376 }
3377 
3378 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3379 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3380 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3381 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3382 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3383 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3384 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3385 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3386 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3387 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3388 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3389 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3390 
3391 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3392 {
3393     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3394 }
3395 
3396 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3397 {
3398     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3399 }
3400 
3401 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3402 {
3403     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3404 }
3405 
3406 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3407 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3408 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3409 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3410 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3411 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3412 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3413 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3414 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3415 GEN_VEXT_VF(vfmsac_vf_h, 2)
3416 GEN_VEXT_VF(vfmsac_vf_w, 4)
3417 GEN_VEXT_VF(vfmsac_vf_d, 8)
3418 
3419 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3420 {
3421     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3422 }
3423 
3424 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3425 {
3426     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3427 }
3428 
3429 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3430 {
3431     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3432 }
3433 
3434 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3435 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3436 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3437 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3438 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3439 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3440 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3441 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3442 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3443 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3444 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3445 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3446 
3447 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3448 {
3449     return float16_muladd(d, b, a, 0, s);
3450 }
3451 
3452 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3453 {
3454     return float32_muladd(d, b, a, 0, s);
3455 }
3456 
3457 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3458 {
3459     return float64_muladd(d, b, a, 0, s);
3460 }
3461 
3462 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3463 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3464 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3465 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3466 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3467 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3468 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3469 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3470 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3471 GEN_VEXT_VF(vfmadd_vf_h, 2)
3472 GEN_VEXT_VF(vfmadd_vf_w, 4)
3473 GEN_VEXT_VF(vfmadd_vf_d, 8)
3474 
3475 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3476 {
3477     return float16_muladd(d, b, a,
3478             float_muladd_negate_c | float_muladd_negate_product, s);
3479 }
3480 
3481 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3482 {
3483     return float32_muladd(d, b, a,
3484             float_muladd_negate_c | float_muladd_negate_product, s);
3485 }
3486 
3487 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3488 {
3489     return float64_muladd(d, b, a,
3490             float_muladd_negate_c | float_muladd_negate_product, s);
3491 }
3492 
3493 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3494 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3495 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3496 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3497 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3498 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3499 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3500 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3501 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3502 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3503 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3504 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3505 
3506 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3507 {
3508     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3509 }
3510 
3511 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3512 {
3513     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3514 }
3515 
3516 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3517 {
3518     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3519 }
3520 
3521 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3522 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3523 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3524 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3525 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3526 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3527 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3528 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3529 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3530 GEN_VEXT_VF(vfmsub_vf_h, 2)
3531 GEN_VEXT_VF(vfmsub_vf_w, 4)
3532 GEN_VEXT_VF(vfmsub_vf_d, 8)
3533 
3534 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3535 {
3536     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3537 }
3538 
3539 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3540 {
3541     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3542 }
3543 
3544 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3545 {
3546     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3547 }
3548 
3549 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3550 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3551 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3552 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3553 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3554 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3555 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3556 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3557 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3558 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3559 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3560 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3561 
3562 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3563 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3564 {
3565     return float32_muladd(float16_to_float32(a, true, s),
3566                         float16_to_float32(b, true, s), d, 0, s);
3567 }
3568 
3569 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3570 {
3571     return float64_muladd(float32_to_float64(a, s),
3572                         float32_to_float64(b, s), d, 0, s);
3573 }
3574 
3575 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3576 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3577 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3578 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3579 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3580 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3581 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3582 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3583 
3584 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3585 {
3586     return float32_muladd(float16_to_float32(a, true, s),
3587                         float16_to_float32(b, true, s), d,
3588                         float_muladd_negate_c | float_muladd_negate_product, s);
3589 }
3590 
3591 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3592 {
3593     return float64_muladd(float32_to_float64(a, s),
3594                         float32_to_float64(b, s), d,
3595                         float_muladd_negate_c | float_muladd_negate_product, s);
3596 }
3597 
3598 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3599 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3600 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3601 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3602 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3603 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3604 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3605 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3606 
3607 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3608 {
3609     return float32_muladd(float16_to_float32(a, true, s),
3610                         float16_to_float32(b, true, s), d,
3611                         float_muladd_negate_c, s);
3612 }
3613 
3614 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3615 {
3616     return float64_muladd(float32_to_float64(a, s),
3617                         float32_to_float64(b, s), d,
3618                         float_muladd_negate_c, s);
3619 }
3620 
3621 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3622 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3623 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3624 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3625 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3626 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3627 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3628 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3629 
3630 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3631 {
3632     return float32_muladd(float16_to_float32(a, true, s),
3633                         float16_to_float32(b, true, s), d,
3634                         float_muladd_negate_product, s);
3635 }
3636 
3637 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3638 {
3639     return float64_muladd(float32_to_float64(a, s),
3640                         float32_to_float64(b, s), d,
3641                         float_muladd_negate_product, s);
3642 }
3643 
3644 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3645 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3646 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3647 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3648 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3649 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3650 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3651 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3652 
3653 /* Vector Floating-Point Square-Root Instruction */
3654 /* (TD, T2, TX2) */
#define OP_UU_H uint16_t, uint16_t, uint16_t
#define OP_UU_W uint32_t, uint32_t, uint32_t
#define OP_UU_D uint64_t, uint64_t, uint64_t

/*
 * Define do_<NAME>(): per-element worker for a one-operand floating-point
 * operation.  Element i of vs2 is loaded, OP is applied with
 * env->fp_status, and the result is stored into element i of vd.
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, void *vs2, int i,      \
        CPURISCVState *env)                            \
{                                                      \
    TD *dst = (TD *)vd + HD(i);                        \
    TX2 src = *((T2 *)vs2 + HS2(i));                   \
                                                       \
    *dst = OP(src, &env->fp_status);                   \
}
3666 
/*
 * Emit HELPER(NAME) for a single-source vector operation whose per-element
 * worker do_<NAME>() takes env.  When env->vl is zero the helper returns
 * immediately without touching vd or env->vstart.  Otherwise elements from
 * env->vstart up to vl are visited: an element is computed when
 * vext_vm(desc) is non-zero or its bit in v0 is set, else its ESZ bytes are
 * passed to vext_set_elems_1s() with the vma setting.  Finally env->vstart
 * is cleared and the tail bytes are passed to vext_set_elems_1s() with the
 * vta setting.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
        CPURISCVState *env, uint32_t desc)             \
{                                                      \
    uint32_t unmasked = vext_vm(desc);                 \
    uint32_t len = env->vl;                            \
    uint32_t max_elems =                               \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t tail_policy = vext_vta(desc);             \
    uint32_t mask_policy = vext_vma(desc);             \
    uint32_t idx;                                      \
                                                       \
    if (len == 0) {                                    \
        return;                                        \
    }                                                  \
    for (idx = env->vstart; idx < len; idx++) {        \
        if (unmasked || vext_elem_mask(v0, idx)) {     \
            do_##NAME(vd, vs2, idx, env);              \
        } else {                                       \
            /* set masked-off elements to 1s */        \
            vext_set_elems_1s(vd, mask_policy,         \
                              idx * ESZ,               \
                              (idx + 1) * ESZ);        \
        }                                              \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, tail_policy, len * ESZ,      \
                      max_elems * ESZ);                \
}
3695 
3696 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3697 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3698 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3699 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3700 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3701 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3702 
3703 /*
3704  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3705  *
3706  * Adapted from riscv-v-spec recip.c:
3707  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3708  */
/*
 * Bit-level core of the 7-bit reciprocal square-root estimate.
 *
 * @f:         raw encoding of the input; the per-format callers are
 *             expected to have filtered out special values already
 *             (NOTE(review): confirm for all callers)
 * @exp_size:  width of the exponent field in bits
 * @frac_size: width of the fraction field in bits
 *
 * Returns the raw encoding of the estimate with the same field layout.
 * The input's sign bit is copied to the result unchanged.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * 128-entry table indexed by {exp LSB, top six fraction bits}.
     * Kept static so it lives in read-only storage rather than being an
     * automatic array initialised on each call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal: shift until the top fraction bit is
         * set, decrementing exp for every step.  exp is unsigned, so it
         * wraps below zero; the ~exp term below relies on that.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* Drop the now-implicit leading one. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    /* Index = exponent LSB followed by the top (precision - 1) frac bits. */
    int idx = ((exp & 1) << (precision - 1)) |
                (frac >> (frac_size - precision + 1));
    /* The 7-bit table value becomes the top bits of the output fraction. */
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                            (frac_size - precision);
    /*
     * MAKE_64BIT_MASK(0, exp_size - 1) is 2^(exp_size - 1) - 1, and ~exp
     * equals -exp - 1 modulo 2^64, so this is (3 * mask - 1 - exp) / 2.
     */
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3757 
3758 static float16 frsqrt7_h(float16 f, float_status *s)
3759 {
3760     int exp_size = 5, frac_size = 10;
3761     bool sign = float16_is_neg(f);
3762 
3763     /*
3764      * frsqrt7(sNaN) = canonical NaN
3765      * frsqrt7(-inf) = canonical NaN
3766      * frsqrt7(-normal) = canonical NaN
3767      * frsqrt7(-subnormal) = canonical NaN
3768      */
3769     if (float16_is_signaling_nan(f, s) ||
3770             (float16_is_infinity(f) && sign) ||
3771             (float16_is_normal(f) && sign) ||
3772             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3773         s->float_exception_flags |= float_flag_invalid;
3774         return float16_default_nan(s);
3775     }
3776 
3777     /* frsqrt7(qNaN) = canonical NaN */
3778     if (float16_is_quiet_nan(f, s)) {
3779         return float16_default_nan(s);
3780     }
3781 
3782     /* frsqrt7(+-0) = +-inf */
3783     if (float16_is_zero(f)) {
3784         s->float_exception_flags |= float_flag_divbyzero;
3785         return float16_set_sign(float16_infinity, sign);
3786     }
3787 
3788     /* frsqrt7(+inf) = +0 */
3789     if (float16_is_infinity(f) && !sign) {
3790         return float16_set_sign(float16_zero, sign);
3791     }
3792 
3793     /* +normal, +subnormal */
3794     uint64_t val = frsqrt7(f, exp_size, frac_size);
3795     return make_float16(val);
3796 }
3797 
3798 static float32 frsqrt7_s(float32 f, float_status *s)
3799 {
3800     int exp_size = 8, frac_size = 23;
3801     bool sign = float32_is_neg(f);
3802 
3803     /*
3804      * frsqrt7(sNaN) = canonical NaN
3805      * frsqrt7(-inf) = canonical NaN
3806      * frsqrt7(-normal) = canonical NaN
3807      * frsqrt7(-subnormal) = canonical NaN
3808      */
3809     if (float32_is_signaling_nan(f, s) ||
3810             (float32_is_infinity(f) && sign) ||
3811             (float32_is_normal(f) && sign) ||
3812             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3813         s->float_exception_flags |= float_flag_invalid;
3814         return float32_default_nan(s);
3815     }
3816 
3817     /* frsqrt7(qNaN) = canonical NaN */
3818     if (float32_is_quiet_nan(f, s)) {
3819         return float32_default_nan(s);
3820     }
3821 
3822     /* frsqrt7(+-0) = +-inf */
3823     if (float32_is_zero(f)) {
3824         s->float_exception_flags |= float_flag_divbyzero;
3825         return float32_set_sign(float32_infinity, sign);
3826     }
3827 
3828     /* frsqrt7(+inf) = +0 */
3829     if (float32_is_infinity(f) && !sign) {
3830         return float32_set_sign(float32_zero, sign);
3831     }
3832 
3833     /* +normal, +subnormal */
3834     uint64_t val = frsqrt7(f, exp_size, frac_size);
3835     return make_float32(val);
3836 }
3837 
3838 static float64 frsqrt7_d(float64 f, float_status *s)
3839 {
3840     int exp_size = 11, frac_size = 52;
3841     bool sign = float64_is_neg(f);
3842 
3843     /*
3844      * frsqrt7(sNaN) = canonical NaN
3845      * frsqrt7(-inf) = canonical NaN
3846      * frsqrt7(-normal) = canonical NaN
3847      * frsqrt7(-subnormal) = canonical NaN
3848      */
3849     if (float64_is_signaling_nan(f, s) ||
3850             (float64_is_infinity(f) && sign) ||
3851             (float64_is_normal(f) && sign) ||
3852             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3853         s->float_exception_flags |= float_flag_invalid;
3854         return float64_default_nan(s);
3855     }
3856 
3857     /* frsqrt7(qNaN) = canonical NaN */
3858     if (float64_is_quiet_nan(f, s)) {
3859         return float64_default_nan(s);
3860     }
3861 
3862     /* frsqrt7(+-0) = +-inf */
3863     if (float64_is_zero(f)) {
3864         s->float_exception_flags |= float_flag_divbyzero;
3865         return float64_set_sign(float64_infinity, sign);
3866     }
3867 
3868     /* frsqrt7(+inf) = +0 */
3869     if (float64_is_infinity(f) && !sign) {
3870         return float64_set_sign(float64_zero, sign);
3871     }
3872 
3873     /* +normal, +subnormal */
3874     uint64_t val = frsqrt7(f, exp_size, frac_size);
3875     return make_float64(val);
3876 }
3877 
/*
 * vfrsqrt7_v_{h,w,d}: per-element routines and vector helpers for element
 * sizes of 2, 4 and 8 bytes, using frsqrt7_h/_s/_d as the element operation.
 */
RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3884 
3885 /*
3886  * Vector Floating-Point Reciprocal Estimate Instruction
3887  *
3888  * Adapted from riscv-v-spec recip.c:
3889  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3890  */
3891 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3892                       float_status *s)
3893 {
3894     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3895     uint64_t exp = extract64(f, frac_size, exp_size);
3896     uint64_t frac = extract64(f, 0, frac_size);
3897 
3898     const uint8_t lookup_table[] = {
3899         127, 125, 123, 121, 119, 117, 116, 114,
3900         112, 110, 109, 107, 105, 104, 102, 100,
3901         99, 97, 96, 94, 93, 91, 90, 88,
3902         87, 85, 84, 83, 81, 80, 79, 77,
3903         76, 75, 74, 72, 71, 70, 69, 68,
3904         66, 65, 64, 63, 62, 61, 60, 59,
3905         58, 57, 56, 55, 54, 53, 52, 51,
3906         50, 49, 48, 47, 46, 45, 44, 43,
3907         42, 41, 40, 40, 39, 38, 37, 36,
3908         35, 35, 34, 33, 32, 31, 31, 30,
3909         29, 28, 28, 27, 26, 25, 25, 24,
3910         23, 23, 22, 21, 21, 20, 19, 19,
3911         18, 17, 17, 16, 15, 15, 14, 14,
3912         13, 12, 12, 11, 11, 10, 9, 9,
3913         8, 8, 7, 7, 6, 5, 5, 4,
3914         4, 3, 3, 2, 2, 1, 1, 0
3915     };
3916     const int precision = 7;
3917 
3918     if (exp == 0 && frac != 0) { /* subnormal */
3919         /* Normalize the subnormal. */
3920         while (extract64(frac, frac_size - 1, 1) == 0) {
3921             exp--;
3922             frac <<= 1;
3923         }
3924 
3925         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3926 
3927         if (exp != 0 && exp != UINT64_MAX) {
3928             /*
3929              * Overflow to inf or max value of same sign,
3930              * depending on sign and rounding mode.
3931              */
3932             s->float_exception_flags |= (float_flag_inexact |
3933                                          float_flag_overflow);
3934 
3935             if ((s->float_rounding_mode == float_round_to_zero) ||
3936                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3937                 ((s->float_rounding_mode == float_round_up) && sign)) {
3938                 /* Return greatest/negative finite value. */
3939                 return (sign << (exp_size + frac_size)) |
3940                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3941             } else {
3942                 /* Return +-inf. */
3943                 return (sign << (exp_size + frac_size)) |
3944                     MAKE_64BIT_MASK(frac_size, exp_size);
3945             }
3946         }
3947     }
3948 
3949     int idx = frac >> (frac_size - precision);
3950     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3951                             (frac_size - precision);
3952     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3953 
3954     if (out_exp == 0 || out_exp == UINT64_MAX) {
3955         /*
3956          * The result is subnormal, but don't raise the underflow exception,
3957          * because there's no additional loss of precision.
3958          */
3959         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3960         if (out_exp == UINT64_MAX) {
3961             out_frac >>= 1;
3962             out_exp = 0;
3963         }
3964     }
3965 
3966     uint64_t val = 0;
3967     val = deposit64(val, 0, frac_size, out_frac);
3968     val = deposit64(val, frac_size, exp_size, out_exp);
3969     val = deposit64(val, frac_size + exp_size, 1, sign);
3970     return val;
3971 }
3972 
3973 static float16 frec7_h(float16 f, float_status *s)
3974 {
3975     int exp_size = 5, frac_size = 10;
3976     bool sign = float16_is_neg(f);
3977 
3978     /* frec7(+-inf) = +-0 */
3979     if (float16_is_infinity(f)) {
3980         return float16_set_sign(float16_zero, sign);
3981     }
3982 
3983     /* frec7(+-0) = +-inf */
3984     if (float16_is_zero(f)) {
3985         s->float_exception_flags |= float_flag_divbyzero;
3986         return float16_set_sign(float16_infinity, sign);
3987     }
3988 
3989     /* frec7(sNaN) = canonical NaN */
3990     if (float16_is_signaling_nan(f, s)) {
3991         s->float_exception_flags |= float_flag_invalid;
3992         return float16_default_nan(s);
3993     }
3994 
3995     /* frec7(qNaN) = canonical NaN */
3996     if (float16_is_quiet_nan(f, s)) {
3997         return float16_default_nan(s);
3998     }
3999 
4000     /* +-normal, +-subnormal */
4001     uint64_t val = frec7(f, exp_size, frac_size, s);
4002     return make_float16(val);
4003 }
4004 
4005 static float32 frec7_s(float32 f, float_status *s)
4006 {
4007     int exp_size = 8, frac_size = 23;
4008     bool sign = float32_is_neg(f);
4009 
4010     /* frec7(+-inf) = +-0 */
4011     if (float32_is_infinity(f)) {
4012         return float32_set_sign(float32_zero, sign);
4013     }
4014 
4015     /* frec7(+-0) = +-inf */
4016     if (float32_is_zero(f)) {
4017         s->float_exception_flags |= float_flag_divbyzero;
4018         return float32_set_sign(float32_infinity, sign);
4019     }
4020 
4021     /* frec7(sNaN) = canonical NaN */
4022     if (float32_is_signaling_nan(f, s)) {
4023         s->float_exception_flags |= float_flag_invalid;
4024         return float32_default_nan(s);
4025     }
4026 
4027     /* frec7(qNaN) = canonical NaN */
4028     if (float32_is_quiet_nan(f, s)) {
4029         return float32_default_nan(s);
4030     }
4031 
4032     /* +-normal, +-subnormal */
4033     uint64_t val = frec7(f, exp_size, frac_size, s);
4034     return make_float32(val);
4035 }
4036 
4037 static float64 frec7_d(float64 f, float_status *s)
4038 {
4039     int exp_size = 11, frac_size = 52;
4040     bool sign = float64_is_neg(f);
4041 
4042     /* frec7(+-inf) = +-0 */
4043     if (float64_is_infinity(f)) {
4044         return float64_set_sign(float64_zero, sign);
4045     }
4046 
4047     /* frec7(+-0) = +-inf */
4048     if (float64_is_zero(f)) {
4049         s->float_exception_flags |= float_flag_divbyzero;
4050         return float64_set_sign(float64_infinity, sign);
4051     }
4052 
4053     /* frec7(sNaN) = canonical NaN */
4054     if (float64_is_signaling_nan(f, s)) {
4055         s->float_exception_flags |= float_flag_invalid;
4056         return float64_default_nan(s);
4057     }
4058 
4059     /* frec7(qNaN) = canonical NaN */
4060     if (float64_is_quiet_nan(f, s)) {
4061         return float64_default_nan(s);
4062     }
4063 
4064     /* +-normal, +-subnormal */
4065     uint64_t val = frec7(f, exp_size, frac_size, s);
4066     return make_float64(val);
4067 }
4068 
/*
 * vfrec7_v_{h,w,d}: per-element routines and vector helpers for element
 * sizes of 2, 4 and 8 bytes, using frec7_h/_s/_d as the element operation.
 */
RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
GEN_VEXT_V_ENV(vfrec7_v_h, 2)
GEN_VEXT_V_ENV(vfrec7_v_w, 4)
GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4075 
/*
 * Vector Floating-Point MIN/MAX Instructions
 *
 * vfmin and vfmax use the softfloat floatN_minimum_number and
 * floatN_maximum_number functions as their element operation.  Each is
 * instantiated in a vector-vector (_vv) and a vector-scalar (_vf) form for
 * element sizes of 2, 4 and 8 bytes.
 */
RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
GEN_VEXT_VF(vfmin_vf_h, 2)
GEN_VEXT_VF(vfmin_vf_w, 4)
GEN_VEXT_VF(vfmin_vf_d, 8)

RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
GEN_VEXT_VF(vfmax_vf_h, 2)
GEN_VEXT_VF(vfmax_vf_w, 4)
GEN_VEXT_VF(vfmax_vf_d, 8)
4102 
4103 /* Vector Floating-Point Sign-Injection Instructions */
4104 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4105 {
4106     return deposit64(b, 0, 15, a);
4107 }
4108 
4109 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4110 {
4111     return deposit64(b, 0, 31, a);
4112 }
4113 
4114 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4115 {
4116     return deposit64(b, 0, 63, a);
4117 }
4118 
/*
 * vfsgnj in vector-vector (_vv) and vector-scalar (_vf) forms for element
 * sizes of 2, 4 and 8 bytes, using fsgnj16/32/64 as the element operation.
 */
RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
GEN_VEXT_VF(vfsgnj_vf_h, 2)
GEN_VEXT_VF(vfsgnj_vf_w, 4)
GEN_VEXT_VF(vfsgnj_vf_d, 8)
4131 
4132 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4133 {
4134     return deposit64(~b, 0, 15, a);
4135 }
4136 
4137 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4138 {
4139     return deposit64(~b, 0, 31, a);
4140 }
4141 
4142 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4143 {
4144     return deposit64(~b, 0, 63, a);
4145 }
4146 
/*
 * vfsgnjn in vector-vector (_vv) and vector-scalar (_vf) forms for element
 * sizes of 2, 4 and 8 bytes, using fsgnjn16/32/64 as the element operation.
 */
RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
GEN_VEXT_VF(vfsgnjn_vf_h, 2)
GEN_VEXT_VF(vfsgnjn_vf_w, 4)
GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4159 
4160 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4161 {
4162     return deposit64(b ^ a, 0, 15, a);
4163 }
4164 
4165 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4166 {
4167     return deposit64(b ^ a, 0, 31, a);
4168 }
4169 
4170 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4171 {
4172     return deposit64(b ^ a, 0, 63, a);
4173 }
4174 
/*
 * vfsgnjx in vector-vector (_vv) and vector-scalar (_vf) forms for element
 * sizes of 2, 4 and 8 bytes, using fsgnjx16/32/64 as the element operation.
 */
RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
GEN_VEXT_VF(vfsgnjx_vf_h, 2)
GEN_VEXT_VF(vfsgnjx_vf_w, 4)
GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4187 
4188 /* Vector Floating-Point Compare Instructions */
/*
 * Define HELPER(NAME): a vector-vector floating-point compare that writes
 * one mask bit per element into vd.
 *
 *   ETYPE  - unsigned integer type holding one element
 *   H      - host index adjustment macro for that element size
 *   DO_OP  - predicate called as DO_OP(vs2 element, vs1 element, fp_status)
 *
 * Elements from env->vstart up to env->vl are processed; env->vstart is
 * cleared afterwards.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,           \
                  CPURISCVState *env, uint32_t desc)                  \
{                                                                     \
    const uint32_t unmasked = vext_vm(desc);                          \
    const uint32_t active_len = env->vl;                              \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;            \
    const uint32_t fill_tail = vext_vta_all_1s(desc);                 \
    const uint32_t fill_masked = vext_vma(desc);                      \
    uint32_t idx = env->vstart;                                       \
                                                                      \
    while (idx < active_len) {                                        \
        if (unmasked || vext_elem_mask(v0, idx)) {                    \
            ETYPE rhs = *((ETYPE *)vs1 + H(idx));                     \
            ETYPE lhs = *((ETYPE *)vs2 + H(idx));                     \
            vext_set_elem_mask(vd, idx,                               \
                               DO_OP(lhs, rhs, &env->fp_status));     \
        } else if (fill_masked) {                                     \
            /* masked-off element with vma set: write a 1 */          \
            vext_set_elem_mask(vd, idx, 1);                           \
        }                                                             \
        idx++;                                                        \
    }                                                                 \
    env->vstart = 0;                                                  \
    /*                                                                \
     * A mask destination is always treated as tail-agnostic; when    \
     * requested, every remaining bit up to cfg.vlen is set to 1.     \
     */                                                               \
    if (fill_tail) {                                                  \
        while (idx < mask_bits) {                                     \
            vext_set_elem_mask(vd, idx, 1);                           \
            idx++;                                                    \
        }                                                             \
    }                                                                 \
}
4222 
/* vmfeq, vector-vector form: element predicate is floatN_eq_quiet. */
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4226 
/*
 * Define HELPER(NAME): a vector-scalar floating-point compare that writes
 * one mask bit per element into vd.
 *
 *   ETYPE  - unsigned integer type holding one element
 *   H      - host index adjustment macro for that element size
 *   DO_OP  - predicate called as DO_OP(vs2 element, (ETYPE)s1, fp_status)
 *
 * Elements from env->vstart up to env->vl are processed; env->vstart is
 * cleared afterwards.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                  \
{                                                                     \
    const uint32_t unmasked = vext_vm(desc);                          \
    const uint32_t active_len = env->vl;                              \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;            \
    const uint32_t fill_tail = vext_vta_all_1s(desc);                 \
    const uint32_t fill_masked = vext_vma(desc);                      \
    uint32_t idx = env->vstart;                                       \
                                                                      \
    while (idx < active_len) {                                        \
        if (unmasked || vext_elem_mask(v0, idx)) {                    \
            ETYPE lhs = *((ETYPE *)vs2 + H(idx));                     \
            vext_set_elem_mask(vd, idx,                               \
                               DO_OP(lhs, (ETYPE)s1,                  \
                                     &env->fp_status));               \
        } else if (fill_masked) {                                     \
            /* masked-off element with vma set: write a 1 */          \
            vext_set_elem_mask(vd, idx, 1);                           \
        }                                                             \
        idx++;                                                        \
    }                                                                 \
    env->vstart = 0;                                                  \
    /*                                                                \
     * A mask destination is always treated as tail-agnostic; when    \
     * requested, every remaining bit up to cfg.vlen is set to 1.     \
     */                                                               \
    if (fill_tail) {                                                  \
        while (idx < mask_bits) {                                     \
            vext_set_elem_mask(vd, idx, 1);                           \
            idx++;                                                    \
        }                                                             \
    }                                                                 \
}
4259 
/* vmfeq, vector-scalar form: element predicate is floatN_eq_quiet. */
GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4263 
4264 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4265 {
4266     FloatRelation compare = float16_compare_quiet(a, b, s);
4267     return compare != float_relation_equal;
4268 }
4269 
4270 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4271 {
4272     FloatRelation compare = float32_compare_quiet(a, b, s);
4273     return compare != float_relation_equal;
4274 }
4275 
4276 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4277 {
4278     FloatRelation compare = float64_compare_quiet(a, b, s);
4279     return compare != float_relation_equal;
4280 }
4281 
/* vmfne: vector-vector and vector-scalar forms built on vmfne16/32/64. */
GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)

/* vmflt: uses the softfloat floatN_lt predicates directly. */
GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)

/* vmfle: uses the softfloat floatN_le predicates directly. */
GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4302 
4303 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4304 {
4305     FloatRelation compare = float16_compare(a, b, s);
4306     return compare == float_relation_greater;
4307 }
4308 
4309 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4310 {
4311     FloatRelation compare = float32_compare(a, b, s);
4312     return compare == float_relation_greater;
4313 }
4314 
4315 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4316 {
4317     FloatRelation compare = float64_compare(a, b, s);
4318     return compare == float_relation_greater;
4319 }
4320 
/* vmfgt: only the vector-scalar form is instantiated here. */
GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4324 
4325 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4326 {
4327     FloatRelation compare = float16_compare(a, b, s);
4328     return compare == float_relation_greater ||
4329            compare == float_relation_equal;
4330 }
4331 
4332 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4333 {
4334     FloatRelation compare = float32_compare(a, b, s);
4335     return compare == float_relation_greater ||
4336            compare == float_relation_equal;
4337 }
4338 
4339 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4340 {
4341     FloatRelation compare = float64_compare(a, b, s);
4342     return compare == float_relation_greater ||
4343            compare == float_relation_equal;
4344 }
4345 
/* vmfge: only the vector-scalar form is instantiated here. */
GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4349 
4350 /* Vector Floating-Point Classify Instruction */
/*
 * Define do_NAME(): apply the unary operation OP to element i of vs2 and
 * store the result in element i of vd.
 *
 *   TD / HD   - destination element type and host index adjustment
 *   T2 / HS2  - source element type and host index adjustment
 *   TX2       - type the source element is converted to before calling OP
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    T2 *src = vs2;                                     \
    TD *dst = vd;                                      \
    TX2 operand = src[HS2(i)];                         \
                                                       \
    dst[HD(i)] = OP(operand);                          \
}
4357 
/*
 * Define HELPER(NAME): run do_NAME() over the elements from env->vstart up
 * to env->vl, where ESZ is the destination element size in bytes.
 *
 * An element whose mask bit in v0 is clear (and vm is 0) is not computed;
 * its byte range is passed to vext_set_elems_1s() together with the vma
 * flag.  After the loop env->vstart is cleared and the tail byte range is
 * passed to vext_set_elems_1s() together with the vta flag.
 */
#define GEN_VEXT_V(NAME, ESZ)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)           \
{                                                              \
    const uint32_t unmasked = vext_vm(desc);                   \
    const uint32_t active_len = env->vl;                       \
    const uint32_t elem_cap =                                  \
        vext_get_total_elems(env, desc, ESZ);                  \
    const uint32_t tail_agnostic = vext_vta(desc);             \
    const uint32_t mask_agnostic = vext_vma(desc);             \
    uint32_t idx;                                              \
                                                               \
    for (idx = env->vstart; idx < active_len; idx++) {         \
        if (unmasked || vext_elem_mask(v0, idx)) {             \
            do_##NAME(vd, vs2, idx);                           \
        } else {                                               \
            /* masked-off element */                           \
            vext_set_elems_1s(vd, mask_agnostic, idx * ESZ,    \
                              (idx + 1) * ESZ);                \
        }                                                      \
    }                                                          \
    env->vstart = 0;                                           \
    /* tail elements */                                        \
    vext_set_elems_1s(vd, tail_agnostic, active_len * ESZ,     \
                      elem_cap * ESZ);                         \
}
4384 
4385 target_ulong fclass_h(uint64_t frs1)
4386 {
4387     float16 f = frs1;
4388     bool sign = float16_is_neg(f);
4389 
4390     if (float16_is_infinity(f)) {
4391         return sign ? 1 << 0 : 1 << 7;
4392     } else if (float16_is_zero(f)) {
4393         return sign ? 1 << 3 : 1 << 4;
4394     } else if (float16_is_zero_or_denormal(f)) {
4395         return sign ? 1 << 2 : 1 << 5;
4396     } else if (float16_is_any_nan(f)) {
4397         float_status s = { }; /* for snan_bit_is_one */
4398         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4399     } else {
4400         return sign ? 1 << 1 : 1 << 6;
4401     }
4402 }
4403 
4404 target_ulong fclass_s(uint64_t frs1)
4405 {
4406     float32 f = frs1;
4407     bool sign = float32_is_neg(f);
4408 
4409     if (float32_is_infinity(f)) {
4410         return sign ? 1 << 0 : 1 << 7;
4411     } else if (float32_is_zero(f)) {
4412         return sign ? 1 << 3 : 1 << 4;
4413     } else if (float32_is_zero_or_denormal(f)) {
4414         return sign ? 1 << 2 : 1 << 5;
4415     } else if (float32_is_any_nan(f)) {
4416         float_status s = { }; /* for snan_bit_is_one */
4417         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4418     } else {
4419         return sign ? 1 << 1 : 1 << 6;
4420     }
4421 }
4422 
4423 target_ulong fclass_d(uint64_t frs1)
4424 {
4425     float64 f = frs1;
4426     bool sign = float64_is_neg(f);
4427 
4428     if (float64_is_infinity(f)) {
4429         return sign ? 1 << 0 : 1 << 7;
4430     } else if (float64_is_zero(f)) {
4431         return sign ? 1 << 3 : 1 << 4;
4432     } else if (float64_is_zero_or_denormal(f)) {
4433         return sign ? 1 << 2 : 1 << 5;
4434     } else if (float64_is_any_nan(f)) {
4435         float_status s = { }; /* for snan_bit_is_one */
4436         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4437     } else {
4438         return sign ? 1 << 1 : 1 << 6;
4439     }
4440 }
4441 
/*
 * vfclass_v_{h,w,d}: OPIVV1 defines do_vfclass_v_* around fclass_h/_s/_d and
 * GEN_VEXT_V defines the vector helper for element sizes of 2, 4 and 8
 * bytes.
 */
RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
GEN_VEXT_V(vfclass_v_h, 2)
GEN_VEXT_V(vfclass_v_w, 4)
GEN_VEXT_V(vfclass_v_d, 8)
4448 
4449 /* Vector Floating-Point Merge Instruction */
4450 
/*
 * Define HELPER(NAME): merge the scalar s1 with the vector vs2 into vd.
 *
 * For each element from env->vstart up to env->vl, vd receives the vs2
 * element when the instruction is masked (vm == 0) and the element's mask
 * bit in v0 is clear; otherwise it receives s1 converted to ETYPE.
 * env->vstart is cleared afterwards and the tail byte range is passed to
 * vext_set_elems_1s() together with the vta flag.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    const uint32_t unmasked = vext_vm(desc);                  \
    const uint32_t active_len = env->vl;                      \
    const uint32_t elem_bytes = sizeof(ETYPE);                \
    const uint32_t elem_cap =                                 \
        vext_get_total_elems(env, desc, elem_bytes);          \
    const uint32_t tail_agnostic = vext_vta(desc);            \
    uint32_t idx;                                             \
                                                              \
    for (idx = env->vstart; idx < active_len; idx++) {        \
        ETYPE *dst = (ETYPE *)vd + H(idx);                    \
                                                              \
        if (unmasked || vext_elem_mask(v0, idx)) {            \
            *dst = s1;                                        \
        } else {                                              \
            *dst = *((ETYPE *)vs2 + H(idx));                  \
        }                                                     \
    }                                                         \
    env->vstart = 0;                                          \
    /* tail elements */                                       \
    vext_set_elems_1s(vd, tail_agnostic,                      \
                      active_len * elem_bytes,                \
                      elem_cap * elem_bytes);                 \
}
4472 
/* vfmerge_vfm_{h,w,d}: merge helpers for 2-, 4- and 8-byte elements. */
GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4476 
/*
 * Single-Width Floating-Point/Integer Type-Convert Instructions
 *
 * Source and destination elements have the same size (2, 4 or 8 bytes);
 * each conversion uses the matching softfloat conversion function.
 */
/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)

/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)

/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)

/* vfcvt.f.x.v vd, vs2, vm # Convert signed integer to float. */
RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4509 
4510 /* Widening Floating-Point/Integer Type-Convert Instructions */
4511 /* (TD, T2, TX2) */
4512 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4513 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4514 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4515 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4516 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4517 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4518 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4519 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4520 
4521 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4522 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4523 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4524 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4525 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4526 
4527 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4528 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4529 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4530 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4531 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4532 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4533 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4534 
4535 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4536 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4537 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4538 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4539 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4540 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4541 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4542 
4543 /*
4544  * vfwcvt.f.f.v vd, vs2, vm
4545  * Convert single-width float to double-width float.
4546  */
4547 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4548 {
4549     return float16_to_float32(a, true, s);
4550 }
4551 
4552 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4553 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4554 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4555 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4556 
4557 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4558 /* (TD, T2, TX2) */
4559 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4560 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4561 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4562 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4563 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4564 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4565 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4566 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4567 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4568 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4569 
4570 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4571 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4572 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4573 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4574 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4575 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4576 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4577 
4578 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4579 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4580 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4581 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4582 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4583 
4584 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4585 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4586 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4587 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4588 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4589 
4590 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4591 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4592 {
4593     return float32_to_float16(a, true, s);
4594 }
4595 
4596 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4597 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4598 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4599 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4600 
4601 /*
4602  *** Vector Reduction Operations
4603  */
4604 /* Vector Single-Width Integer Reduction Instructions */
4605 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4606 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4607         void *vs2, CPURISCVState *env, uint32_t desc)     \
4608 {                                                         \
4609     uint32_t vm = vext_vm(desc);                          \
4610     uint32_t vl = env->vl;                                \
4611     uint32_t esz = sizeof(TD);                            \
4612     uint32_t vlenb = simd_maxsz(desc);                    \
4613     uint32_t vta = vext_vta(desc);                        \
4614     uint32_t i;                                           \
4615     TD s1 =  *((TD *)vs1 + HD(0));                        \
4616                                                           \
4617     for (i = env->vstart; i < vl; i++) {                  \
4618         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4619         if (!vm && !vext_elem_mask(v0, i)) {              \
4620             continue;                                     \
4621         }                                                 \
4622         s1 = OP(s1, (TD)s2);                              \
4623     }                                                     \
4624     *((TD *)vd + HD(0)) = s1;                             \
4625     env->vstart = 0;                                      \
4626     /* set tail elements to 1s */                         \
4627     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4628 }
4629 
4630 /* vd[0] = sum(vs1[0], vs2[*]) */
4631 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4632 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4633 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4634 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4635 
4636 /* vd[0] = maxu(vs1[0], vs2[*]) */
4637 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4638 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4639 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4640 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4641 
4642 /* vd[0] = max(vs1[0], vs2[*]) */
4643 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4644 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4645 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4646 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4647 
4648 /* vd[0] = minu(vs1[0], vs2[*]) */
4649 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4650 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4651 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4652 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4653 
4654 /* vd[0] = min(vs1[0], vs2[*]) */
4655 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4656 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4657 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4658 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4659 
4660 /* vd[0] = and(vs1[0], vs2[*]) */
4661 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4662 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4663 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4664 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4665 
4666 /* vd[0] = or(vs1[0], vs2[*]) */
4667 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4668 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4669 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4670 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4671 
4672 /* vd[0] = xor(vs1[0], vs2[*]) */
4673 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4674 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4675 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4676 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4677 
4678 /* Vector Widening Integer Reduction Instructions */
4679 /* signed sum reduction into double-width accumulator */
4680 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4681 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4682 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4683 
4684 /* Unsigned sum reduction into double-width accumulator */
4685 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4686 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4687 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4688 
4689 /* Vector Single-Width Floating-Point Reduction Instructions */
4690 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4691 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4692                   void *vs2, CPURISCVState *env,           \
4693                   uint32_t desc)                           \
4694 {                                                          \
4695     uint32_t vm = vext_vm(desc);                           \
4696     uint32_t vl = env->vl;                                 \
4697     uint32_t esz = sizeof(TD);                             \
4698     uint32_t vlenb = simd_maxsz(desc);                     \
4699     uint32_t vta = vext_vta(desc);                         \
4700     uint32_t i;                                            \
4701     TD s1 =  *((TD *)vs1 + HD(0));                         \
4702                                                            \
4703     for (i = env->vstart; i < vl; i++) {                   \
4704         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4705         if (!vm && !vext_elem_mask(v0, i)) {               \
4706             continue;                                      \
4707         }                                                  \
4708         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4709     }                                                      \
4710     *((TD *)vd + HD(0)) = s1;                              \
4711     env->vstart = 0;                                       \
4712     /* set tail elements to 1s */                          \
4713     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4714 }
4715 
4716 /* Unordered sum */
4717 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4718 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4719 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4720 
4721 /* Maximum value */
4722 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4723 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4724 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4725 
4726 /* Minimum value */
4727 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4728 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4729 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4730 
4731 /* Vector Widening Floating-Point Reduction Instructions */
4732 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4733 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4734                             void *vs2, CPURISCVState *env, uint32_t desc)
4735 {
4736     uint32_t vm = vext_vm(desc);
4737     uint32_t vl = env->vl;
4738     uint32_t esz = sizeof(uint32_t);
4739     uint32_t vlenb = simd_maxsz(desc);
4740     uint32_t vta = vext_vta(desc);
4741     uint32_t i;
4742     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4743 
4744     for (i = env->vstart; i < vl; i++) {
4745         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4746         if (!vm && !vext_elem_mask(v0, i)) {
4747             continue;
4748         }
4749         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4750                          &env->fp_status);
4751     }
4752     *((uint32_t *)vd + H4(0)) = s1;
4753     env->vstart = 0;
4754     /* set tail elements to 1s */
4755     vext_set_elems_1s(vd, vta, esz, vlenb);
4756 }
4757 
4758 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4759                             void *vs2, CPURISCVState *env, uint32_t desc)
4760 {
4761     uint32_t vm = vext_vm(desc);
4762     uint32_t vl = env->vl;
4763     uint32_t esz = sizeof(uint64_t);
4764     uint32_t vlenb = simd_maxsz(desc);
4765     uint32_t vta = vext_vta(desc);
4766     uint32_t i;
4767     uint64_t s1 =  *((uint64_t *)vs1);
4768 
4769     for (i = env->vstart; i < vl; i++) {
4770         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4771         if (!vm && !vext_elem_mask(v0, i)) {
4772             continue;
4773         }
4774         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4775                          &env->fp_status);
4776     }
4777     *((uint64_t *)vd) = s1;
4778     env->vstart = 0;
4779     /* set tail elements to 1s */
4780     vext_set_elems_1s(vd, vta, esz, vlenb);
4781 }
4782 
4783 /*
4784  *** Vector Mask Operations
4785  */
/* Vector Mask-Register Logical Instructions */
/*
 * Emit one mask-register logical helper.
 *
 * For every element index in [vstart, vl) the mask bits of vs1 and vs2
 * are read and OP(vs2 bit, vs1 bit) is written to the same bit of vd.
 * v0 is not consulted.  When vext_vta_all_1s(desc) is set, the remaining
 * bits up to cfg.vlen are written as 1.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                            \
void HELPER(NAME)(void *vd, void *v0, void *vs1,              \
                  void *vs2, CPURISCVState *env,              \
                  uint32_t desc)                              \
{                                                             \
    const uint32_t vl = env->vl;                              \
    const uint32_t total_elems = env_archcpu(env)->cfg.vlen;  \
    uint32_t i = env->vstart;                                 \
                                                              \
    while (i < vl) {                                          \
        int s1_bit = vext_elem_mask(vs1, i);                  \
        int s2_bit = vext_elem_mask(vs2, i);                  \
                                                              \
        vext_set_elem_mask(vd, i, OP(s2_bit, s1_bit));        \
        i++;                                                  \
    }                                                         \
    env->vstart = 0;                                          \
    /* mask destination registers are always tail-agnostic */ \
    if (vext_vta_all_1s(desc)) {                              \
        /* set tail elements to 1s */                         \
        while (i < total_elems) {                             \
            vext_set_elem_mask(vd, i, 1);                     \
            i++;                                              \
        }                                                     \
    }                                                         \
}
4814 
/*
 * Single-bit combiners used by the mask-register logical helpers.
 * For 0/1 inputs each one yields 0 or 1; logical '!' serves as NOT.
 * Every argument is parenthesized so that a compound expression passed
 * as N or M keeps its own grouping after expansion.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4820 
4821 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4822 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4823 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4824 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4825 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4826 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4827 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4828 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4829 
4830 /* Vector count population in mask vcpop */
4831 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4832                              uint32_t desc)
4833 {
4834     target_ulong cnt = 0;
4835     uint32_t vm = vext_vm(desc);
4836     uint32_t vl = env->vl;
4837     int i;
4838 
4839     for (i = env->vstart; i < vl; i++) {
4840         if (vm || vext_elem_mask(v0, i)) {
4841             if (vext_elem_mask(vs2, i)) {
4842                 cnt++;
4843             }
4844         }
4845     }
4846     env->vstart = 0;
4847     return cnt;
4848 }
4849 
4850 /* vfirst find-first-set mask bit*/
4851 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4852                               uint32_t desc)
4853 {
4854     uint32_t vm = vext_vm(desc);
4855     uint32_t vl = env->vl;
4856     int i;
4857 
4858     for (i = env->vstart; i < vl; i++) {
4859         if (vm || vext_elem_mask(v0, i)) {
4860             if (vext_elem_mask(vs2, i)) {
4861                 return i;
4862             }
4863         }
4864     }
4865     env->vstart = 0;
4866     return -1LL;
4867 }
4868 
4869 enum set_mask_type {
4870     ONLY_FIRST = 1,
4871     INCLUDE_FIRST,
4872     BEFORE_FIRST,
4873 };
4874 
4875 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4876                    uint32_t desc, enum set_mask_type type)
4877 {
4878     uint32_t vm = vext_vm(desc);
4879     uint32_t vl = env->vl;
4880     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4881     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4882     uint32_t vma = vext_vma(desc);
4883     int i;
4884     bool first_mask_bit = false;
4885 
4886     for (i = env->vstart; i < vl; i++) {
4887         if (!vm && !vext_elem_mask(v0, i)) {
4888             /* set masked-off elements to 1s */
4889             if (vma) {
4890                 vext_set_elem_mask(vd, i, 1);
4891             }
4892             continue;
4893         }
4894         /* write a zero to all following active elements */
4895         if (first_mask_bit) {
4896             vext_set_elem_mask(vd, i, 0);
4897             continue;
4898         }
4899         if (vext_elem_mask(vs2, i)) {
4900             first_mask_bit = true;
4901             if (type == BEFORE_FIRST) {
4902                 vext_set_elem_mask(vd, i, 0);
4903             } else {
4904                 vext_set_elem_mask(vd, i, 1);
4905             }
4906         } else {
4907             if (type == ONLY_FIRST) {
4908                 vext_set_elem_mask(vd, i, 0);
4909             } else {
4910                 vext_set_elem_mask(vd, i, 1);
4911             }
4912         }
4913     }
4914     env->vstart = 0;
4915     /* mask destination register are always tail-agnostic */
4916     /* set tail elements to 1s */
4917     if (vta_all_1s) {
4918         for (; i < total_elems; i++) {
4919             vext_set_elem_mask(vd, i, 1);
4920         }
4921     }
4922 }
4923 
4924 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4925                      uint32_t desc)
4926 {
4927     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4928 }
4929 
4930 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4931                      uint32_t desc)
4932 {
4933     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4934 }
4935 
4936 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4937                      uint32_t desc)
4938 {
4939     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4940 }
4941 
4942 /* Vector Iota Instruction */
4943 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4944 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4945                   uint32_t desc)                                          \
4946 {                                                                         \
4947     uint32_t vm = vext_vm(desc);                                          \
4948     uint32_t vl = env->vl;                                                \
4949     uint32_t esz = sizeof(ETYPE);                                         \
4950     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4951     uint32_t vta = vext_vta(desc);                                        \
4952     uint32_t vma = vext_vma(desc);                                        \
4953     uint32_t sum = 0;                                                     \
4954     int i;                                                                \
4955                                                                           \
4956     for (i = env->vstart; i < vl; i++) {                                  \
4957         if (!vm && !vext_elem_mask(v0, i)) {                              \
4958             /* set masked-off elements to 1s */                           \
4959             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4960             continue;                                                     \
4961         }                                                                 \
4962         *((ETYPE *)vd + H(i)) = sum;                                      \
4963         if (vext_elem_mask(vs2, i)) {                                     \
4964             sum++;                                                        \
4965         }                                                                 \
4966     }                                                                     \
4967     env->vstart = 0;                                                      \
4968     /* set tail elements to 1s */                                         \
4969     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4970 }
4971 
4972 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4973 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4974 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4975 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4976 
4977 /* Vector Element Index Instruction */
4978 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4979 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4980 {                                                                         \
4981     uint32_t vm = vext_vm(desc);                                          \
4982     uint32_t vl = env->vl;                                                \
4983     uint32_t esz = sizeof(ETYPE);                                         \
4984     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4985     uint32_t vta = vext_vta(desc);                                        \
4986     uint32_t vma = vext_vma(desc);                                        \
4987     int i;                                                                \
4988                                                                           \
4989     for (i = env->vstart; i < vl; i++) {                                  \
4990         if (!vm && !vext_elem_mask(v0, i)) {                              \
4991             /* set masked-off elements to 1s */                           \
4992             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4993             continue;                                                     \
4994         }                                                                 \
4995         *((ETYPE *)vd + H(i)) = i;                                        \
4996     }                                                                     \
4997     env->vstart = 0;                                                      \
4998     /* set tail elements to 1s */                                         \
4999     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5000 }
5001 
5002 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5003 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5004 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5005 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5006 
5007 /*
5008  *** Vector Permutation Instructions
5009  */
5010 
5011 /* Vector Slide Instructions */
5012 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5013 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5014                   CPURISCVState *env, uint32_t desc)                      \
5015 {                                                                         \
5016     uint32_t vm = vext_vm(desc);                                          \
5017     uint32_t vl = env->vl;                                                \
5018     uint32_t esz = sizeof(ETYPE);                                         \
5019     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5020     uint32_t vta = vext_vta(desc);                                        \
5021     uint32_t vma = vext_vma(desc);                                        \
5022     target_ulong offset = s1, i_min, i;                                   \
5023                                                                           \
5024     i_min = MAX(env->vstart, offset);                                     \
5025     for (i = i_min; i < vl; i++) {                                        \
5026         if (!vm && !vext_elem_mask(v0, i)) {                              \
5027             /* set masked-off elements to 1s */                           \
5028             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5029             continue;                                                     \
5030         }                                                                 \
5031         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5032     }                                                                     \
5033     /* set tail elements to 1s */                                         \
5034     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5035 }
5036 
5037 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5038 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5039 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5040 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5041 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5042 
5043 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5044 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5045                   CPURISCVState *env, uint32_t desc)                      \
5046 {                                                                         \
5047     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5048     uint32_t vm = vext_vm(desc);                                          \
5049     uint32_t vl = env->vl;                                                \
5050     uint32_t esz = sizeof(ETYPE);                                         \
5051     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5052     uint32_t vta = vext_vta(desc);                                        \
5053     uint32_t vma = vext_vma(desc);                                        \
5054     target_ulong i_max, i;                                                \
5055                                                                           \
5056     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5057     for (i = env->vstart; i < i_max; ++i) {                               \
5058         if (!vm && !vext_elem_mask(v0, i)) {                              \
5059             /* set masked-off elements to 1s */                           \
5060             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5061             continue;                                                     \
5062         }                                                                 \
5063         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5064     }                                                                     \
5065                                                                           \
5066     for (i = i_max; i < vl; ++i) {                                        \
5067         if (vm || vext_elem_mask(v0, i)) {                                \
5068             *((ETYPE *)vd + H(i)) = 0;                                    \
5069         }                                                                 \
5070     }                                                                     \
5071                                                                           \
5072     env->vstart = 0;                                                      \
5073     /* set tail elements to 1s */                                         \
5074     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5075 }
5076 
5077 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5078 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5079 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5080 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5081 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5082 
5083 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5084 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5085                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5086 {                                                                           \
5087     typedef uint##BITWIDTH##_t ETYPE;                                       \
5088     uint32_t vm = vext_vm(desc);                                            \
5089     uint32_t vl = env->vl;                                                  \
5090     uint32_t esz = sizeof(ETYPE);                                           \
5091     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5092     uint32_t vta = vext_vta(desc);                                          \
5093     uint32_t vma = vext_vma(desc);                                          \
5094     uint32_t i;                                                             \
5095                                                                             \
5096     for (i = env->vstart; i < vl; i++) {                                    \
5097         if (!vm && !vext_elem_mask(v0, i)) {                                \
5098             /* set masked-off elements to 1s */                             \
5099             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5100             continue;                                                       \
5101         }                                                                   \
5102         if (i == 0) {                                                       \
5103             *((ETYPE *)vd + H(i)) = s1;                                     \
5104         } else {                                                            \
5105             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5106         }                                                                   \
5107     }                                                                       \
5108     env->vstart = 0;                                                        \
5109     /* set tail elements to 1s */                                           \
5110     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5111 }
5112 
5113 GEN_VEXT_VSLIE1UP(8,  H1)
5114 GEN_VEXT_VSLIE1UP(16, H2)
5115 GEN_VEXT_VSLIE1UP(32, H4)
5116 GEN_VEXT_VSLIE1UP(64, H8)
5117 
5118 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5119 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5120                   CPURISCVState *env, uint32_t desc)              \
5121 {                                                                 \
5122     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5123 }
5124 
5125 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5126 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5127 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5128 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5129 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5130 
5131 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5132 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5133                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5134 {                                                                             \
5135     typedef uint##BITWIDTH##_t ETYPE;                                         \
5136     uint32_t vm = vext_vm(desc);                                              \
5137     uint32_t vl = env->vl;                                                    \
5138     uint32_t esz = sizeof(ETYPE);                                             \
5139     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5140     uint32_t vta = vext_vta(desc);                                            \
5141     uint32_t vma = vext_vma(desc);                                            \
5142     uint32_t i;                                                               \
5143                                                                               \
5144     for (i = env->vstart; i < vl; i++) {                                      \
5145         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5146             /* set masked-off elements to 1s */                               \
5147             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5148             continue;                                                         \
5149         }                                                                     \
5150         if (i == vl - 1) {                                                    \
5151             *((ETYPE *)vd + H(i)) = s1;                                       \
5152         } else {                                                              \
5153             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5154         }                                                                     \
5155     }                                                                         \
5156     env->vstart = 0;                                                          \
5157     /* set tail elements to 1s */                                             \
5158     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5159 }
5160 
5161 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5162 GEN_VEXT_VSLIDE1DOWN(16, H2)
5163 GEN_VEXT_VSLIDE1DOWN(32, H4)
5164 GEN_VEXT_VSLIDE1DOWN(64, H8)
5165 
5166 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5167 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5168                   CPURISCVState *env, uint32_t desc)              \
5169 {                                                                 \
5170     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5171 }
5172 
5173 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5174 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5175 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5176 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5177 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5178 
5179 /* Vector Floating-Point Slide Instructions */
5180 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5181 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5182                   CPURISCVState *env, uint32_t desc)          \
5183 {                                                             \
5184     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5185 }
5186 
5187 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5188 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5189 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5190 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5191 
5192 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5193 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5194                   CPURISCVState *env, uint32_t desc)          \
5195 {                                                             \
5196     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5197 }
5198 
5199 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5200 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5201 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5202 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5203 
5204 /* Vector Register Gather Instruction */
5205 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5206 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5207                   CPURISCVState *env, uint32_t desc)                      \
5208 {                                                                         \
5209     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5210     uint32_t vm = vext_vm(desc);                                          \
5211     uint32_t vl = env->vl;                                                \
5212     uint32_t esz = sizeof(TS2);                                           \
5213     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5214     uint32_t vta = vext_vta(desc);                                        \
5215     uint32_t vma = vext_vma(desc);                                        \
5216     uint64_t index;                                                       \
5217     uint32_t i;                                                           \
5218                                                                           \
5219     for (i = env->vstart; i < vl; i++) {                                  \
5220         if (!vm && !vext_elem_mask(v0, i)) {                              \
5221             /* set masked-off elements to 1s */                           \
5222             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5223             continue;                                                     \
5224         }                                                                 \
5225         index = *((TS1 *)vs1 + HS1(i));                                   \
5226         if (index >= vlmax) {                                             \
5227             *((TS2 *)vd + HS2(i)) = 0;                                    \
5228         } else {                                                          \
5229             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5230         }                                                                 \
5231     }                                                                     \
5232     env->vstart = 0;                                                      \
5233     /* set tail elements to 1s */                                         \
5234     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5235 }
5236 
5237 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5238 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5239 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5240 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5241 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5242 
5243 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5244 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5245 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5246 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5247 
5248 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5249 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5250                   CPURISCVState *env, uint32_t desc)                      \
5251 {                                                                         \
5252     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5253     uint32_t vm = vext_vm(desc);                                          \
5254     uint32_t vl = env->vl;                                                \
5255     uint32_t esz = sizeof(ETYPE);                                         \
5256     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5257     uint32_t vta = vext_vta(desc);                                        \
5258     uint32_t vma = vext_vma(desc);                                        \
5259     uint64_t index = s1;                                                  \
5260     uint32_t i;                                                           \
5261                                                                           \
5262     for (i = env->vstart; i < vl; i++) {                                  \
5263         if (!vm && !vext_elem_mask(v0, i)) {                              \
5264             /* set masked-off elements to 1s */                           \
5265             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5266             continue;                                                     \
5267         }                                                                 \
5268         if (index >= vlmax) {                                             \
5269             *((ETYPE *)vd + H(i)) = 0;                                    \
5270         } else {                                                          \
5271             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5272         }                                                                 \
5273     }                                                                     \
5274     env->vstart = 0;                                                      \
5275     /* set tail elements to 1s */                                         \
5276     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5277 }
5278 
5279 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5280 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5281 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5282 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5283 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5284 
5285 /* Vector Compress Instruction */
5286 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5287 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5288                   CPURISCVState *env, uint32_t desc)                      \
5289 {                                                                         \
5290     uint32_t vl = env->vl;                                                \
5291     uint32_t esz = sizeof(ETYPE);                                         \
5292     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5293     uint32_t vta = vext_vta(desc);                                        \
5294     uint32_t num = 0, i;                                                  \
5295                                                                           \
5296     for (i = env->vstart; i < vl; i++) {                                  \
5297         if (!vext_elem_mask(vs1, i)) {                                    \
5298             continue;                                                     \
5299         }                                                                 \
5300         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5301         num++;                                                            \
5302     }                                                                     \
5303     env->vstart = 0;                                                      \
5304     /* set tail elements to 1s */                                         \
5305     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5306 }
5307 
5308 /* Compress into vd elements of vs2 where vs1 is enabled */
5309 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5310 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5311 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5312 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5313 
5314 /* Vector Whole Register Move */
5315 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5316 {
5317     /* EEW = SEW */
5318     uint32_t maxsz = simd_maxsz(desc);
5319     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5320     uint32_t startb = env->vstart * sewb;
5321     uint32_t i = startb;
5322 
5323     memcpy((uint8_t *)vd + H1(i),
5324            (uint8_t *)vs2 + H1(i),
5325            maxsz - startb);
5326 
5327     env->vstart = 0;
5328 }
5329 
5330 /* Vector Integer Extension */
5331 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5332 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5333                   CPURISCVState *env, uint32_t desc)             \
5334 {                                                                \
5335     uint32_t vl = env->vl;                                       \
5336     uint32_t vm = vext_vm(desc);                                 \
5337     uint32_t esz = sizeof(ETYPE);                                \
5338     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5339     uint32_t vta = vext_vta(desc);                               \
5340     uint32_t vma = vext_vma(desc);                               \
5341     uint32_t i;                                                  \
5342                                                                  \
5343     for (i = env->vstart; i < vl; i++) {                         \
5344         if (!vm && !vext_elem_mask(v0, i)) {                     \
5345             /* set masked-off elements to 1s */                  \
5346             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5347             continue;                                            \
5348         }                                                        \
5349         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5350     }                                                            \
5351     env->vstart = 0;                                             \
5352     /* set tail elements to 1s */                                \
5353     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5354 }
5355 
5356 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5357 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5358 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5359 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5360 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5361 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5362 
5363 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5364 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5365 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5366 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5367 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5368 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5369