xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 6e11d7ea)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
77 
/*
 * Vector register data is stored as host-endian 64-bit chunks.  An index
 * that addresses a unit narrower than 64 bits is therefore used as is on
 * little-endian hosts and must be flipped within its chunk on big-endian
 * hosts.
 *
 * H1, H2, H4 and H8 fix up the index of a 1-, 2-, 4- and 8-byte element.
 * NOTE(review): H1_2 and H1_4 presumably fix up the byte offset of a 2- and
 * 4-byte unit; they are not used in this part of the file - confirm.
 */
#if !HOST_BIG_ENDIAN
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#define H8(x)   (x)
#else
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#define H8(x)   ((x))
#endif
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as following:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
/*
 * Return VLMAX, the maximum number of elements an instruction can operate
 * on, for the LMUL encoded in desc.
 *
 * log2_esz: log2 of the element size in bytes.
 */
static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
{
    /*
     * simd_desc supports at most 2048 bytes, so the maximum vlen is
     * 1024 bits; vlen in bytes (vlenb) is what maxsz encodes.
     */
    uint32_t vlenb = simd_maxsz(desc);
    /* VLMAX = vlenb * 2^lmul / 2^log2_esz = vlenb * 2^scale */
    int scale = vext_lmul(desc) - log2_esz;

    if (scale < 0) {
        return vlenb >> -scale;
    }
    return vlenb << scale;
}
157 
158 /*
159  * Get number of total elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks watchpoint before real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in TLB
185  * and page table walk can't fill the TLB entry. Then the guest
186  * software can return here after process the exception or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
204 
/*
 * Set agnostic elements to 1s.
 *
 * base:        start of the destination vector register (group)
 * is_agnostic: 0 keeps the bytes undisturbed, non-zero fills them with 1s
 * cnt, tot:    logical byte range [cnt, tot) to fill
 *
 * An empty or inverted range (cnt >= tot) is a no-op; this also keeps
 * "tot - cnt" from wrapping around into a huge memset length.
 *
 * Vector data is stored as host-endian 64-bit chunks, so on big-endian
 * hosts a logical byte offset must go through H1().  The bytes of a partial
 * leading or trailing chunk are therefore set one at a time there; whole
 * chunks are symmetric and can be filled with a plain memset on any host.
 */
static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
                              uint32_t tot)
{
    uint8_t *bytes = base;

    if (is_agnostic == 0) {
        /* policy undisturbed */
        return;
    }
    if (cnt >= tot) {
        return;
    }

#if HOST_BIG_ENDIAN
    /* partial chunk at the start of the range */
    while (cnt < tot && (cnt % 8) != 0) {
        bytes[H1(cnt)] = 0xff;
        cnt++;
    }
    /* partial chunk at the end of the range */
    while (tot > cnt && (tot % 8) != 0) {
        tot--;
        bytes[H1(tot)] = 0xff;
    }
#endif

    /* on big-endian hosts only whole 64-bit chunks are left at this point */
    memset(bytes + cnt, 0xff, tot - cnt);
}
218 
/*
 * Write the mask bit of element 'index' in mask register v0.  The register
 * is accessed as an array of 64-bit words holding one bit per element; only
 * bit 0 of 'value' is deposited.
 */
static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    uint64_t *word = (uint64_t *)v0 + index / 64;
    int bit = index % 64;

    *word = deposit64(*word, bit, 1, value);
}
227 
/*
 * Return the mask bit (0 or 1) of element 'index' in mask register v0,
 * which is read as an array of 64-bit words holding one bit per element.
 *
 * Earlier designs (pre-0.9) had a varying number of bits per mask value
 * (MLEN).  In the 0.9 design, MLEN=1 (Section 4.5).
 */
static inline int vext_elem_mask(void *v0, int index)
{
    const uint64_t *words = v0;
    uint64_t word = words[index / 64];

    return (word >> (index % 64)) & 1;
}
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }                                                          \
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector element from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation*/
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load bytes from guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
388 
389 /*
390  * masked unit-stride load and store operation will be a special case of stride,
391  * stride = NF * sizeof (MTYPE)
392  */
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
456 
457 /*
458  *** index: access vector element from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load bytes from guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
/*
 *** unit-stride fault-only-first load instructions
 */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access*/
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <=  offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
/*
 * Binary element operations.  Every argument is parenthesized so that an
 * expression argument keeps its own precedence inside the expansion.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max; note that the selected argument is evaluated twice */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/*
 * Unsigned min/max.  UMTYPE is not defined here and must name the unsigned
 * element type at the point of expansion; the cast covers the whole
 * argument rather than just its first operand.
 */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)(N), (UMTYPE)(M))
#define DO_MINU(N, M) DO_MIN((UMTYPE)(N), (UMTYPE)(M))
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store rest of elements of current segment pointed by vstart */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     for (; k < nf; k++) {
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
/*
 * Invoke MACRO with the remaining arguments.  The extra level of expansion
 * lets an argument that is itself a macro (such as the OP_* type tuples)
 * expand into several arguments before MACRO is applied.
 */
#define RVVCALL(MACRO, ...)  MACRO(__VA_ARGS__)
770 
/*
 * Type tuples (TD, T1, T2, TX1, TX2) for the element operation generators:
 * TD is the destination element type, T1/T2 are the types the two source
 * elements are read as, and TX1/TX2 are the types they are converted to
 * before the operation.
 *
 * OP_*  - all five types have the same width.
 * WOP_* - the destination and the converted operands are twice as wide as
 *         the sources.
 * NOP_* - the second source is twice as wide as the destination and the
 *         first source.
 * The letters give the signedness (S or U) of TD, T1 and T2; the trailing
 * _B/_H/_W/_D gives the width of the narrowest type (8/16/32/64 bits).
 */
#define OP_SSS_B  int8_t,   int8_t,   int8_t,   int8_t,   int8_t
#define OP_SSS_H  int16_t,  int16_t,  int16_t,  int16_t,  int16_t
#define OP_SSS_W  int32_t,  int32_t,  int32_t,  int32_t,  int32_t
#define OP_SSS_D  int64_t,  int64_t,  int64_t,  int64_t,  int64_t
#define OP_UUU_B  uint8_t,  uint8_t,  uint8_t,  uint8_t,  uint8_t
#define OP_UUU_H  uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W  uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D  uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B  int8_t,   uint8_t,  int8_t,   uint8_t,  int8_t
#define OP_SUS_H  int16_t,  uint16_t, int16_t,  uint16_t, int16_t
#define OP_SUS_W  int32_t,  uint32_t, int32_t,  uint32_t, int32_t
#define OP_SUS_D  int64_t,  uint64_t, int64_t,  uint64_t, int64_t
#define WOP_UUU_B uint16_t, uint8_t,  uint8_t,  uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t,  int8_t,   int8_t,   int16_t,  int16_t
#define WOP_SSS_H int32_t,  int16_t,  int16_t,  int32_t,  int32_t
#define WOP_SSS_W int64_t,  int32_t,  int32_t,  int64_t,  int64_t
#define WOP_SUS_B int16_t,  uint8_t,  int8_t,   uint16_t, int16_t
#define WOP_SUS_H int32_t,  uint16_t, int16_t,  uint32_t, int32_t
#define WOP_SUS_W int64_t,  uint32_t, int32_t,  uint64_t, int64_t
#define WOP_SSU_B int16_t,  int8_t,   uint8_t,  int16_t,  uint16_t
#define WOP_SSU_H int32_t,  int16_t,  uint16_t, int32_t,  uint32_t
#define WOP_SSU_W int64_t,  int32_t,  uint32_t, int64_t,  uint64_t
#define NOP_SSS_B int8_t,   int8_t,   int16_t,  int8_t,   int16_t
#define NOP_SSS_H int16_t,  int16_t,  int32_t,  int16_t,  int32_t
#define NOP_SSS_W int32_t,  int32_t,  int64_t,  int32_t,  int64_t
#define NOP_UUU_B uint8_t,  uint8_t,  uint16_t, uint8_t,  uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
/* Signature of a per-element operation on two vector sources. */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

/*
 * Define do_<NAME>(), which processes element i of a vector-vector
 * operation:
 *   - load a T1 element from vs1 at index HS1(i) and convert it to TX1,
 *   - load a T2 element from vs2 at index HS2(i) and convert it to TX2,
 *   - store OP(vs2 element, vs1 element) as TD to vd at index HD(i).
 */
#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
{                                                               \
    TX1 rhs = ((T1 *)vs1)[HS1(i)];                              \
    TX2 lhs = ((T2 *)vs2)[HS2(i)];                              \
                                                                \
    ((TD *)vd)[HD(i)] = OP(lhs, rhs);                           \
}
/*
 * Subtraction primitives for the element-operation generators.
 * DO_SUB yields N - M, DO_RSUB the reversed M - N.  Arguments are
 * parenthesized so that compound expressions expand correctly.
 */
#define DO_SUB(N, M) ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
815 
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operator type.
872  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t vma = vext_vma(desc);
903     uint32_t i;
904 
905     for (i = env->vstart; i < vl; i++) {
906         if (!vm && !vext_elem_mask(v0, i)) {
907             /* set masked-off elements to 1s */
908             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
909             continue;
910         }
911         fn(vd, s1, vs2, i);
912     }
913     env->vstart = 0;
914     /* set tail elements to 1s */
915     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
916 }
917 
918 /* generate the helpers for OPIVX */
919 #define GEN_VEXT_VX(NAME, ESZ)                            \
920 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
921                   void *vs2, CPURISCVState *env,          \
922                   uint32_t desc)                          \
923 {                                                         \
924     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
925                do_##NAME, ESZ);                           \
926 }
927 
928 GEN_VEXT_VX(vadd_vx_b, 1)
929 GEN_VEXT_VX(vadd_vx_h, 2)
930 GEN_VEXT_VX(vadd_vx_w, 4)
931 GEN_VEXT_VX(vadd_vx_d, 8)
932 GEN_VEXT_VX(vsub_vx_b, 1)
933 GEN_VEXT_VX(vsub_vx_h, 2)
934 GEN_VEXT_VX(vsub_vx_w, 4)
935 GEN_VEXT_VX(vsub_vx_d, 8)
936 GEN_VEXT_VX(vrsub_vx_b, 1)
937 GEN_VEXT_VX(vrsub_vx_h, 2)
938 GEN_VEXT_VX(vrsub_vx_w, 4)
939 GEN_VEXT_VX(vrsub_vx_d, 8)
940 
941 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
942 {
943     intptr_t oprsz = simd_oprsz(desc);
944     intptr_t i;
945 
946     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
947         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
948     }
949 }
950 
951 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
957         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
967         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
977         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
978     }
979 }
980 
/* Vector Widening Integer Add/Subtract */

/*
 * The WOP_UUU_x and WOP_SSS_x tuples used below are already provided by
 * the shared (TD, T1, T2, TX1, TX2) table earlier in this file, so they
 * are not repeated here.
 *
 * WOP_Wxxx_x tuples describe the ".w" forms: T2 (the vs2 element type)
 * already has the destination width, only T1 is narrow.
 */
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
994 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1006 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1018 GEN_VEXT_VV(vwaddu_vv_b, 2)
1019 GEN_VEXT_VV(vwaddu_vv_h, 4)
1020 GEN_VEXT_VV(vwaddu_vv_w, 8)
1021 GEN_VEXT_VV(vwsubu_vv_b, 2)
1022 GEN_VEXT_VV(vwsubu_vv_h, 4)
1023 GEN_VEXT_VV(vwsubu_vv_w, 8)
1024 GEN_VEXT_VV(vwadd_vv_b, 2)
1025 GEN_VEXT_VV(vwadd_vv_h, 4)
1026 GEN_VEXT_VV(vwadd_vv_w, 8)
1027 GEN_VEXT_VV(vwsub_vv_b, 2)
1028 GEN_VEXT_VV(vwsub_vv_h, 4)
1029 GEN_VEXT_VV(vwsub_vv_w, 8)
1030 GEN_VEXT_VV(vwaddu_wv_b, 2)
1031 GEN_VEXT_VV(vwaddu_wv_h, 4)
1032 GEN_VEXT_VV(vwaddu_wv_w, 8)
1033 GEN_VEXT_VV(vwsubu_wv_b, 2)
1034 GEN_VEXT_VV(vwsubu_wv_h, 4)
1035 GEN_VEXT_VV(vwsubu_wv_w, 8)
1036 GEN_VEXT_VV(vwadd_wv_b, 2)
1037 GEN_VEXT_VV(vwadd_wv_h, 4)
1038 GEN_VEXT_VV(vwadd_wv_w, 8)
1039 GEN_VEXT_VV(vwsub_wv_b, 2)
1040 GEN_VEXT_VV(vwsub_wv_h, 4)
1041 GEN_VEXT_VV(vwsub_wv_w, 8)
1042 
1043 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1055 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1067 GEN_VEXT_VX(vwaddu_vx_b, 2)
1068 GEN_VEXT_VX(vwaddu_vx_h, 4)
1069 GEN_VEXT_VX(vwaddu_vx_w, 8)
1070 GEN_VEXT_VX(vwsubu_vx_b, 2)
1071 GEN_VEXT_VX(vwsubu_vx_h, 4)
1072 GEN_VEXT_VX(vwsubu_vx_w, 8)
1073 GEN_VEXT_VX(vwadd_vx_b, 2)
1074 GEN_VEXT_VX(vwadd_vx_h, 4)
1075 GEN_VEXT_VX(vwadd_vx_w, 8)
1076 GEN_VEXT_VX(vwsub_vx_b, 2)
1077 GEN_VEXT_VX(vwsub_vx_h, 4)
1078 GEN_VEXT_VX(vwsub_vx_w, 8)
1079 GEN_VEXT_VX(vwaddu_wx_b, 2)
1080 GEN_VEXT_VX(vwaddu_wx_h, 4)
1081 GEN_VEXT_VX(vwaddu_wx_w, 8)
1082 GEN_VEXT_VX(vwsubu_wx_b, 2)
1083 GEN_VEXT_VX(vwsubu_wx_h, 4)
1084 GEN_VEXT_VX(vwsubu_wx_w, 8)
1085 GEN_VEXT_VX(vwadd_wx_b, 2)
1086 GEN_VEXT_VX(vwadd_wx_h, 4)
1087 GEN_VEXT_VX(vwadd_wx_w, 8)
1088 GEN_VEXT_VX(vwsub_wx_b, 2)
1089 GEN_VEXT_VX(vwsub_wx_h, 4)
1090 GEN_VEXT_VX(vwsub_wx_w, 8)
1091 
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */

/*
 * DO_VADC: N + M + carry-in C.  DO_VSBC: N - M - borrow-in C.
 * Arguments are parenthesized so that compound expressions expand
 * correctly.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1095 
1096 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1098                   CPURISCVState *env, uint32_t desc)          \
1099 {                                                             \
1100     uint32_t vl = env->vl;                                    \
1101     uint32_t esz = sizeof(ETYPE);                             \
1102     uint32_t total_elems =                                    \
1103         vext_get_total_elems(env, desc, esz);                 \
1104     uint32_t vta = vext_vta(desc);                            \
1105     uint32_t i;                                               \
1106                                                               \
1107     for (i = env->vstart; i < vl; i++) {                      \
1108         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1109         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1110         ETYPE carry = vext_elem_mask(v0, i);                  \
1111                                                               \
1112         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1113     }                                                         \
1114     env->vstart = 0;                                          \
1115     /* set tail elements to 1s */                             \
1116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1117 }
1118 
1119 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1120 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1123 
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1128 
1129 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1130 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1131                   CPURISCVState *env, uint32_t desc)                     \
1132 {                                                                        \
1133     uint32_t vl = env->vl;                                               \
1134     uint32_t esz = sizeof(ETYPE);                                        \
1135     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1136     uint32_t vta = vext_vta(desc);                                       \
1137     uint32_t i;                                                          \
1138                                                                          \
1139     for (i = env->vstart; i < vl; i++) {                                 \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1141         ETYPE carry = vext_elem_mask(v0, i);                             \
1142                                                                          \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1144     }                                                                    \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                                        \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1148 }
1149 
1150 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1159 
/*
 * Carry-out / borrow-out predicates.
 *
 * DO_MADC: non-zero when N + M + C overflows the type of N.  The sum is
 * truncated back to the type of N; with carry-in it overflowed iff the
 * truncated sum is <= N, without carry-in iff it is < N.
 * DO_MSBC: non-zero when N - M - C borrows, i.e. N <= M with borrow-in
 * and N < M without.
 *
 * Arguments are parenthesized so that compound expressions expand
 * correctly.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :   \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1163 
1164 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1165 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1166                   CPURISCVState *env, uint32_t desc)          \
1167 {                                                             \
1168     uint32_t vl = env->vl;                                    \
1169     uint32_t vm = vext_vm(desc);                              \
1170     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1171     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1172     uint32_t i;                                               \
1173                                                               \
1174     for (i = env->vstart; i < vl; i++) {                      \
1175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1178         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1179     }                                                         \
1180     env->vstart = 0;                                          \
1181     /* mask destination register are always tail-agnostic */  \
1182     /* set tail elements to 1s */                             \
1183     if (vta_all_1s) {                                         \
1184         for (; i < total_elems; i++) {                        \
1185             vext_set_elem_mask(vd, i, 1);                     \
1186         }                                                     \
1187     }                                                         \
1188 }
1189 
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1191 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1192 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1193 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1194 
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1196 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1197 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1198 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1199 
1200 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1202                   void *vs2, CPURISCVState *env, uint32_t desc) \
1203 {                                                               \
1204     uint32_t vl = env->vl;                                      \
1205     uint32_t vm = vext_vm(desc);                                \
1206     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1207     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1208     uint32_t i;                                                 \
1209                                                                 \
1210     for (i = env->vstart; i < vl; i++) {                        \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1213         vext_set_elem_mask(vd, i,                               \
1214                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1215     }                                                           \
1216     env->vstart = 0;                                            \
1217     /* mask destination register are always tail-agnostic */    \
1218     /* set tail elements to 1s */                               \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
1236 /* Vector Bitwise Logical Instructions */
1237 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249 GEN_VEXT_VV(vand_vv_b, 1)
1250 GEN_VEXT_VV(vand_vv_h, 2)
1251 GEN_VEXT_VV(vand_vv_w, 4)
1252 GEN_VEXT_VV(vand_vv_d, 8)
1253 GEN_VEXT_VV(vor_vv_b, 1)
1254 GEN_VEXT_VV(vor_vv_h, 2)
1255 GEN_VEXT_VV(vor_vv_w, 4)
1256 GEN_VEXT_VV(vor_vv_d, 8)
1257 GEN_VEXT_VV(vxor_vv_b, 1)
1258 GEN_VEXT_VV(vxor_vv_h, 2)
1259 GEN_VEXT_VV(vxor_vv_w, 4)
1260 GEN_VEXT_VV(vxor_vv_d, 8)
1261 
1262 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274 GEN_VEXT_VX(vand_vx_b, 1)
1275 GEN_VEXT_VX(vand_vx_h, 2)
1276 GEN_VEXT_VX(vand_vx_w, 4)
1277 GEN_VEXT_VX(vand_vx_d, 8)
1278 GEN_VEXT_VX(vor_vx_b, 1)
1279 GEN_VEXT_VX(vor_vx_h, 2)
1280 GEN_VEXT_VX(vor_vx_w, 4)
1281 GEN_VEXT_VX(vor_vx_d, 8)
1282 GEN_VEXT_VX(vxor_vx_b, 1)
1283 GEN_VEXT_VX(vxor_vx_h, 2)
1284 GEN_VEXT_VX(vxor_vx_w, 4)
1285 GEN_VEXT_VX(vxor_vx_d, 8)
1286 
/* Vector Single-Width Bit Shift Instructions */

/*
 * Shift N left / right by M.  DO_SRL is a plain ">>", so whether the
 * shift is logical or arithmetic follows from the signedness of N.
 * Both arguments are parenthesized so that compound expressions expand
 * correctly.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1290 
1291 /* generate the helpers for shift instructions with two vector operators */
1292 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1293 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1295 {                                                                         \
1296     uint32_t vm = vext_vm(desc);                                          \
1297     uint32_t vl = env->vl;                                                \
1298     uint32_t esz = sizeof(TS1);                                           \
1299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300     uint32_t vta = vext_vta(desc);                                        \
1301     uint32_t vma = vext_vma(desc);                                        \
1302     uint32_t i;                                                           \
1303                                                                           \
1304     for (i = env->vstart; i < vl; i++) {                                  \
1305         if (!vm && !vext_elem_mask(v0, i)) {                              \
1306             /* set masked-off elements to 1s */                           \
1307             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1308             continue;                                                     \
1309         }                                                                 \
1310         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1311         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1312         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1313     }                                                                     \
1314     env->vstart = 0;                                                      \
1315     /* set tail elements to 1s */                                         \
1316     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1317 }
1318 
1319 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1320 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1321 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1322 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1323 
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1325 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1326 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1327 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1328 
1329 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1330 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1331 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1332 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1333 
1334 /* generate the helpers for shift instructions with one vector and one scalar */
1335 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1336 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1337         void *vs2, CPURISCVState *env, uint32_t desc)       \
1338 {                                                           \
1339     uint32_t vm = vext_vm(desc);                            \
1340     uint32_t vl = env->vl;                                  \
1341     uint32_t esz = sizeof(TD);                              \
1342     uint32_t total_elems =                                  \
1343         vext_get_total_elems(env, desc, esz);               \
1344     uint32_t vta = vext_vta(desc);                          \
1345     uint32_t vma = vext_vma(desc);                          \
1346     uint32_t i;                                             \
1347                                                             \
1348     for (i = env->vstart; i < vl; i++) {                    \
1349         if (!vm && !vext_elem_mask(v0, i)) {                \
1350             /* set masked-off elements to 1s */             \
1351             vext_set_elems_1s(vd, vma, i * esz,             \
1352                               (i + 1) * esz);               \
1353             continue;                                       \
1354         }                                                   \
1355         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1356         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1357     }                                                       \
1358     env->vstart = 0;                                        \
1359     /* set tail elements to 1s */                           \
1360     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1361 }
1362 
1363 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1367 
1368 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1369 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1372 
1373 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1374 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1375 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1376 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1377 
1378 /* Vector Narrowing Integer Right Shift Instructions */
1379 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1380 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1381 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1382 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1383 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1384 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1385 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1386 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1387 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1388 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1389 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1390 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1391 
/* Vector Integer Comparison Instructions */

/*
 * Element comparison predicates; each yields 1 or 0.  Arguments are
 * parenthesized so that compound expressions expand correctly.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1398 
/*
 * GEN_VEXT_CMP_VV - generate a vector-vector integer compare helper.
 *
 * For each element index in [env->vstart, env->vl) the generated helper
 * loads vs1[i] and vs2[i] as ETYPE and writes DO_OP(vs2[i], vs1[i]) to
 * mask bit i of vd.  If vext_vm(desc) is 0 and the v0 mask bit for the
 * element is clear, the element is left alone unless vext_vma(desc) is
 * set, in which case its mask bit is written as 1.  Afterwards vstart is
 * cleared and, when vext_vta_all_1s(desc) is set, every remaining mask
 * bit up to cfg.vlen is written as 1.
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t unmasked = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                          \
    uint32_t mask_bits = env_archcpu(env)->cfg.vlen;                \
    uint32_t fill_tail = vext_vta_all_1s(desc);                     \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t idx;                                                   \
                                                                    \
    for (idx = env->vstart; idx < vl; idx++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(idx));                        \
        ETYPE s2 = *((ETYPE *)vs2 + H(idx));                        \
                                                                    \
        if (unmasked || vext_elem_mask(v0, idx)) {                  \
            /* active element: record the comparison result */      \
            vext_set_elem_mask(vd, idx, DO_OP(s2, s1));             \
        } else if (vma) {                                           \
            /* masked-off element: set to 1 */                      \
            vext_set_elem_mask(vd, idx, 1);                         \
        }                                                           \
    }                                                               \
    env->vstart = 0;                                                \
                                                                    \
    /* mask destination registers are always tail-agnostic */       \
    if (!fill_tail) {                                               \
        return;                                                     \
    }                                                               \
    /* set tail elements to 1s */                                   \
    while (idx < mask_bits) {                                       \
        vext_set_elem_mask(vd, idx, 1);                             \
        idx++;                                                      \
    }                                                               \
}
1431 
1432 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1433 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1434 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1435 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1436 
1437 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1438 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1439 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1440 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1441 
1442 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1443 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1444 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1445 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1446 
1447 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1448 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1449 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1450 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1451 
1452 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1453 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1454 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1455 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1456 
1457 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1458 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1459 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1460 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1461 
/*
 * GEN_VEXT_CMP_VX - generate a vector-scalar integer compare helper.
 *
 * The scalar s1 is converted once to ETYPE (through target_long).  For
 * each element index in [env->vstart, env->vl) the generated helper
 * writes DO_OP(vs2[i], scalar) to mask bit i of vd.  If vext_vm(desc) is
 * 0 and the v0 mask bit for the element is clear, the element is left
 * alone unless vext_vma(desc) is set, in which case its mask bit is
 * written as 1.  Afterwards vstart is cleared and, when
 * vext_vta_all_1s(desc) is set, every remaining mask bit up to cfg.vlen
 * is written as 1.
 */
#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t unmasked = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                          \
    uint32_t mask_bits = env_archcpu(env)->cfg.vlen;                \
    uint32_t fill_tail = vext_vta_all_1s(desc);                     \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t idx;                                                   \
    ETYPE rhs = (ETYPE)(target_long)s1;                             \
                                                                    \
    for (idx = env->vstart; idx < vl; idx++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(idx));                        \
                                                                    \
        if (unmasked || vext_elem_mask(v0, idx)) {                  \
            /* active element: record the comparison result */      \
            vext_set_elem_mask(vd, idx, DO_OP(s2, rhs));            \
        } else if (vma) {                                           \
            /* masked-off element: set to 1 */                      \
            vext_set_elem_mask(vd, idx, 1);                         \
        }                                                           \
    }                                                               \
    env->vstart = 0;                                                \
                                                                    \
    /* mask destination registers are always tail-agnostic */       \
    if (!fill_tail) {                                               \
        return;                                                     \
    }                                                               \
    /* set tail elements to 1s */                                   \
    while (idx < mask_bits) {                                       \
        vext_set_elem_mask(vd, idx, 1);                             \
        idx++;                                                      \
    }                                                               \
}
1494 
1495 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1496 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1497 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1498 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1499 
1500 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1501 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1502 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1503 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1504 
1505 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1506 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1507 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1508 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1509 
1510 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1511 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1512 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1513 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1514 
1515 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1516 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1517 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1518 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1519 
1520 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1521 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1522 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1523 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1524 
1525 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1526 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1527 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1528 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1529 
1530 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1531 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1532 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1533 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1534 
1535 /* Vector Integer Min/Max Instructions */
1536 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1537 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1538 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1539 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1540 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1541 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1542 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1543 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1544 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1545 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1546 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1547 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1548 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1549 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1550 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1551 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1552 GEN_VEXT_VV(vminu_vv_b, 1)
1553 GEN_VEXT_VV(vminu_vv_h, 2)
1554 GEN_VEXT_VV(vminu_vv_w, 4)
1555 GEN_VEXT_VV(vminu_vv_d, 8)
1556 GEN_VEXT_VV(vmin_vv_b, 1)
1557 GEN_VEXT_VV(vmin_vv_h, 2)
1558 GEN_VEXT_VV(vmin_vv_w, 4)
1559 GEN_VEXT_VV(vmin_vv_d, 8)
1560 GEN_VEXT_VV(vmaxu_vv_b, 1)
1561 GEN_VEXT_VV(vmaxu_vv_h, 2)
1562 GEN_VEXT_VV(vmaxu_vv_w, 4)
1563 GEN_VEXT_VV(vmaxu_vv_d, 8)
1564 GEN_VEXT_VV(vmax_vv_b, 1)
1565 GEN_VEXT_VV(vmax_vv_h, 2)
1566 GEN_VEXT_VV(vmax_vv_w, 4)
1567 GEN_VEXT_VV(vmax_vv_d, 8)
1568 
1569 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1570 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1571 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1572 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1573 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1574 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1575 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1576 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1577 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1578 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1579 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1580 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1581 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1582 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1583 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1584 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1585 GEN_VEXT_VX(vminu_vx_b, 1)
1586 GEN_VEXT_VX(vminu_vx_h, 2)
1587 GEN_VEXT_VX(vminu_vx_w, 4)
1588 GEN_VEXT_VX(vminu_vx_d, 8)
1589 GEN_VEXT_VX(vmin_vx_b, 1)
1590 GEN_VEXT_VX(vmin_vx_h, 2)
1591 GEN_VEXT_VX(vmin_vx_w, 4)
1592 GEN_VEXT_VX(vmin_vx_d, 8)
1593 GEN_VEXT_VX(vmaxu_vx_b, 1)
1594 GEN_VEXT_VX(vmaxu_vx_h, 2)
1595 GEN_VEXT_VX(vmaxu_vx_w, 4)
1596 GEN_VEXT_VX(vmaxu_vx_d, 8)
1597 GEN_VEXT_VX(vmax_vx_b, 1)
1598 GEN_VEXT_VX(vmax_vx_h, 2)
1599 GEN_VEXT_VX(vmax_vx_w, 4)
1600 GEN_VEXT_VX(vmax_vx_d, 8)
1601 
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Product of N and M.  Both arguments are parenthesized so that compound
 * expressions are multiplied as a whole.
 */
#define DO_MUL(N, M) ((N) * (M))
1604 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1605 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1606 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1607 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1608 GEN_VEXT_VV(vmul_vv_b, 1)
1609 GEN_VEXT_VV(vmul_vv_h, 2)
1610 GEN_VEXT_VV(vmul_vv_w, 4)
1611 GEN_VEXT_VV(vmul_vv_d, 8)
1612 
/* Upper 8 bits of the 16-bit signed product s2 * s1. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1617 
/* Upper 16 bits of the 32-bit signed product s2 * s1. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1622 
/* Upper 32 bits of the 64-bit signed product s2 * s1. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1627 
/*
 * High 64 bits of the signed product s2 * s1.  muls64() returns the
 * product as a low/high pair; only the high word is used.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return (int64_t)high;
}
1635 
/* Upper 8 bits of the 16-bit unsigned product s2 * s1. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    uint16_t product = (uint16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1640 
/* Upper 16 bits of the 32-bit unsigned product s2 * s1. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1645 
/* Upper 32 bits of the 64-bit unsigned product s2 * s1. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return (uint32_t)(product >> 32);
}
1650 
/*
 * High 64 bits of the unsigned product s2 * s1.  mulu64() returns the
 * product as a low/high pair; only the high word is used.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1658 
/* Upper 8 bits of the 16-bit product of signed s2 and unsigned s1. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* both operands promote to int; the product always fits in 16 bits */
    int16_t product = (int16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1663 
/* Upper 16 bits of the 32-bit product of signed s2 and unsigned s1. */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    /*
     * The multiply is carried out modulo 2 ** 32 in unsigned arithmetic;
     * the low 32 bits match those of the signed product.
     */
    uint32_t product = (uint32_t)(int32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1668 
/* Upper 32 bits of the 64-bit product of signed s2 and unsigned s1. */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    /*
     * The multiply is carried out modulo 2 ** 64 in unsigned arithmetic;
     * the low 64 bits match those of the signed product.
     */
    uint64_t product = (uint64_t)(int64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1673 
/*
 * High 64 bits of the product of a signed (s2) and an unsigned (s1)
 * 64-bit operand.
 *
 * Let  A = signed operand,
 *      B = unsigned operand,
 *      P = mulu64(A, B), the unsigned product,
 *      SP = A * B, the wanted signed product.
 *
 * When A < 0, its bit pattern read as unsigned is A + 2 ** 64, so
 *      P  = (A + 2 ** 64) * B
 *         = A * B + 2 ** 64 * B
 *      SP = P - 2 ** 64 * B
 * otherwise SP = P.
 *
 * Subtracting 2 ** 64 * B only affects the high word, hence
 *      HI_P -= (A < 0 ? B : 0)
 */

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    if (s2 < 0) {
        high -= s1;
    }
    return high;
}
1702 
1703 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1704 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1705 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1706 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1707 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1708 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1709 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1710 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1711 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1712 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1713 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1714 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1715 GEN_VEXT_VV(vmulh_vv_b, 1)
1716 GEN_VEXT_VV(vmulh_vv_h, 2)
1717 GEN_VEXT_VV(vmulh_vv_w, 4)
1718 GEN_VEXT_VV(vmulh_vv_d, 8)
1719 GEN_VEXT_VV(vmulhu_vv_b, 1)
1720 GEN_VEXT_VV(vmulhu_vv_h, 2)
1721 GEN_VEXT_VV(vmulhu_vv_w, 4)
1722 GEN_VEXT_VV(vmulhu_vv_d, 8)
1723 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1724 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1725 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1726 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1727 
1728 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1729 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1730 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1731 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1732 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1733 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1734 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1735 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1736 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1737 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1738 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1739 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1740 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1741 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1742 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1743 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1744 GEN_VEXT_VX(vmul_vx_b, 1)
1745 GEN_VEXT_VX(vmul_vx_h, 2)
1746 GEN_VEXT_VX(vmul_vx_w, 4)
1747 GEN_VEXT_VX(vmul_vx_d, 8)
1748 GEN_VEXT_VX(vmulh_vx_b, 1)
1749 GEN_VEXT_VX(vmulh_vx_h, 2)
1750 GEN_VEXT_VX(vmulh_vx_w, 4)
1751 GEN_VEXT_VX(vmulh_vx_d, 8)
1752 GEN_VEXT_VX(vmulhu_vx_b, 1)
1753 GEN_VEXT_VX(vmulhu_vx_h, 2)
1754 GEN_VEXT_VX(vmulhu_vx_w, 4)
1755 GEN_VEXT_VX(vmulhu_vx_d, 8)
1756 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1757 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1758 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1759 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1760 
/* Vector Integer Divide Instructions */
/*
 * Division and remainder with the special cases resolved explicitly, so
 * that C's "/" and "%" are never evaluated for them:
 *   - M == 0: the quotient is (type of N)-1 and the remainder is N;
 *   - signed forms, N == -N and M == -1: the quotient is N and the
 *     remainder is 0.
 *
 * NOTE(review): "(N) == -(N)" is how the most-negative dividend is
 * detected.  It is also true for N == 0, which is harmless because the
 * selected result equals the ordinary one.  It depends on signed negation
 * wrapping; confirm the build guarantees that (e.g. -fwrapv).
 *
 * Arguments are parenthesized so that compound expressions are treated
 * as a whole.  They are evaluated more than once, so operands must be
 * free of side effects.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)                                                     \
    (unlikely((M) == 0) ? (__typeof(N))(-1) :                            \
     unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? (N) :       \
     (N) / (M))
#define DO_REM(N, M)                                                     \
    (unlikely((M) == 0) ? (N) :                                          \
     unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? 0 :         \
     (N) % (M))
1768 
1769 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1770 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1771 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1772 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1773 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1774 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1775 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1776 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1777 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1778 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1779 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1780 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1781 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1782 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1783 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1784 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1785 GEN_VEXT_VV(vdivu_vv_b, 1)
1786 GEN_VEXT_VV(vdivu_vv_h, 2)
1787 GEN_VEXT_VV(vdivu_vv_w, 4)
1788 GEN_VEXT_VV(vdivu_vv_d, 8)
1789 GEN_VEXT_VV(vdiv_vv_b, 1)
1790 GEN_VEXT_VV(vdiv_vv_h, 2)
1791 GEN_VEXT_VV(vdiv_vv_w, 4)
1792 GEN_VEXT_VV(vdiv_vv_d, 8)
1793 GEN_VEXT_VV(vremu_vv_b, 1)
1794 GEN_VEXT_VV(vremu_vv_h, 2)
1795 GEN_VEXT_VV(vremu_vv_w, 4)
1796 GEN_VEXT_VV(vremu_vv_d, 8)
1797 GEN_VEXT_VV(vrem_vv_b, 1)
1798 GEN_VEXT_VV(vrem_vv_h, 2)
1799 GEN_VEXT_VV(vrem_vv_w, 4)
1800 GEN_VEXT_VV(vrem_vv_d, 8)
1801 
1802 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1803 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1804 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1805 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1806 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1807 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1808 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1809 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1810 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1811 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1812 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1813 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1814 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1815 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1816 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1817 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1818 GEN_VEXT_VX(vdivu_vx_b, 1)
1819 GEN_VEXT_VX(vdivu_vx_h, 2)
1820 GEN_VEXT_VX(vdivu_vx_w, 4)
1821 GEN_VEXT_VX(vdivu_vx_d, 8)
1822 GEN_VEXT_VX(vdiv_vx_b, 1)
1823 GEN_VEXT_VX(vdiv_vx_h, 2)
1824 GEN_VEXT_VX(vdiv_vx_w, 4)
1825 GEN_VEXT_VX(vdiv_vx_d, 8)
1826 GEN_VEXT_VX(vremu_vx_b, 1)
1827 GEN_VEXT_VX(vremu_vx_h, 2)
1828 GEN_VEXT_VX(vremu_vx_w, 4)
1829 GEN_VEXT_VX(vremu_vx_d, 8)
1830 GEN_VEXT_VX(vrem_vx_b, 1)
1831 GEN_VEXT_VX(vrem_vx_h, 2)
1832 GEN_VEXT_VX(vrem_vx_w, 4)
1833 GEN_VEXT_VX(vrem_vx_d, 8)
1834 
1835 /* Vector Widening Integer Multiply Instructions */
1836 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1837 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1838 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1839 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1840 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1841 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1842 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1843 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1844 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1845 GEN_VEXT_VV(vwmul_vv_b, 2)
1846 GEN_VEXT_VV(vwmul_vv_h, 4)
1847 GEN_VEXT_VV(vwmul_vv_w, 8)
1848 GEN_VEXT_VV(vwmulu_vv_b, 2)
1849 GEN_VEXT_VV(vwmulu_vv_h, 4)
1850 GEN_VEXT_VV(vwmulu_vv_w, 8)
1851 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1852 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1853 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1854 
1855 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1856 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1857 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1858 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1859 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1860 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1861 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1862 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1863 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1864 GEN_VEXT_VX(vwmul_vx_b, 2)
1865 GEN_VEXT_VX(vwmul_vx_h, 4)
1866 GEN_VEXT_VX(vwmul_vx_w, 8)
1867 GEN_VEXT_VX(vwmulu_vx_b, 2)
1868 GEN_VEXT_VX(vwmulu_vx_h, 4)
1869 GEN_VEXT_VX(vwmulu_vx_w, 8)
1870 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1871 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1872 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1873 
/* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-vector operation.  Element i of vs1 (read as T1) and of vs2
 * (read as T2) are converted to TX1/TX2, the current destination element
 * is read as TD, and OP(s2, s1, d) is stored back to the destination.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dst = (TD *)vd + HD(i);                                    \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD acc = *dst;                                                 \
                                                                   \
    *dst = OP(s2, s1, acc);                                        \
}
1883 
/*
 * Multiply-add forms used as OP by OPIVV3/OPIVX3, which invoke
 * OP(s2, s1, d): N is the vs2 element, M the vs1 element or scalar and D
 * the current destination element.
 *   DO_MACC:    M * N + D
 *   DO_NMSAC: -(M * N) + D
 *   DO_MADD:    M * D + N
 *   DO_NMSUB: -(M * D) + N
 * All arguments are parenthesized so that compound expressions are
 * treated as a whole.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
1888 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1889 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1890 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1891 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1892 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1897 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1898 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1899 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1900 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VV(vmacc_vv_b, 1)
1905 GEN_VEXT_VV(vmacc_vv_h, 2)
1906 GEN_VEXT_VV(vmacc_vv_w, 4)
1907 GEN_VEXT_VV(vmacc_vv_d, 8)
1908 GEN_VEXT_VV(vnmsac_vv_b, 1)
1909 GEN_VEXT_VV(vnmsac_vv_h, 2)
1910 GEN_VEXT_VV(vnmsac_vv_w, 4)
1911 GEN_VEXT_VV(vnmsac_vv_d, 8)
1912 GEN_VEXT_VV(vmadd_vv_b, 1)
1913 GEN_VEXT_VV(vmadd_vv_h, 2)
1914 GEN_VEXT_VV(vmadd_vv_w, 4)
1915 GEN_VEXT_VV(vmadd_vv_d, 8)
1916 GEN_VEXT_VV(vnmsub_vv_b, 1)
1917 GEN_VEXT_VV(vnmsub_vv_h, 2)
1918 GEN_VEXT_VV(vnmsub_vv_w, 4)
1919 GEN_VEXT_VV(vnmsub_vv_d, 8)
1920 
/*
 * OPIVX3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar operation.  The scalar s1 is narrowed to T1 and then
 * converted to TX1; element i of vs2 is read as T2 into TX2; the current
 * destination element is read as TD, and OP(s2, scalar, d) is stored
 * back to the destination.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dst = (TD *)vd + HD(i);                                     \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    TD acc = *dst;                                                  \
                                                                    \
    *dst = OP(s2, scalar, acc);                                     \
}
1928 
1929 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1930 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1931 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1932 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1933 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1934 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1935 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1936 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1937 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1938 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1939 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1940 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1941 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1942 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1943 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1944 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1945 GEN_VEXT_VX(vmacc_vx_b, 1)
1946 GEN_VEXT_VX(vmacc_vx_h, 2)
1947 GEN_VEXT_VX(vmacc_vx_w, 4)
1948 GEN_VEXT_VX(vmacc_vx_d, 8)
1949 GEN_VEXT_VX(vnmsac_vx_b, 1)
1950 GEN_VEXT_VX(vnmsac_vx_h, 2)
1951 GEN_VEXT_VX(vnmsac_vx_w, 4)
1952 GEN_VEXT_VX(vnmsac_vx_d, 8)
1953 GEN_VEXT_VX(vmadd_vx_b, 1)
1954 GEN_VEXT_VX(vmadd_vx_h, 2)
1955 GEN_VEXT_VX(vmadd_vx_w, 4)
1956 GEN_VEXT_VX(vmadd_vx_d, 8)
1957 GEN_VEXT_VX(vnmsub_vx_b, 1)
1958 GEN_VEXT_VX(vnmsub_vx_h, 2)
1959 GEN_VEXT_VX(vnmsub_vx_w, 4)
1960 GEN_VEXT_VX(vnmsub_vx_d, 8)
1961 
1962 /* Vector Widening Integer Multiply-Add Instructions */
1963 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1964 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1965 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1966 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1967 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1968 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1969 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1970 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1971 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1972 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1973 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1974 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1975 GEN_VEXT_VV(vwmacc_vv_b, 2)
1976 GEN_VEXT_VV(vwmacc_vv_h, 4)
1977 GEN_VEXT_VV(vwmacc_vv_w, 8)
1978 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1979 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1980 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1981 
1982 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1983 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1984 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1985 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1986 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1987 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1988 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1989 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1990 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1991 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1992 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1993 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1994 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1995 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1996 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1997 GEN_VEXT_VX(vwmacc_vx_b, 2)
1998 GEN_VEXT_VX(vwmacc_vx_h, 4)
1999 GEN_VEXT_VX(vwmacc_vx_w, 8)
2000 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2001 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2002 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2003 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2004 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2005 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2006 
2007 /* Vector Integer Merge and Move Instructions */
2008 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2009 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2010                   uint32_t desc)                                     \
2011 {                                                                    \
2012     uint32_t vl = env->vl;                                           \
2013     uint32_t esz = sizeof(ETYPE);                                    \
2014     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2015     uint32_t vta = vext_vta(desc);                                   \
2016     uint32_t i;                                                      \
2017                                                                      \
2018     for (i = env->vstart; i < vl; i++) {                             \
2019         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2020         *((ETYPE *)vd + H(i)) = s1;                                  \
2021     }                                                                \
2022     env->vstart = 0;                                                 \
2023     /* set tail elements to 1s */                                    \
2024     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2025 }
2026 
2027 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2028 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2029 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2030 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2031 
2032 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2033 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2034                   uint32_t desc)                                     \
2035 {                                                                    \
2036     uint32_t vl = env->vl;                                           \
2037     uint32_t esz = sizeof(ETYPE);                                    \
2038     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2039     uint32_t vta = vext_vta(desc);                                   \
2040     uint32_t i;                                                      \
2041                                                                      \
2042     for (i = env->vstart; i < vl; i++) {                             \
2043         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2044     }                                                                \
2045     env->vstart = 0;                                                 \
2046     /* set tail elements to 1s */                                    \
2047     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2048 }
2049 
2050 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2051 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2052 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2053 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2054 
2055 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2056 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2057                   CPURISCVState *env, uint32_t desc)                 \
2058 {                                                                    \
2059     uint32_t vl = env->vl;                                           \
2060     uint32_t esz = sizeof(ETYPE);                                    \
2061     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2062     uint32_t vta = vext_vta(desc);                                   \
2063     uint32_t i;                                                      \
2064                                                                      \
2065     for (i = env->vstart; i < vl; i++) {                             \
2066         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2067         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2068     }                                                                \
2069     env->vstart = 0;                                                 \
2070     /* set tail elements to 1s */                                    \
2071     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2072 }
2073 
2074 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2075 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2078 
/*
 * Define HELPER(NAME), a vector-scalar merge over ETYPE elements.
 *
 * For each index i in [env->vstart, env->vl) the destination element is
 * the scalar s1 (converted through target_long to ETYPE) when
 * vext_elem_mask(v0, i) is non-zero, and element i of vs2 otherwise.
 * H is the index-adjustment macro applied to every element access.
 * Afterwards env->vstart is cleared and the byte range
 * [vl * esz, total_elems * esz) is passed to vext_set_elems_1s() together
 * with the vta flag decoded from desc.
 */
#define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
                  void *vs2, CPURISCVState *env, uint32_t desc)      \
{                                                                    \
    const uint32_t vl = env->vl;                                     \
    const uint32_t esz = sizeof(ETYPE);                              \
    const uint32_t total_elems =                                     \
        vext_get_total_elems(env, desc, esz);                        \
    const uint32_t vta = vext_vta(desc);                             \
    const ETYPE scalar = (ETYPE)(target_long)s1;                     \
    ETYPE *src = vs2;                                                \
    ETYPE *dst = vd;                                                 \
    uint32_t idx = env->vstart;                                      \
                                                                     \
    while (idx < vl) {                                               \
        ETYPE elem = src[H(idx)];                                    \
        /* mask bit set selects the scalar, clear keeps vs2 */       \
        dst[H(idx)] = vext_elem_mask(v0, idx) ? scalar : elem;       \
        idx++;                                                       \
    }                                                                \
    env->vstart = 0;                                                 \
    /* set tail elements to 1s */                                    \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
}
2099 
/*
 * vmerge_vxm_* helpers: one instantiation per element type, each paired
 * with the H index macro named for its element size.
 */
GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2104 
2105 /*
2106  *** Vector Fixed-Point Arithmetic Instructions
2107  */
2108 
2109 /* Vector Single-Width Saturating Add and Subtract */
2110 
/*
 * Fixed-point instructions generally take a rounding mode and may saturate,
 * so the common macros for them are defined here.
 */
/*
 * Signature of a per-element worker for fixed-point vector-vector
 * operations: it is handed the destination and both source registers, the
 * element index i, the CPU state and the rounding mode vxrm.
 */
typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2117 
/*
 * Define do_NAME(), the per-element worker of a fixed-point vector-vector
 * operation.  Element i of vs1 is read as T1 and converted to TX1, element
 * i of vs2 is read as T2 and converted to TX2, and OP(env, vxrm, s2, s1)
 * is stored as TD into element i of vd.  HD, HS1 and HS2 adjust the index
 * used for vd, vs1 and vs2 respectively.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    T1 *src1 = vs1;                                                 \
    T2 *src2 = vs2;                                                 \
    TD *dst = vd;                                                   \
    TX1 op1 = src1[HS1(i)];                                         \
    TX2 op2 = src2[HS2(i)];                                         \
                                                                    \
    dst[HD(i)] = OP(env, vxrm, op2, op1);                           \
}
2127 
2128 static inline void
2129 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2130              CPURISCVState *env,
2131              uint32_t vl, uint32_t vm, int vxrm,
2132              opivv2_rm_fn *fn)
2133 {
2134     for (uint32_t i = env->vstart; i < vl; i++) {
2135         if (!vm && !vext_elem_mask(v0, i)) {
2136             continue;
2137         }
2138         fn(vd, vs1, vs2, i, env, vxrm);
2139     }
2140     env->vstart = 0;
2141 }
2142 
2143 static inline void
2144 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2145              CPURISCVState *env,
2146              uint32_t desc,
2147              opivv2_rm_fn *fn, uint32_t esz)
2148 {
2149     uint32_t vm = vext_vm(desc);
2150     uint32_t vl = env->vl;
2151     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2152     uint32_t vta = vext_vta(desc);
2153 
2154     switch (env->vxrm) {
2155     case 0: /* rnu */
2156         vext_vv_rm_1(vd, v0, vs1, vs2,
2157                      env, vl, vm, 0, fn);
2158         break;
2159     case 1: /* rne */
2160         vext_vv_rm_1(vd, v0, vs1, vs2,
2161                      env, vl, vm, 1, fn);
2162         break;
2163     case 2: /* rdn */
2164         vext_vv_rm_1(vd, v0, vs1, vs2,
2165                      env, vl, vm, 2, fn);
2166         break;
2167     default: /* rod */
2168         vext_vv_rm_1(vd, v0, vs1, vs2,
2169                      env, vl, vm, 3, fn);
2170         break;
2171     }
2172     /* set tail elements to 1s */
2173     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2174 }
2175 
/*
 * Generate the HELPER(NAME) entry point of a fixed-point instruction in
 * OPIVV format.  It forwards to vext_vv_rm_2() with do_NAME as the
 * per-element worker and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                                         \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_##NAME, ESZ);            \
}
2184 
2185 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2186 {
2187     uint8_t res = a + b;
2188     if (res < a) {
2189         res = UINT8_MAX;
2190         env->vxsat = 0x1;
2191     }
2192     return res;
2193 }
2194 
2195 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2196                                uint16_t b)
2197 {
2198     uint16_t res = a + b;
2199     if (res < a) {
2200         res = UINT16_MAX;
2201         env->vxsat = 0x1;
2202     }
2203     return res;
2204 }
2205 
2206 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2207                                uint32_t b)
2208 {
2209     uint32_t res = a + b;
2210     if (res < a) {
2211         res = UINT32_MAX;
2212         env->vxsat = 0x1;
2213     }
2214     return res;
2215 }
2216 
2217 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2218                                uint64_t b)
2219 {
2220     uint64_t res = a + b;
2221     if (res < a) {
2222         res = UINT64_MAX;
2223         env->vxsat = 0x1;
2224     }
2225     return res;
2226 }
2227 
/*
 * vsaddu_vv_*: per-element workers (do_vsaddu_vv_*) built on saddu8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_VV_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2236 
/*
 * Signature of a per-element worker for fixed-point vector-scalar
 * operations: it is handed the destination register, the scalar operand
 * s1, the vector source vs2, the element index i, the CPU state and the
 * rounding mode vxrm.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2239 
/*
 * Define do_NAME(), the per-element worker of a fixed-point vector-scalar
 * operation.  Element i of vs2 is read as T2 and converted to TX2; the
 * scalar s1 is narrowed to T1 and then converted to TX1.  The value of
 * OP(env, vxrm, s2, scalar) is stored as TD into element i of vd.  HD and
 * HS2 adjust the index used for vd and vs2 respectively.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    T2 *src2 = vs2;                                                 \
    TD *dst = vd;                                                   \
    TX2 op2 = src2[HS2(i)];                                         \
                                                                    \
    dst[HD(i)] = OP(env, vxrm, op2, (TX1)(T1)s1);                   \
}
2248 
2249 static inline void
2250 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2251              CPURISCVState *env,
2252              uint32_t vl, uint32_t vm, int vxrm,
2253              opivx2_rm_fn *fn)
2254 {
2255     for (uint32_t i = env->vstart; i < vl; i++) {
2256         if (!vm && !vext_elem_mask(v0, i)) {
2257             continue;
2258         }
2259         fn(vd, s1, vs2, i, env, vxrm);
2260     }
2261     env->vstart = 0;
2262 }
2263 
2264 static inline void
2265 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2266              CPURISCVState *env,
2267              uint32_t desc,
2268              opivx2_rm_fn *fn, uint32_t esz)
2269 {
2270     uint32_t vm = vext_vm(desc);
2271     uint32_t vl = env->vl;
2272     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2273     uint32_t vta = vext_vta(desc);
2274 
2275     switch (env->vxrm) {
2276     case 0: /* rnu */
2277         vext_vx_rm_1(vd, v0, s1, vs2,
2278                      env, vl, vm, 0, fn);
2279         break;
2280     case 1: /* rne */
2281         vext_vx_rm_1(vd, v0, s1, vs2,
2282                      env, vl, vm, 1, fn);
2283         break;
2284     case 2: /* rdn */
2285         vext_vx_rm_1(vd, v0, s1, vs2,
2286                      env, vl, vm, 2, fn);
2287         break;
2288     default: /* rod */
2289         vext_vx_rm_1(vd, v0, s1, vs2,
2290                      env, vl, vm, 3, fn);
2291         break;
2292     }
2293     /* set tail elements to 1s */
2294     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2295 }
2296 
/*
 * Generate the HELPER(NAME) entry point of a fixed-point instruction in
 * OPIVX format.  It forwards to vext_vx_rm_2() with do_NAME as the
 * per-element worker and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,                    \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc, do_##NAME, ESZ);             \
}
2305 
/*
 * vsaddu_vx_*: per-element workers (do_vsaddu_vx_*) built on saddu8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_VX_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2314 
2315 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2316 {
2317     int8_t res = a + b;
2318     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2319         res = a > 0 ? INT8_MAX : INT8_MIN;
2320         env->vxsat = 0x1;
2321     }
2322     return res;
2323 }
2324 
2325 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2326 {
2327     int16_t res = a + b;
2328     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2329         res = a > 0 ? INT16_MAX : INT16_MIN;
2330         env->vxsat = 0x1;
2331     }
2332     return res;
2333 }
2334 
2335 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2336 {
2337     int32_t res = a + b;
2338     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2339         res = a > 0 ? INT32_MAX : INT32_MIN;
2340         env->vxsat = 0x1;
2341     }
2342     return res;
2343 }
2344 
2345 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2346 {
2347     int64_t res = a + b;
2348     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2349         res = a > 0 ? INT64_MAX : INT64_MIN;
2350         env->vxsat = 0x1;
2351     }
2352     return res;
2353 }
2354 
/*
 * vsadd_vv_* and vsadd_vx_*: per-element workers built on sadd8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2372 
2373 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2374 {
2375     uint8_t res = a - b;
2376     if (res > a) {
2377         res = 0;
2378         env->vxsat = 0x1;
2379     }
2380     return res;
2381 }
2382 
2383 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2384                                uint16_t b)
2385 {
2386     uint16_t res = a - b;
2387     if (res > a) {
2388         res = 0;
2389         env->vxsat = 0x1;
2390     }
2391     return res;
2392 }
2393 
2394 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2395                                uint32_t b)
2396 {
2397     uint32_t res = a - b;
2398     if (res > a) {
2399         res = 0;
2400         env->vxsat = 0x1;
2401     }
2402     return res;
2403 }
2404 
2405 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2406                                uint64_t b)
2407 {
2408     uint64_t res = a - b;
2409     if (res > a) {
2410         res = 0;
2411         env->vxsat = 0x1;
2412     }
2413     return res;
2414 }
2415 
/*
 * vssubu_vv_* and vssubu_vx_*: per-element workers built on ssubu8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2433 
2434 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2435 {
2436     int8_t res = a - b;
2437     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2438         res = a >= 0 ? INT8_MAX : INT8_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2445 {
2446     int16_t res = a - b;
2447     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2448         res = a >= 0 ? INT16_MAX : INT16_MIN;
2449         env->vxsat = 0x1;
2450     }
2451     return res;
2452 }
2453 
2454 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2455 {
2456     int32_t res = a - b;
2457     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2458         res = a >= 0 ? INT32_MAX : INT32_MIN;
2459         env->vxsat = 0x1;
2460     }
2461     return res;
2462 }
2463 
2464 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2465 {
2466     int64_t res = a - b;
2467     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2468         res = a >= 0 ? INT64_MAX : INT64_MIN;
2469         env->vxsat = 0x1;
2470     }
2471     return res;
2472 }
2473 
/*
 * vssub_vv_* and vssub_vx_*: per-element workers built on ssub8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2491 
2492 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Return the rounding increment (0 or 1) to add to (v >> shift).
 *
 * vxrm selects the mode: 0 round-to-nearest-up, 1 round-to-nearest-even,
 * 2 round-down (truncate), 3 round-to-odd.  A shift of 0, or one larger
 * than 64, yields 0.
 *
 * The range check must come before any extract64() call, because
 * extract64() asserts that the requested field lies inside the 64-bit
 * value.  For shift == 64 the bit at position 'shift' does not exist and
 * is taken to be 0.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* d: lowest bit that survives the shift (absent when shift == 64) */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    /* d1: highest bit shifted out; D1: all bits shifted out */
    d1 = extract64(v, shift - 1, 1);
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            /* D2: the shifted-out bits below d1 */
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2519 
2520 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2521 {
2522     int64_t res = (int64_t)a + b;
2523     uint8_t round = get_round(vxrm, res, 1);
2524 
2525     return (res >> 1) + round;
2526 }
2527 
2528 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2529 {
2530     int64_t res = a + b;
2531     uint8_t round = get_round(vxrm, res, 1);
2532     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2533 
2534     /* With signed overflow, bit 64 is inverse of bit 63. */
2535     return ((res >> 1) ^ over) + round;
2536 }
2537 
/*
 * vaadd_vv_* and vaadd_vx_*: per-element workers followed by the HELPER
 * entry points.  The 8-, 16- and 32-bit forms all use aadd32; only the
 * 64-bit form uses aadd64.  The second GEN_VEXT_*_RM argument is the
 * element size in bytes.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2555 
2556 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2557                                uint32_t a, uint32_t b)
2558 {
2559     uint64_t res = (uint64_t)a + b;
2560     uint8_t round = get_round(vxrm, res, 1);
2561 
2562     return (res >> 1) + round;
2563 }
2564 
2565 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2566                                uint64_t a, uint64_t b)
2567 {
2568     uint64_t res = a + b;
2569     uint8_t round = get_round(vxrm, res, 1);
2570     uint64_t over = (uint64_t)(res < a) << 63;
2571 
2572     return ((res >> 1) | over) + round;
2573 }
2574 
/*
 * vaaddu_vv_* and vaaddu_vx_*: per-element workers followed by the HELPER
 * entry points.  The 8-, 16- and 32-bit forms all use aaddu32; only the
 * 64-bit form uses aaddu64.  The second GEN_VEXT_*_RM argument is the
 * element size in bytes.
 */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2592 
2593 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2594 {
2595     int64_t res = (int64_t)a - b;
2596     uint8_t round = get_round(vxrm, res, 1);
2597 
2598     return (res >> 1) + round;
2599 }
2600 
2601 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2602 {
2603     int64_t res = (int64_t)a - b;
2604     uint8_t round = get_round(vxrm, res, 1);
2605     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2606 
2607     /* With signed overflow, bit 64 is inverse of bit 63. */
2608     return ((res >> 1) ^ over) + round;
2609 }
2610 
/*
 * vasub_vv_* and vasub_vx_*: per-element workers followed by the HELPER
 * entry points.  The 8-, 16- and 32-bit forms all use asub32; only the
 * 64-bit form uses asub64.  The second GEN_VEXT_*_RM argument is the
 * element size in bytes.
 */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2628 
2629 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2630                                uint32_t a, uint32_t b)
2631 {
2632     int64_t res = (int64_t)a - b;
2633     uint8_t round = get_round(vxrm, res, 1);
2634 
2635     return (res >> 1) + round;
2636 }
2637 
2638 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2639                                uint64_t a, uint64_t b)
2640 {
2641     uint64_t res = (uint64_t)a - b;
2642     uint8_t round = get_round(vxrm, res, 1);
2643     uint64_t over = (uint64_t)(res > a) << 63;
2644 
2645     return ((res >> 1) | over) + round;
2646 }
2647 
/*
 * vasubu_vv_* and vasubu_vx_*: per-element workers followed by the HELPER
 * entry points.  The 8-, 16- and 32-bit forms all use asubu32; only the
 * 64-bit form uses asubu64.  The second GEN_VEXT_*_RM argument is the
 * element size in bytes.
 */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2665 
2666 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2667 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2668 {
2669     uint8_t round;
2670     int16_t res;
2671 
2672     res = (int16_t)a * (int16_t)b;
2673     round = get_round(vxrm, res, 7);
2674     res   = (res >> 7) + round;
2675 
2676     if (res > INT8_MAX) {
2677         env->vxsat = 0x1;
2678         return INT8_MAX;
2679     } else if (res < INT8_MIN) {
2680         env->vxsat = 0x1;
2681         return INT8_MIN;
2682     } else {
2683         return res;
2684     }
2685 }
2686 
2687 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2688 {
2689     uint8_t round;
2690     int32_t res;
2691 
2692     res = (int32_t)a * (int32_t)b;
2693     round = get_round(vxrm, res, 15);
2694     res   = (res >> 15) + round;
2695 
2696     if (res > INT16_MAX) {
2697         env->vxsat = 0x1;
2698         return INT16_MAX;
2699     } else if (res < INT16_MIN) {
2700         env->vxsat = 0x1;
2701         return INT16_MIN;
2702     } else {
2703         return res;
2704     }
2705 }
2706 
2707 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2708 {
2709     uint8_t round;
2710     int64_t res;
2711 
2712     res = (int64_t)a * (int64_t)b;
2713     round = get_round(vxrm, res, 31);
2714     res   = (res >> 31) + round;
2715 
2716     if (res > INT32_MAX) {
2717         env->vxsat = 0x1;
2718         return INT32_MAX;
2719     } else if (res < INT32_MIN) {
2720         env->vxsat = 0x1;
2721         return INT32_MIN;
2722     } else {
2723         return res;
2724     }
2725 }
2726 
2727 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2728 {
2729     uint8_t round;
2730     uint64_t hi_64, lo_64;
2731     int64_t res;
2732 
2733     if (a == INT64_MIN && b == INT64_MIN) {
2734         env->vxsat = 1;
2735         return INT64_MAX;
2736     }
2737 
2738     muls64(&lo_64, &hi_64, a, b);
2739     round = get_round(vxrm, lo_64, 63);
2740     /*
2741      * Cannot overflow, as there are always
2742      * 2 sign bits after multiply.
2743      */
2744     res = (hi_64 << 1) | (lo_64 >> 63);
2745     if (round) {
2746         if (res == INT64_MAX) {
2747             env->vxsat = 1;
2748         } else {
2749             res += 1;
2750         }
2751     }
2752     return res;
2753 }
2754 
/*
 * vsmul_vv_* and vsmul_vx_*: per-element workers built on vsmul8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2772 
2773 /* Vector Single-Width Scaling Shift Instructions */
2774 static inline uint8_t
2775 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2776 {
2777     uint8_t round, shift = b & 0x7;
2778     uint8_t res;
2779 
2780     round = get_round(vxrm, a, shift);
2781     res   = (a >> shift)  + round;
2782     return res;
2783 }
2784 static inline uint16_t
2785 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2786 {
2787     uint8_t round, shift = b & 0xf;
2788     uint16_t res;
2789 
2790     round = get_round(vxrm, a, shift);
2791     res   = (a >> shift)  + round;
2792     return res;
2793 }
2794 static inline uint32_t
2795 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2796 {
2797     uint8_t round, shift = b & 0x1f;
2798     uint32_t res;
2799 
2800     round = get_round(vxrm, a, shift);
2801     res   = (a >> shift)  + round;
2802     return res;
2803 }
2804 static inline uint64_t
2805 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2806 {
2807     uint8_t round, shift = b & 0x3f;
2808     uint64_t res;
2809 
2810     round = get_round(vxrm, a, shift);
2811     res   = (a >> shift)  + round;
2812     return res;
2813 }
/*
 * vssrl_vv_* and vssrl_vx_*: per-element workers built on vssrl8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2831 
2832 static inline int8_t
2833 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2834 {
2835     uint8_t round, shift = b & 0x7;
2836     int8_t res;
2837 
2838     round = get_round(vxrm, a, shift);
2839     res   = (a >> shift)  + round;
2840     return res;
2841 }
2842 static inline int16_t
2843 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2844 {
2845     uint8_t round, shift = b & 0xf;
2846     int16_t res;
2847 
2848     round = get_round(vxrm, a, shift);
2849     res   = (a >> shift)  + round;
2850     return res;
2851 }
2852 static inline int32_t
2853 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2854 {
2855     uint8_t round, shift = b & 0x1f;
2856     int32_t res;
2857 
2858     round = get_round(vxrm, a, shift);
2859     res   = (a >> shift)  + round;
2860     return res;
2861 }
2862 static inline int64_t
2863 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2864 {
2865     uint8_t round, shift = b & 0x3f;
2866     int64_t res;
2867 
2868     round = get_round(vxrm, a, shift);
2869     res   = (a >> shift)  + round;
2870     return res;
2871 }
2872 
/*
 * vssra_vv_* and vssra_vx_*: per-element workers built on vssra8..64,
 * followed by the HELPER entry points.  The second GEN_VEXT_*_RM argument
 * is the element size in bytes.
 */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2890 
2891 /* Vector Narrowing Fixed-Point Clip Instructions */
2892 static inline int8_t
2893 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2894 {
2895     uint8_t round, shift = b & 0xf;
2896     int16_t res;
2897 
2898     round = get_round(vxrm, a, shift);
2899     res   = (a >> shift)  + round;
2900     if (res > INT8_MAX) {
2901         env->vxsat = 0x1;
2902         return INT8_MAX;
2903     } else if (res < INT8_MIN) {
2904         env->vxsat = 0x1;
2905         return INT8_MIN;
2906     } else {
2907         return res;
2908     }
2909 }
2910 
2911 static inline int16_t
2912 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2913 {
2914     uint8_t round, shift = b & 0x1f;
2915     int32_t res;
2916 
2917     round = get_round(vxrm, a, shift);
2918     res   = (a >> shift)  + round;
2919     if (res > INT16_MAX) {
2920         env->vxsat = 0x1;
2921         return INT16_MAX;
2922     } else if (res < INT16_MIN) {
2923         env->vxsat = 0x1;
2924         return INT16_MIN;
2925     } else {
2926         return res;
2927     }
2928 }
2929 
2930 static inline int32_t
2931 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2932 {
2933     uint8_t round, shift = b & 0x3f;
2934     int64_t res;
2935 
2936     round = get_round(vxrm, a, shift);
2937     res   = (a >> shift)  + round;
2938     if (res > INT32_MAX) {
2939         env->vxsat = 0x1;
2940         return INT32_MAX;
2941     } else if (res < INT32_MIN) {
2942         env->vxsat = 0x1;
2943         return INT32_MIN;
2944     } else {
2945         return res;
2946     }
2947 }
2948 
2949 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2950 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2951 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2952 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2953 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2954 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2955 
2956 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2957 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2958 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2959 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2960 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2961 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2962 
2963 static inline uint8_t
2964 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2965 {
2966     uint8_t round, shift = b & 0xf;
2967     uint16_t res;
2968 
2969     round = get_round(vxrm, a, shift);
2970     res   = (a >> shift)  + round;
2971     if (res > UINT8_MAX) {
2972         env->vxsat = 0x1;
2973         return UINT8_MAX;
2974     } else {
2975         return res;
2976     }
2977 }
2978 
2979 static inline uint16_t
2980 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2981 {
2982     uint8_t round, shift = b & 0x1f;
2983     uint32_t res;
2984 
2985     round = get_round(vxrm, a, shift);
2986     res   = (a >> shift)  + round;
2987     if (res > UINT16_MAX) {
2988         env->vxsat = 0x1;
2989         return UINT16_MAX;
2990     } else {
2991         return res;
2992     }
2993 }
2994 
2995 static inline uint32_t
2996 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2997 {
2998     uint8_t round, shift = b & 0x3f;
2999     uint64_t res;
3000 
3001     round = get_round(vxrm, a, shift);
3002     res   = (a >> shift)  + round;
3003     if (res > UINT32_MAX) {
3004         env->vxsat = 0x1;
3005         return UINT32_MAX;
3006     } else {
3007         return res;
3008     }
3009 }
3010 
/*
 * Instantiate the vnclipu helpers on top of vnclipu8/16/32.  The RVVCALL
 * and GEN_VEXT_*_RM macros are defined elsewhere in this file; the numeric
 * argument (1, 2 or 4) is presumably the destination element size in bytes.
 */
/* Vector-vector (wv) forms. */
RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
GEN_VEXT_VV_RM(vnclipu_wv_w, 4)

/* Vector-scalar (wx) forms. */
RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3024 
3025 /*
 *** Vector Floating-Point Arithmetic Instructions
3027  */
3028 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * OPFVV2: define do_<NAME>(), the per-element body of a two-operand
 * vector-vector floating-point operation.
 *
 * Element i is loaded from vs1 (as T1, indexed through HS1) and from vs2
 * (as T2, indexed through HS2) into locals of type TX1/TX2.  The result of
 * OP(s2, s1, &env->fp_status) is stored to element i of vd (as TD, indexed
 * through HD).  Note that the vs2 element is OP's first argument.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
}
3037 
/*
 * GEN_VEXT_VV_ENV: define HELPER(NAME), the entry point that runs
 * do_<NAME>() over a vector-vector operation.
 *
 * Elements env->vstart .. env->vl - 1 are visited.  When vext_vm(desc) is
 * zero, an element is skipped if vext_elem_mask(v0, i) is zero.  Afterwards
 * env->vstart is cleared and the byte range [vl * ESZ, total_elems * ESZ)
 * of vd is passed to vext_set_elems_1s() together with the vta setting.
 * ESZ is used as the size in bytes of one vd element.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, vs1, vs2, i, env);                  \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3061 
/*
 * vfadd (vector-vector): per-element bodies built on float16/32/64_add,
 * and helper entry points for 2-, 4- and 8-byte elements.
 */
RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3068 
/*
 * OPFVF2: define do_<NAME>(), the per-element body of a two-operand
 * vector-scalar floating-point operation.
 *
 * The scalar arrives as a uint64_t and is cast to T1 and then TX1 before
 * use.  Element i of vs2 (as T2, indexed through HS2) is OP's first
 * argument and the scalar its second; the result is stored to element i
 * of vd (as TD, indexed through HD).
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
}
3076 
/*
 * GEN_VEXT_VF: define HELPER(NAME), the entry point that runs do_<NAME>()
 * over a vector-scalar operation (scalar passed in s1).
 *
 * Elements env->vstart .. env->vl - 1 are visited.  When vext_vm(desc) is
 * zero, an element is skipped if vext_elem_mask(v0, i) is zero.  Afterwards
 * env->vstart is cleared and the byte range [vl * ESZ, total_elems * ESZ)
 * of vd is passed to vext_set_elems_1s() together with the vta setting.
 * ESZ is used as the size in bytes of one vd element.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);              \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t i;                                           \
                                                          \
    for (i = env->vstart; i < vl; i++) {                  \
        if (!vm && !vext_elem_mask(v0, i)) {              \
            continue;                                     \
        }                                                 \
        do_##NAME(vd, s1, vs2, i, env);                   \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3100 
/*
 * vfadd (vector-scalar): per-element bodies built on float16/32/64_add,
 * and helper entry points for 2-, 4- and 8-byte elements.
 */
RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
GEN_VEXT_VF(vfadd_vf_h, 2)
GEN_VEXT_VF(vfadd_vf_w, 4)
GEN_VEXT_VF(vfadd_vf_d, 8)
3107 
/*
 * vfsub: the vs2 element is the first argument of floatN_sub and the
 * vs1 element (or scalar) the second, as arranged by OPFVV2/OPFVF2.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
GEN_VEXT_VF(vfsub_vf_h, 2)
GEN_VEXT_VF(vfsub_vf_w, 4)
GEN_VEXT_VF(vfsub_vf_d, 8)
3120 
3121 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3122 {
3123     return float16_sub(b, a, s);
3124 }
3125 
3126 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3127 {
3128     return float32_sub(b, a, s);
3129 }
3130 
3131 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3132 {
3133     return float64_sub(b, a, s);
3134 }
3135 
/*
 * vfrsub (vector-scalar only): floatN_rsub swaps its operands, so the
 * scalar is the minuend and the vs2 element the subtrahend.
 */
RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
GEN_VEXT_VF(vfrsub_vf_h, 2)
GEN_VEXT_VF(vfrsub_vf_w, 4)
GEN_VEXT_VF(vfrsub_vf_d, 8)
3142 
3143 /* Vector Widening Floating-Point Add/Subtract Instructions */
3144 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3145 {
3146     return float32_add(float16_to_float32(a, true, s),
3147             float16_to_float32(b, true, s), s);
3148 }
3149 
3150 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3151 {
3152     return float64_add(float32_to_float64(a, s),
3153             float32_to_float64(b, s), s);
3154 
3155 }
3156 
/*
 * vfwadd: widening add of two narrow sources.  The destination element
 * size passed to the GEN_VEXT_* macros is 4 or 8 bytes.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
GEN_VEXT_VF(vfwadd_vf_h, 4)
GEN_VEXT_VF(vfwadd_vf_w, 8)
3165 
3166 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3167 {
3168     return float32_sub(float16_to_float32(a, true, s),
3169             float16_to_float32(b, true, s), s);
3170 }
3171 
3172 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3173 {
3174     return float64_sub(float32_to_float64(a, s),
3175             float32_to_float64(b, s), s);
3176 
3177 }
3178 
/*
 * vfwsub: widening subtract of two narrow sources.  The destination
 * element size passed to the GEN_VEXT_* macros is 4 or 8 bytes.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
GEN_VEXT_VF(vfwsub_vf_h, 4)
GEN_VEXT_VF(vfwsub_vf_w, 8)
3187 
3188 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3189 {
3190     return float32_add(a, float16_to_float32(b, true, s), s);
3191 }
3192 
3193 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3194 {
3195     return float64_add(a, float32_to_float64(b, s), s);
3196 }
3197 
/*
 * vfwadd.w*: the per-element routines (vfwaddw16/32) take a first operand
 * that is already wide, which per OPFVV2/OPFVF2 is the vs2 element.
 */
/* Wide vector + narrow vector. */
RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
/* Wide vector + narrow scalar. */
RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
GEN_VEXT_VF(vfwadd_wf_h, 4)
GEN_VEXT_VF(vfwadd_wf_w, 8)
3206 
3207 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3208 {
3209     return float32_sub(a, float16_to_float32(b, true, s), s);
3210 }
3211 
3212 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3213 {
3214     return float64_sub(a, float32_to_float64(b, s), s);
3215 }
3216 
/*
 * vfwsub.w*: the per-element routines (vfwsubw16/32) take a first operand
 * that is already wide, which per OPFVV2/OPFVF2 is the vs2 element.
 */
/* Wide vector - narrow vector. */
RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
/* Wide vector - narrow scalar. */
RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
GEN_VEXT_VF(vfwsub_wf_h, 4)
GEN_VEXT_VF(vfwsub_wf_w, 8)
3225 
3226 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
/* vfmul, vector-vector forms, built directly on float16/32/64_mul. */
RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
/* vfmul, vector-scalar forms. */
RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
GEN_VEXT_VF(vfmul_vf_h, 2)
GEN_VEXT_VF(vfmul_vf_w, 4)
GEN_VEXT_VF(vfmul_vf_d, 8)
3239 
/*
 * vfdiv: the vs2 element is the first argument of floatN_div and the
 * vs1 element (or scalar) the second, as arranged by OPFVV2/OPFVF2.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
GEN_VEXT_VF(vfdiv_vf_h, 2)
GEN_VEXT_VF(vfdiv_vf_w, 4)
GEN_VEXT_VF(vfdiv_vf_d, 8)
3252 
3253 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3254 {
3255     return float16_div(b, a, s);
3256 }
3257 
3258 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3259 {
3260     return float32_div(b, a, s);
3261 }
3262 
3263 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3264 {
3265     return float64_div(b, a, s);
3266 }
3267 
/*
 * vfrdiv (vector-scalar only): floatN_rdiv swaps its operands, so the
 * scalar is the dividend and the vs2 element the divisor.
 */
RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
GEN_VEXT_VF(vfrdiv_vf_h, 2)
GEN_VEXT_VF(vfrdiv_vf_w, 4)
GEN_VEXT_VF(vfrdiv_vf_d, 8)
3274 
3275 /* Vector Widening Floating-Point Multiply */
3276 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3277 {
3278     return float32_mul(float16_to_float32(a, true, s),
3279             float16_to_float32(b, true, s), s);
3280 }
3281 
3282 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3283 {
3284     return float64_mul(float32_to_float64(a, s),
3285             float32_to_float64(b, s), s);
3286 
3287 }
/*
 * vfwmul: widening multiply of two narrow sources.  The destination
 * element size passed to the GEN_VEXT_* macros is 4 or 8 bytes.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
GEN_VEXT_VF(vfwmul_vf_h, 4)
GEN_VEXT_VF(vfwmul_vf_w, 8)
3296 
3297 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3: define do_<NAME>(), the per-element body of a three-operand
 * vector-vector floating-point operation.
 *
 * Besides the vs1 and vs2 elements, the current value of element i of vd
 * is read and passed as the third argument: OP(s2, s1, d, &env->fp_status).
 * The result overwrites that same vd element.
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
        CPURISCVState *env)                                        \
{                                                                  \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
    TD d = *((TD *)vd + HD(i));                                    \
    *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
}
3307 
3308 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3309 {
3310     return float16_muladd(a, b, d, 0, s);
3311 }
3312 
3313 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3314 {
3315     return float32_muladd(a, b, d, 0, s);
3316 }
3317 
3318 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3319 {
3320     return float64_muladd(a, b, d, 0, s);
3321 }
3322 
/*
 * vfmacc (vector-vector): fmaccN receives (vs2 element, vs1 element,
 * old vd element) from OPFVV3; the result replaces the vd element.
 */
RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3329 
/*
 * OPFVF3: define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar floating-point operation.
 *
 * The scalar s1 is cast to T1 and then TX1.  OP is called with the vs2
 * element, the scalar, and the current value of element i of vd; the
 * result overwrites that same vd element.
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
        CPURISCVState *env)                                       \
{                                                                 \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
    TD d = *((TD *)vd + HD(i));                                   \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
}
3338 
/*
 * vfmacc (vector-scalar): fmaccN receives (vs2 element, scalar,
 * old vd element) from OPFVF3; the result replaces the vd element.
 */
RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
GEN_VEXT_VF(vfmacc_vf_h, 2)
GEN_VEXT_VF(vfmacc_vf_w, 4)
GEN_VEXT_VF(vfmacc_vf_d, 8)
3345 
3346 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3347 {
3348     return float16_muladd(a, b, d,
3349             float_muladd_negate_c | float_muladd_negate_product, s);
3350 }
3351 
3352 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3353 {
3354     return float32_muladd(a, b, d,
3355             float_muladd_negate_c | float_muladd_negate_product, s);
3356 }
3357 
3358 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3359 {
3360     return float64_muladd(a, b, d,
3361             float_muladd_negate_c | float_muladd_negate_product, s);
3362 }
3363 
/*
 * vfnmacc: fnmaccN negates both the product of the sources and the old
 * vd element used as addend.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
GEN_VEXT_VF(vfnmacc_vf_h, 2)
GEN_VEXT_VF(vfnmacc_vf_w, 4)
GEN_VEXT_VF(vfnmacc_vf_d, 8)
3376 
3377 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3378 {
3379     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3380 }
3381 
3382 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3383 {
3384     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3385 }
3386 
3387 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3388 {
3389     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3390 }
3391 
/*
 * vfmsac: fmsacN multiplies the two sources and negates only the old vd
 * element used as addend.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
GEN_VEXT_VF(vfmsac_vf_h, 2)
GEN_VEXT_VF(vfmsac_vf_w, 4)
GEN_VEXT_VF(vfmsac_vf_d, 8)
3404 
3405 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3406 {
3407     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3408 }
3409 
3410 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3411 {
3412     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3413 }
3414 
3415 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3416 {
3417     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3418 }
3419 
/*
 * vfnmsac: fnmsacN negates the product of the two sources and adds the
 * old vd element unchanged.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
GEN_VEXT_VF(vfnmsac_vf_h, 2)
GEN_VEXT_VF(vfnmsac_vf_w, 4)
GEN_VEXT_VF(vfnmsac_vf_d, 8)
3432 
3433 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3434 {
3435     return float16_muladd(d, b, a, 0, s);
3436 }
3437 
3438 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3439 {
3440     return float32_muladd(d, b, a, 0, s);
3441 }
3442 
3443 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3444 {
3445     return float64_muladd(d, b, a, 0, s);
3446 }
3447 
/*
 * vfmadd: fmaddN multiplies the old vd element by the vs1 element (or
 * scalar) and adds the vs2 element.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
GEN_VEXT_VF(vfmadd_vf_h, 2)
GEN_VEXT_VF(vfmadd_vf_w, 4)
GEN_VEXT_VF(vfmadd_vf_d, 8)
3460 
3461 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3462 {
3463     return float16_muladd(d, b, a,
3464             float_muladd_negate_c | float_muladd_negate_product, s);
3465 }
3466 
3467 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3468 {
3469     return float32_muladd(d, b, a,
3470             float_muladd_negate_c | float_muladd_negate_product, s);
3471 }
3472 
3473 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3474 {
3475     return float64_muladd(d, b, a,
3476             float_muladd_negate_c | float_muladd_negate_product, s);
3477 }
3478 
/*
 * vfnmadd: fnmaddN multiplies the old vd element by the vs1 element (or
 * scalar), negates that product, and also negates the vs2 addend.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
GEN_VEXT_VF(vfnmadd_vf_h, 2)
GEN_VEXT_VF(vfnmadd_vf_w, 4)
GEN_VEXT_VF(vfnmadd_vf_d, 8)
3491 
3492 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3493 {
3494     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3495 }
3496 
3497 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3498 {
3499     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3500 }
3501 
3502 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3503 {
3504     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3505 }
3506 
/*
 * vfmsub: fmsubN multiplies the old vd element by the vs1 element (or
 * scalar) and negates only the vs2 addend.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
GEN_VEXT_VF(vfmsub_vf_h, 2)
GEN_VEXT_VF(vfmsub_vf_w, 4)
GEN_VEXT_VF(vfmsub_vf_d, 8)
3519 
3520 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3521 {
3522     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3523 }
3524 
3525 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3526 {
3527     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3528 }
3529 
3530 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3531 {
3532     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3533 }
3534 
/*
 * vfnmsub: fnmsubN multiplies the old vd element by the vs1 element (or
 * scalar), negates that product, and adds the vs2 element unchanged.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
GEN_VEXT_VF(vfnmsub_vf_h, 2)
GEN_VEXT_VF(vfnmsub_vf_w, 4)
GEN_VEXT_VF(vfnmsub_vf_d, 8)
3547 
3548 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3549 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3550 {
3551     return float32_muladd(float16_to_float32(a, true, s),
3552                         float16_to_float32(b, true, s), d, 0, s);
3553 }
3554 
3555 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3556 {
3557     return float64_muladd(float32_to_float64(a, s),
3558                         float32_to_float64(b, s), d, 0, s);
3559 }
3560 
/*
 * vfwmacc: narrow sources, wide accumulator in vd (destination element
 * size 4 or 8 bytes).
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
GEN_VEXT_VF(vfwmacc_vf_h, 4)
GEN_VEXT_VF(vfwmacc_vf_w, 8)
3569 
3570 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3571 {
3572     return float32_muladd(float16_to_float32(a, true, s),
3573                         float16_to_float32(b, true, s), d,
3574                         float_muladd_negate_c | float_muladd_negate_product, s);
3575 }
3576 
3577 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3578 {
3579     return float64_muladd(float32_to_float64(a, s),
3580                         float32_to_float64(b, s), d,
3581                         float_muladd_negate_c | float_muladd_negate_product, s);
3582 }
3583 
/*
 * vfwnmacc: narrow sources, wide accumulator in vd; both the product and
 * the accumulator are negated by fwnmaccN.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
GEN_VEXT_VF(vfwnmacc_vf_h, 4)
GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3592 
3593 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3594 {
3595     return float32_muladd(float16_to_float32(a, true, s),
3596                         float16_to_float32(b, true, s), d,
3597                         float_muladd_negate_c, s);
3598 }
3599 
3600 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3601 {
3602     return float64_muladd(float32_to_float64(a, s),
3603                         float32_to_float64(b, s), d,
3604                         float_muladd_negate_c, s);
3605 }
3606 
/*
 * vfwmsac: narrow sources, wide accumulator in vd; only the accumulator
 * is negated by fwmsacN.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
GEN_VEXT_VF(vfwmsac_vf_h, 4)
GEN_VEXT_VF(vfwmsac_vf_w, 8)
3615 
3616 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3617 {
3618     return float32_muladd(float16_to_float32(a, true, s),
3619                         float16_to_float32(b, true, s), d,
3620                         float_muladd_negate_product, s);
3621 }
3622 
3623 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3624 {
3625     return float64_muladd(float32_to_float64(a, s),
3626                         float32_to_float64(b, s), d,
3627                         float_muladd_negate_product, s);
3628 }
3629 
/*
 * vfwnmsac: narrow sources, wide accumulator in vd; only the product is
 * negated by fwnmsacN.
 */
/* Vector-vector forms. */
RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
/* Vector-scalar forms. */
RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
GEN_VEXT_VF(vfwnmsac_vf_h, 4)
GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3638 
3639 /* Vector Floating-Point Square-Root Instruction */
3640 /* (TD, T2, TX2) */
/*
 * Type triples for single-source operations: each expands to the
 * destination type, the source type and the source's working type, all
 * the same unsigned integer type of 16, 32 or 64 bits.
 */
#define OP_UU_H uint16_t, uint16_t, uint16_t
#define OP_UU_W uint32_t, uint32_t, uint32_t
#define OP_UU_D uint64_t, uint64_t, uint64_t
3644 
/*
 * OPFVV1: define do_<NAME>(), the per-element body of a single-source
 * floating-point operation.
 *
 * Element i of vs2 (as T2, indexed through HS2) is loaded into a TX2
 * local and OP(s2, &env->fp_status) is stored to element i of vd (as TD,
 * indexed through HD).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, void *vs2, int i,      \
        CPURISCVState *env)                            \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
}
3652 
/*
 * GEN_VEXT_V_ENV: define HELPER(NAME), the entry point that runs
 * do_<NAME>() over a single-source vector operation.
 *
 * When env->vl is zero the helper returns immediately, before the loop,
 * the env->vstart reset and the tail handling.  Otherwise elements
 * env->vstart .. vl - 1 are visited; when vext_vm(desc) is zero, an element
 * is skipped if vext_elem_mask(v0, i) is zero.  Afterwards env->vstart is
 * cleared and the byte range [vl * ESZ, total_elems * ESZ) of vd is passed
 * to vext_set_elems_1s() together with the vta setting.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
        CPURISCVState *env, uint32_t desc)             \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t i;                                        \
                                                       \
    if (vl == 0) {                                     \
        return;                                        \
    }                                                  \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i, env);                    \
    }                                                  \
    env->vstart = 0;                                   \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3677 
/*
 * vfsqrt: per-element bodies built on float16/32/64_sqrt, and helper
 * entry points for 2-, 4- and 8-byte elements.
 */
RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3684 
3685 /*
3686  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3687  *
3688  * Adapted from riscv-v-spec recip.c:
3689  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3690  */
/*
 * Build the bit pattern of the 7-bit reciprocal square-root estimate.
 *
 * @f:         input bit pattern, laid out as sign | exponent | fraction
 * @exp_size:  width of the exponent field in bits
 * @frac_size: width of the fraction field in bits
 *
 * Returns the estimate packed in the same layout as @f.  NaN, infinity,
 * zero and negative inputs are not treated specially here; the callers in
 * this file filter those out before calling.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * Estimate table, indexed by the exponent's low bit followed by the
     * top six fraction bits.  It has static storage so that the 128
     * entries are not rebuilt on the stack on every call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal: shift left until the top fraction bit
         * is set.  exp is unsigned, so the decrements wrap below zero;
         * the exponent arithmetic below relies on that wrap-around.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* One more shift drops the leading one out of the fraction field. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    int idx = ((exp & 1) << (precision - 1)) |
                (frac >> (frac_size - precision + 1));
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                            (frac_size - precision);
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    /* Reassemble sign | exponent | fraction. */
    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3739 
3740 static float16 frsqrt7_h(float16 f, float_status *s)
3741 {
3742     int exp_size = 5, frac_size = 10;
3743     bool sign = float16_is_neg(f);
3744 
3745     /*
3746      * frsqrt7(sNaN) = canonical NaN
3747      * frsqrt7(-inf) = canonical NaN
3748      * frsqrt7(-normal) = canonical NaN
3749      * frsqrt7(-subnormal) = canonical NaN
3750      */
3751     if (float16_is_signaling_nan(f, s) ||
3752             (float16_is_infinity(f) && sign) ||
3753             (float16_is_normal(f) && sign) ||
3754             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3755         s->float_exception_flags |= float_flag_invalid;
3756         return float16_default_nan(s);
3757     }
3758 
3759     /* frsqrt7(qNaN) = canonical NaN */
3760     if (float16_is_quiet_nan(f, s)) {
3761         return float16_default_nan(s);
3762     }
3763 
3764     /* frsqrt7(+-0) = +-inf */
3765     if (float16_is_zero(f)) {
3766         s->float_exception_flags |= float_flag_divbyzero;
3767         return float16_set_sign(float16_infinity, sign);
3768     }
3769 
3770     /* frsqrt7(+inf) = +0 */
3771     if (float16_is_infinity(f) && !sign) {
3772         return float16_set_sign(float16_zero, sign);
3773     }
3774 
3775     /* +normal, +subnormal */
3776     uint64_t val = frsqrt7(f, exp_size, frac_size);
3777     return make_float16(val);
3778 }
3779 
3780 static float32 frsqrt7_s(float32 f, float_status *s)
3781 {
3782     int exp_size = 8, frac_size = 23;
3783     bool sign = float32_is_neg(f);
3784 
3785     /*
3786      * frsqrt7(sNaN) = canonical NaN
3787      * frsqrt7(-inf) = canonical NaN
3788      * frsqrt7(-normal) = canonical NaN
3789      * frsqrt7(-subnormal) = canonical NaN
3790      */
3791     if (float32_is_signaling_nan(f, s) ||
3792             (float32_is_infinity(f) && sign) ||
3793             (float32_is_normal(f) && sign) ||
3794             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3795         s->float_exception_flags |= float_flag_invalid;
3796         return float32_default_nan(s);
3797     }
3798 
3799     /* frsqrt7(qNaN) = canonical NaN */
3800     if (float32_is_quiet_nan(f, s)) {
3801         return float32_default_nan(s);
3802     }
3803 
3804     /* frsqrt7(+-0) = +-inf */
3805     if (float32_is_zero(f)) {
3806         s->float_exception_flags |= float_flag_divbyzero;
3807         return float32_set_sign(float32_infinity, sign);
3808     }
3809 
3810     /* frsqrt7(+inf) = +0 */
3811     if (float32_is_infinity(f) && !sign) {
3812         return float32_set_sign(float32_zero, sign);
3813     }
3814 
3815     /* +normal, +subnormal */
3816     uint64_t val = frsqrt7(f, exp_size, frac_size);
3817     return make_float32(val);
3818 }
3819 
3820 static float64 frsqrt7_d(float64 f, float_status *s)
3821 {
3822     int exp_size = 11, frac_size = 52;
3823     bool sign = float64_is_neg(f);
3824 
3825     /*
3826      * frsqrt7(sNaN) = canonical NaN
3827      * frsqrt7(-inf) = canonical NaN
3828      * frsqrt7(-normal) = canonical NaN
3829      * frsqrt7(-subnormal) = canonical NaN
3830      */
3831     if (float64_is_signaling_nan(f, s) ||
3832             (float64_is_infinity(f) && sign) ||
3833             (float64_is_normal(f) && sign) ||
3834             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3835         s->float_exception_flags |= float_flag_invalid;
3836         return float64_default_nan(s);
3837     }
3838 
3839     /* frsqrt7(qNaN) = canonical NaN */
3840     if (float64_is_quiet_nan(f, s)) {
3841         return float64_default_nan(s);
3842     }
3843 
3844     /* frsqrt7(+-0) = +-inf */
3845     if (float64_is_zero(f)) {
3846         s->float_exception_flags |= float_flag_divbyzero;
3847         return float64_set_sign(float64_infinity, sign);
3848     }
3849 
3850     /* frsqrt7(+inf) = +0 */
3851     if (float64_is_infinity(f) && !sign) {
3852         return float64_set_sign(float64_zero, sign);
3853     }
3854 
3855     /* +normal, +subnormal */
3856     uint64_t val = frsqrt7(f, exp_size, frac_size);
3857     return make_float64(val);
3858 }
3859 
3860 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3861 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3862 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3863 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3864 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3865 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3866 
3867 /*
3868  * Vector Floating-Point Reciprocal Estimate Instruction
3869  *
3870  * Adapted from riscv-v-spec recip.c:
3871  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3872  */
3873 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3874                       float_status *s)
3875 {
3876     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3877     uint64_t exp = extract64(f, frac_size, exp_size);
3878     uint64_t frac = extract64(f, 0, frac_size);
3879 
3880     const uint8_t lookup_table[] = {
3881         127, 125, 123, 121, 119, 117, 116, 114,
3882         112, 110, 109, 107, 105, 104, 102, 100,
3883         99, 97, 96, 94, 93, 91, 90, 88,
3884         87, 85, 84, 83, 81, 80, 79, 77,
3885         76, 75, 74, 72, 71, 70, 69, 68,
3886         66, 65, 64, 63, 62, 61, 60, 59,
3887         58, 57, 56, 55, 54, 53, 52, 51,
3888         50, 49, 48, 47, 46, 45, 44, 43,
3889         42, 41, 40, 40, 39, 38, 37, 36,
3890         35, 35, 34, 33, 32, 31, 31, 30,
3891         29, 28, 28, 27, 26, 25, 25, 24,
3892         23, 23, 22, 21, 21, 20, 19, 19,
3893         18, 17, 17, 16, 15, 15, 14, 14,
3894         13, 12, 12, 11, 11, 10, 9, 9,
3895         8, 8, 7, 7, 6, 5, 5, 4,
3896         4, 3, 3, 2, 2, 1, 1, 0
3897     };
3898     const int precision = 7;
3899 
3900     if (exp == 0 && frac != 0) { /* subnormal */
3901         /* Normalize the subnormal. */
3902         while (extract64(frac, frac_size - 1, 1) == 0) {
3903             exp--;
3904             frac <<= 1;
3905         }
3906 
3907         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3908 
3909         if (exp != 0 && exp != UINT64_MAX) {
3910             /*
3911              * Overflow to inf or max value of same sign,
3912              * depending on sign and rounding mode.
3913              */
3914             s->float_exception_flags |= (float_flag_inexact |
3915                                          float_flag_overflow);
3916 
3917             if ((s->float_rounding_mode == float_round_to_zero) ||
3918                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3919                 ((s->float_rounding_mode == float_round_up) && sign)) {
3920                 /* Return greatest/negative finite value. */
3921                 return (sign << (exp_size + frac_size)) |
3922                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3923             } else {
3924                 /* Return +-inf. */
3925                 return (sign << (exp_size + frac_size)) |
3926                     MAKE_64BIT_MASK(frac_size, exp_size);
3927             }
3928         }
3929     }
3930 
3931     int idx = frac >> (frac_size - precision);
3932     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3933                             (frac_size - precision);
3934     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3935 
3936     if (out_exp == 0 || out_exp == UINT64_MAX) {
3937         /*
3938          * The result is subnormal, but don't raise the underflow exception,
3939          * because there's no additional loss of precision.
3940          */
3941         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3942         if (out_exp == UINT64_MAX) {
3943             out_frac >>= 1;
3944             out_exp = 0;
3945         }
3946     }
3947 
3948     uint64_t val = 0;
3949     val = deposit64(val, 0, frac_size, out_frac);
3950     val = deposit64(val, frac_size, exp_size, out_exp);
3951     val = deposit64(val, frac_size + exp_size, 1, sign);
3952     return val;
3953 }
3954 
3955 static float16 frec7_h(float16 f, float_status *s)
3956 {
3957     int exp_size = 5, frac_size = 10;
3958     bool sign = float16_is_neg(f);
3959 
3960     /* frec7(+-inf) = +-0 */
3961     if (float16_is_infinity(f)) {
3962         return float16_set_sign(float16_zero, sign);
3963     }
3964 
3965     /* frec7(+-0) = +-inf */
3966     if (float16_is_zero(f)) {
3967         s->float_exception_flags |= float_flag_divbyzero;
3968         return float16_set_sign(float16_infinity, sign);
3969     }
3970 
3971     /* frec7(sNaN) = canonical NaN */
3972     if (float16_is_signaling_nan(f, s)) {
3973         s->float_exception_flags |= float_flag_invalid;
3974         return float16_default_nan(s);
3975     }
3976 
3977     /* frec7(qNaN) = canonical NaN */
3978     if (float16_is_quiet_nan(f, s)) {
3979         return float16_default_nan(s);
3980     }
3981 
3982     /* +-normal, +-subnormal */
3983     uint64_t val = frec7(f, exp_size, frac_size, s);
3984     return make_float16(val);
3985 }
3986 
3987 static float32 frec7_s(float32 f, float_status *s)
3988 {
3989     int exp_size = 8, frac_size = 23;
3990     bool sign = float32_is_neg(f);
3991 
3992     /* frec7(+-inf) = +-0 */
3993     if (float32_is_infinity(f)) {
3994         return float32_set_sign(float32_zero, sign);
3995     }
3996 
3997     /* frec7(+-0) = +-inf */
3998     if (float32_is_zero(f)) {
3999         s->float_exception_flags |= float_flag_divbyzero;
4000         return float32_set_sign(float32_infinity, sign);
4001     }
4002 
4003     /* frec7(sNaN) = canonical NaN */
4004     if (float32_is_signaling_nan(f, s)) {
4005         s->float_exception_flags |= float_flag_invalid;
4006         return float32_default_nan(s);
4007     }
4008 
4009     /* frec7(qNaN) = canonical NaN */
4010     if (float32_is_quiet_nan(f, s)) {
4011         return float32_default_nan(s);
4012     }
4013 
4014     /* +-normal, +-subnormal */
4015     uint64_t val = frec7(f, exp_size, frac_size, s);
4016     return make_float32(val);
4017 }
4018 
4019 static float64 frec7_d(float64 f, float_status *s)
4020 {
4021     int exp_size = 11, frac_size = 52;
4022     bool sign = float64_is_neg(f);
4023 
4024     /* frec7(+-inf) = +-0 */
4025     if (float64_is_infinity(f)) {
4026         return float64_set_sign(float64_zero, sign);
4027     }
4028 
4029     /* frec7(+-0) = +-inf */
4030     if (float64_is_zero(f)) {
4031         s->float_exception_flags |= float_flag_divbyzero;
4032         return float64_set_sign(float64_infinity, sign);
4033     }
4034 
4035     /* frec7(sNaN) = canonical NaN */
4036     if (float64_is_signaling_nan(f, s)) {
4037         s->float_exception_flags |= float_flag_invalid;
4038         return float64_default_nan(s);
4039     }
4040 
4041     /* frec7(qNaN) = canonical NaN */
4042     if (float64_is_quiet_nan(f, s)) {
4043         return float64_default_nan(s);
4044     }
4045 
4046     /* +-normal, +-subnormal */
4047     uint64_t val = frec7(f, exp_size, frac_size, s);
4048     return make_float64(val);
4049 }
4050 
4051 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4052 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4053 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4054 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4055 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4056 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4057 
4058 /* Vector Floating-Point MIN/MAX Instructions */
4059 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4060 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4061 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4062 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4063 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4064 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4065 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4066 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4067 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4068 GEN_VEXT_VF(vfmin_vf_h, 2)
4069 GEN_VEXT_VF(vfmin_vf_w, 4)
4070 GEN_VEXT_VF(vfmin_vf_d, 8)
4071 
4072 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4073 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4074 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4075 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4076 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4077 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4078 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4079 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4080 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4081 GEN_VEXT_VF(vfmax_vf_h, 2)
4082 GEN_VEXT_VF(vfmax_vf_w, 4)
4083 GEN_VEXT_VF(vfmax_vf_d, 8)
4084 
4085 /* Vector Floating-Point Sign-Injection Instructions */
4086 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4087 {
4088     return deposit64(b, 0, 15, a);
4089 }
4090 
4091 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4092 {
4093     return deposit64(b, 0, 31, a);
4094 }
4095 
4096 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4097 {
4098     return deposit64(b, 0, 63, a);
4099 }
4100 
4101 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4102 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4103 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4104 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4105 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4106 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4107 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4108 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4109 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4110 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4111 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4112 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4113 
4114 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4115 {
4116     return deposit64(~b, 0, 15, a);
4117 }
4118 
4119 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4120 {
4121     return deposit64(~b, 0, 31, a);
4122 }
4123 
4124 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4125 {
4126     return deposit64(~b, 0, 63, a);
4127 }
4128 
4129 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4130 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4131 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4132 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4133 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4134 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4135 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4136 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4137 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4138 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4139 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4140 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4141 
4142 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4143 {
4144     return deposit64(b ^ a, 0, 15, a);
4145 }
4146 
4147 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4148 {
4149     return deposit64(b ^ a, 0, 31, a);
4150 }
4151 
4152 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4153 {
4154     return deposit64(b ^ a, 0, 63, a);
4155 }
4156 
4157 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4158 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4159 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4160 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4161 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4162 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4163 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4164 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4165 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4166 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4167 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4168 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4169 
4170 /* Vector Floating-Point Compare Instructions */
4171 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4172 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4173                   CPURISCVState *env, uint32_t desc)          \
4174 {                                                             \
4175     uint32_t vm = vext_vm(desc);                              \
4176     uint32_t vl = env->vl;                                    \
4177     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4178     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4179     uint32_t i;                                               \
4180                                                               \
4181     for (i = env->vstart; i < vl; i++) {                      \
4182         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4183         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4184         if (!vm && !vext_elem_mask(v0, i)) {                  \
4185             continue;                                         \
4186         }                                                     \
4187         vext_set_elem_mask(vd, i,                             \
4188                            DO_OP(s2, s1, &env->fp_status));   \
4189     }                                                         \
4190     env->vstart = 0;                                          \
4191     /* mask destination register are always tail-agnostic */  \
4192     /* set tail elements to 1s */                             \
4193     if (vta_all_1s) {                                         \
4194         for (; i < total_elems; i++) {                        \
4195             vext_set_elem_mask(vd, i, 1);                     \
4196         }                                                     \
4197     }                                                         \
4198 }
4199 
4200 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4201 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4202 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4203 
4204 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4205 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4206                   CPURISCVState *env, uint32_t desc)                \
4207 {                                                                   \
4208     uint32_t vm = vext_vm(desc);                                    \
4209     uint32_t vl = env->vl;                                          \
4210     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4211     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4212     uint32_t i;                                                     \
4213                                                                     \
4214     for (i = env->vstart; i < vl; i++) {                            \
4215         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4216         if (!vm && !vext_elem_mask(v0, i)) {                        \
4217             continue;                                               \
4218         }                                                           \
4219         vext_set_elem_mask(vd, i,                                   \
4220                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4221     }                                                               \
4222     env->vstart = 0;                                                \
4223     /* mask destination register are always tail-agnostic */        \
4224     /* set tail elements to 1s */                                   \
4225     if (vta_all_1s) {                                               \
4226         for (; i < total_elems; i++) {                              \
4227             vext_set_elem_mask(vd, i, 1);                           \
4228         }                                                           \
4229     }                                                               \
4230 }
4231 
4232 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4233 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4234 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4235 
4236 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4237 {
4238     FloatRelation compare = float16_compare_quiet(a, b, s);
4239     return compare != float_relation_equal;
4240 }
4241 
4242 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4243 {
4244     FloatRelation compare = float32_compare_quiet(a, b, s);
4245     return compare != float_relation_equal;
4246 }
4247 
4248 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4249 {
4250     FloatRelation compare = float64_compare_quiet(a, b, s);
4251     return compare != float_relation_equal;
4252 }
4253 
4254 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4255 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4256 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4257 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4258 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4259 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4260 
4261 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4262 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4263 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4264 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4265 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4266 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4267 
4268 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4269 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4270 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4271 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4272 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4273 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4274 
4275 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4276 {
4277     FloatRelation compare = float16_compare(a, b, s);
4278     return compare == float_relation_greater;
4279 }
4280 
4281 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4282 {
4283     FloatRelation compare = float32_compare(a, b, s);
4284     return compare == float_relation_greater;
4285 }
4286 
4287 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4288 {
4289     FloatRelation compare = float64_compare(a, b, s);
4290     return compare == float_relation_greater;
4291 }
4292 
4293 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4294 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4295 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4296 
4297 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4298 {
4299     FloatRelation compare = float16_compare(a, b, s);
4300     return compare == float_relation_greater ||
4301            compare == float_relation_equal;
4302 }
4303 
4304 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4305 {
4306     FloatRelation compare = float32_compare(a, b, s);
4307     return compare == float_relation_greater ||
4308            compare == float_relation_equal;
4309 }
4310 
4311 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4312 {
4313     FloatRelation compare = float64_compare(a, b, s);
4314     return compare == float_relation_greater ||
4315            compare == float_relation_equal;
4316 }
4317 
4318 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4319 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4320 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4321 
4322 /* Vector Floating-Point Classify Instruction */
/*
 * Emit do_NAME(), the per-element kernel of a one-source operation: element
 * i of vs2 is read as T2, held in a TX2 temporary, passed to OP, and the
 * result is stored as TD into element i of vd.  HS2/HD map the logical
 * index to the source/destination slot.
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    T2 *src = (T2 *)vs2 + HS2(i);                      \
    TD *dst = (TD *)vd + HD(i);                        \
    TX2 s2 = *src;                                     \
                                                       \
    *dst = OP(s2);                                     \
}
4329 
/*
 * Emit HELPER(NAME) for a one-source operation whose per-element kernel
 * do_NAME() takes no environment pointer.  do_NAME() runs for every index
 * from env->vstart up to env->vl that is active (vext_vm(desc) non-zero, or
 * mask bit i of v0 set); env->vstart is then cleared and the span from
 * vl * ESZ to total_elems * ESZ is handed to vext_set_elems_1s() together
 * with the vta setting.
 */
#define GEN_VEXT_V(NAME, ESZ)                          \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    const uint32_t unmasked = vext_vm(desc);           \
    const uint32_t body_len = env->vl;                 \
    const uint32_t elem_cnt =                          \
        vext_get_total_elems(env, desc, ESZ);          \
    const uint32_t vta = vext_vta(desc);               \
    uint32_t i = env->vstart;                          \
                                                       \
    while (i < body_len) {                             \
        if (unmasked || vext_elem_mask(v0, i)) {       \
            do_##NAME(vd, vs2, i);                     \
        }                                              \
        i++;                                           \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, vta, body_len * ESZ,         \
                      elem_cnt * ESZ);                 \
}
4352 
4353 target_ulong fclass_h(uint64_t frs1)
4354 {
4355     float16 f = frs1;
4356     bool sign = float16_is_neg(f);
4357 
4358     if (float16_is_infinity(f)) {
4359         return sign ? 1 << 0 : 1 << 7;
4360     } else if (float16_is_zero(f)) {
4361         return sign ? 1 << 3 : 1 << 4;
4362     } else if (float16_is_zero_or_denormal(f)) {
4363         return sign ? 1 << 2 : 1 << 5;
4364     } else if (float16_is_any_nan(f)) {
4365         float_status s = { }; /* for snan_bit_is_one */
4366         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4367     } else {
4368         return sign ? 1 << 1 : 1 << 6;
4369     }
4370 }
4371 
4372 target_ulong fclass_s(uint64_t frs1)
4373 {
4374     float32 f = frs1;
4375     bool sign = float32_is_neg(f);
4376 
4377     if (float32_is_infinity(f)) {
4378         return sign ? 1 << 0 : 1 << 7;
4379     } else if (float32_is_zero(f)) {
4380         return sign ? 1 << 3 : 1 << 4;
4381     } else if (float32_is_zero_or_denormal(f)) {
4382         return sign ? 1 << 2 : 1 << 5;
4383     } else if (float32_is_any_nan(f)) {
4384         float_status s = { }; /* for snan_bit_is_one */
4385         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4386     } else {
4387         return sign ? 1 << 1 : 1 << 6;
4388     }
4389 }
4390 
4391 target_ulong fclass_d(uint64_t frs1)
4392 {
4393     float64 f = frs1;
4394     bool sign = float64_is_neg(f);
4395 
4396     if (float64_is_infinity(f)) {
4397         return sign ? 1 << 0 : 1 << 7;
4398     } else if (float64_is_zero(f)) {
4399         return sign ? 1 << 3 : 1 << 4;
4400     } else if (float64_is_zero_or_denormal(f)) {
4401         return sign ? 1 << 2 : 1 << 5;
4402     } else if (float64_is_any_nan(f)) {
4403         float_status s = { }; /* for snan_bit_is_one */
4404         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4405     } else {
4406         return sign ? 1 << 1 : 1 << 6;
4407     }
4408 }
4409 
4410 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4411 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4412 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4413 GEN_VEXT_V(vfclass_v_h, 2)
4414 GEN_VEXT_V(vfclass_v_w, 4)
4415 GEN_VEXT_V(vfclass_v_d, 8)
4416 
4417 /* Vector Floating-Point Merge Instruction */
4418 
4419 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4420 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4421                   CPURISCVState *env, uint32_t desc)          \
4422 {                                                             \
4423     uint32_t vm = vext_vm(desc);                              \
4424     uint32_t vl = env->vl;                                    \
4425     uint32_t esz = sizeof(ETYPE);                             \
4426     uint32_t total_elems =                                    \
4427         vext_get_total_elems(env, desc, esz);                 \
4428     uint32_t vta = vext_vta(desc);                            \
4429     uint32_t i;                                               \
4430                                                               \
4431     for (i = env->vstart; i < vl; i++) {                      \
4432         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4433         *((ETYPE *)vd + H(i))                                 \
4434           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4435     }                                                         \
4436     env->vstart = 0;                                          \
4437     /* set tail elements to 1s */                             \
4438     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4439 }
4440 
4441 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4442 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4443 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4444 
4445 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4446 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4447 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4448 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4449 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4450 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4451 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4452 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4453 
4454 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4455 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4456 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4457 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4458 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4459 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4460 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4461 
4462 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4463 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4464 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4465 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4466 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4467 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4468 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4469 
4470 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4471 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4472 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4473 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4474 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4475 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4476 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4477 
4478 /* Widening Floating-Point/Integer Type-Convert Instructions */
4479 /* (TD, T2, TX2) */
4480 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4481 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4482 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4483 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4484 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4485 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4486 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4487 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4488 
4489 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4490 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4491 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4492 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4494 
4495 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4496 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4497 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4498 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4499 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4500 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4501 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4502 
4503 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4504 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4505 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4506 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4507 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4508 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4509 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4510 
4511 /*
4512  * vfwcvt.f.f.v vd, vs2, vm
4513  * Convert single-width float to double-width float.
4514  */
4515 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4516 {
4517     return float16_to_float32(a, true, s);
4518 }
4519 
/*
 * vfwcvt.f.f.v instantiations: half -> single goes through the
 * vfwcvtffv16() wrapper, single -> double uses float32_to_float64 directly.
 */
RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4524 
/* Narrowing Floating-Point/Integer Type-Convert Instructions */
/*
 * Type triples in (TD, T2, TX2) order, handed to OPFVV1 through RVVCALL.
 * The first type is half as wide as the second.
 * NOTE(review): NOP_UU_B lists uint32_t as its third type while its second
 * is uint16_t; the _H and _W rows use the same type in both positions.
 * Confirm this asymmetry is intentional.
 */
#define NOP_UU_B uint8_t,  uint16_t, uint32_t
#define NOP_UU_H uint16_t, uint32_t, uint32_t
#define NOP_UU_W uint32_t, uint64_t, uint64_t
/*
 * vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer.
 * Instantiated for half -> uint8 (_b), single -> uint16 (_h) and
 * double -> uint32 (_w).
 */
RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)

/*
 * vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer.
 * Signed counterpart of the group above; it reuses the unsigned NOP_UU_*
 * type triples.
 */
RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4545 
/*
 * vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float.
 * Only _h (uint32 -> half) and _w (uint64 -> single) variants exist here.
 */
RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)

/*
 * vfncvt.f.x.w vd, vs2, vm # Convert double-width integer to float.
 * Signed counterpart: int32 -> half (_h) and int64 -> single (_w).
 */
RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4557 
4558 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4559 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4560 {
4561     return float32_to_float16(a, true, s);
4562 }
4563 
/*
 * vfncvt.f.f.w instantiations: single -> half goes through the
 * vfncvtffv16() wrapper, double -> single uses float64_to_float32 directly.
 */
RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4568 
/*
 *** Vector Reduction Operations
 */
/* Vector Single-Width Integer Reduction Instructions */
/*
 * GEN_VEXT_RED - emit an integer reduction helper called NAME.
 *
 *   TD / HD   : accumulator (destination) type and its index fixup macro
 *   TS2 / HS2 : vs2 element type and its index fixup macro
 *   OP        : two-operand combiner, applied as OP(accumulator, element)
 *
 * The accumulator starts from element 0 of vs1.  Every element of vs2 in
 * [env->vstart, env->vl) that is active (vm set, or its v0 mask bit set) is
 * cast to TD and folded in.  The result is stored to element 0 of vd,
 * env->vstart is cleared, and vext_set_elems_1s() is called for the byte
 * range [sizeof(TD), simd_maxsz(desc)) with the vta setting from desc.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    const uint32_t unmasked = vext_vm(desc);              \
    const uint32_t nelem = env->vl;                       \
    const uint32_t tail_agn = vext_vta(desc);             \
    TD acc = *((TD *)vs1 + HD(0));                        \
    uint32_t idx = env->vstart;                           \
                                                          \
    while (idx < nelem) {                                 \
        if (unmasked || vext_elem_mask(v0, idx)) {        \
            TS2 elem = *((TS2 *)vs2 + HS2(idx));          \
            acc = OP(acc, (TD)elem);                      \
        }                                                 \
        idx++;                                            \
    }                                                     \
    *((TD *)vd + HD(0)) = acc;                            \
    env->vstart = 0;                                      \
    /* everything after element 0 of vd is tail */        \
    vext_set_elems_1s(vd, tail_agn, sizeof(TD),           \
                      simd_maxsz(desc));                  \
}
4597 
/*
 * Instantiations of GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP).  For the
 * single-width reductions TD and TS2 are the same type; the _b/_h/_w/_d
 * suffix selects 8/16/32/64-bit elements.
 */
/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]); unsigned element types make DO_MAX unsigned */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]); same DO_MAX, signed element types */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]); unsigned element types make DO_MIN unsigned */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]); same DO_MIN, signed element types */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4645 
/* Vector Widening Integer Reduction Instructions */
/*
 * Here TD is twice the width of TS2.  GEN_VEXT_RED casts each vs2 element
 * to TD before applying OP, so the extension is a sign extension for the
 * signed instantiations and a zero extension for the unsigned ones.
 */
/* signed sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/* Unsigned sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4656 
/* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * GEN_VEXT_FRED - emit a floating-point reduction helper called NAME.
 *
 * Same shape as the integer reduction generator, except that OP takes a
 * third argument, &env->fp_status.  The accumulator starts from element 0
 * of vs1; each active vs2 element in [env->vstart, env->vl) is combined
 * with it in ascending index order.  The result goes to element 0 of vd,
 * env->vstart is cleared, and vext_set_elems_1s() is called for the byte
 * range [sizeof(TD), simd_maxsz(desc)) with the vta setting from desc.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    const uint32_t unmasked = vext_vm(desc);               \
    const uint32_t nelem = env->vl;                        \
    const uint32_t tail_agn = vext_vta(desc);              \
    TD acc = *((TD *)vs1 + HD(0));                         \
    uint32_t idx = env->vstart;                            \
                                                           \
    while (idx < nelem) {                                  \
        if (unmasked || vext_elem_mask(v0, idx)) {         \
            TS2 elem = *((TS2 *)vs2 + HS2(idx));           \
            acc = OP(acc, (TD)elem, &env->fp_status);      \
        }                                                  \
        idx++;                                             \
    }                                                      \
    *((TD *)vd + HD(0)) = acc;                             \
    env->vstart = 0;                                       \
    /* everything after element 0 of vd is tail */         \
    vext_set_elems_1s(vd, tail_agn, sizeof(TD),            \
                      simd_maxsz(desc));                   \
}
4683 
/*
 * Instantiations of GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP); the
 * _h/_w/_d suffix selects 16/32/64-bit floating-point elements, carried
 * as raw bit patterns in unsigned integer types.
 */
/* Unordered sum */
GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Maximum value */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)

/* Minimum value */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4698 
4699 /* Vector Widening Floating-Point Reduction Instructions */
4700 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4701 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4702                             void *vs2, CPURISCVState *env, uint32_t desc)
4703 {
4704     uint32_t vm = vext_vm(desc);
4705     uint32_t vl = env->vl;
4706     uint32_t esz = sizeof(uint32_t);
4707     uint32_t vlenb = simd_maxsz(desc);
4708     uint32_t vta = vext_vta(desc);
4709     uint32_t i;
4710     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4711 
4712     for (i = env->vstart; i < vl; i++) {
4713         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4714         if (!vm && !vext_elem_mask(v0, i)) {
4715             continue;
4716         }
4717         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4718                          &env->fp_status);
4719     }
4720     *((uint32_t *)vd + H4(0)) = s1;
4721     env->vstart = 0;
4722     /* set tail elements to 1s */
4723     vext_set_elems_1s(vd, vta, esz, vlenb);
4724 }
4725 
4726 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4727                             void *vs2, CPURISCVState *env, uint32_t desc)
4728 {
4729     uint32_t vm = vext_vm(desc);
4730     uint32_t vl = env->vl;
4731     uint32_t esz = sizeof(uint64_t);
4732     uint32_t vlenb = simd_maxsz(desc);
4733     uint32_t vta = vext_vta(desc);
4734     uint32_t i;
4735     uint64_t s1 =  *((uint64_t *)vs1);
4736 
4737     for (i = env->vstart; i < vl; i++) {
4738         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4739         if (!vm && !vext_elem_mask(v0, i)) {
4740             continue;
4741         }
4742         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4743                          &env->fp_status);
4744     }
4745     *((uint64_t *)vd) = s1;
4746     env->vstart = 0;
4747     /* set tail elements to 1s */
4748     vext_set_elems_1s(vd, vta, esz, vlenb);
4749 }
4750 
/*
 *** Vector Mask Operations
 */
/* Vector Mask-Register Logical Instructions */
/*
 * GEN_VEXT_MASK_VV - emit a mask-register logical helper called NAME.
 *
 * For every bit position in [env->vstart, env->vl) the destination bit is
 * OP(vs2 bit, vs1 bit) -- note the operand order.  v0 is accepted but not
 * consulted.  After the loop env->vstart is cleared and, when
 * vext_vta_all_1s(desc) is non-zero, the remaining bits up to cfg.vlen are
 * set to 1 (mask destinations are always treated as tail-agnostic).
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    const uint32_t nbits = env->vl;                       \
    const uint32_t reg_bits = env_archcpu(env)->cfg.vlen; \
    const uint32_t fill_tail = vext_vta_all_1s(desc);     \
    uint32_t pos = env->vstart;                           \
                                                          \
    while (pos < nbits) {                                 \
        int rhs = vext_elem_mask(vs1, pos);               \
        int lhs = vext_elem_mask(vs2, pos);               \
        vext_set_elem_mask(vd, pos, OP(lhs, rhs));        \
        pos++;                                            \
    }                                                     \
    env->vstart = 0;                                      \
    /* tail bits: only touched when all-1s fill is on */  \
    if (fill_tail) {                                      \
        while (pos < reg_bits) {                          \
            vext_set_elem_mask(vd, pos, 1);               \
            pos++;                                        \
        }                                                 \
    }                                                     \
}
4782 
/*
 * Single-bit logical combiners used by the mask-register helpers.
 * N and M are expected to be 0 or 1; the result is 0 or 1.
 * Every argument is parenthesized so that compound expressions passed as
 * N or M cannot be regrouped by operator precedence.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4788 
/*
 * Mask-register logical helpers.  GEN_VEXT_MASK_VV evaluates OP(vs2 bit,
 * vs1 bit), so for the asymmetric operations the vs1 bit is the one that
 * gets inverted: vmandn is vs2 & !vs1 and vmorn is vs2 | !vs1.
 */
GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4797 
4798 /* Vector count population in mask vcpop */
4799 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4800                              uint32_t desc)
4801 {
4802     target_ulong cnt = 0;
4803     uint32_t vm = vext_vm(desc);
4804     uint32_t vl = env->vl;
4805     int i;
4806 
4807     for (i = env->vstart; i < vl; i++) {
4808         if (vm || vext_elem_mask(v0, i)) {
4809             if (vext_elem_mask(vs2, i)) {
4810                 cnt++;
4811             }
4812         }
4813     }
4814     env->vstart = 0;
4815     return cnt;
4816 }
4817 
4818 /* vfirst find-first-set mask bit*/
4819 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4820                               uint32_t desc)
4821 {
4822     uint32_t vm = vext_vm(desc);
4823     uint32_t vl = env->vl;
4824     int i;
4825 
4826     for (i = env->vstart; i < vl; i++) {
4827         if (vm || vext_elem_mask(v0, i)) {
4828             if (vext_elem_mask(vs2, i)) {
4829                 return i;
4830             }
4831         }
4832     }
4833     env->vstart = 0;
4834     return -1LL;
4835 }
4836 
/*
 * Selects which mask-setting flavour vmsetm() implements, relative to the
 * first active element whose vs2 bit is set:
 *   ONLY_FIRST    - only that element gets 1
 *   INCLUDE_FIRST - that element and the active ones before it get 1
 *   BEFORE_FIRST  - only the active elements before it get 1
 */
enum set_mask_type {
    ONLY_FIRST    = 1,
    INCLUDE_FIRST = 2,
    BEFORE_FIRST  = 3,
};
4842 
4843 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4844                    uint32_t desc, enum set_mask_type type)
4845 {
4846     uint32_t vm = vext_vm(desc);
4847     uint32_t vl = env->vl;
4848     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4849     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4850     int i;
4851     bool first_mask_bit = false;
4852 
4853     for (i = env->vstart; i < vl; i++) {
4854         if (!vm && !vext_elem_mask(v0, i)) {
4855             continue;
4856         }
4857         /* write a zero to all following active elements */
4858         if (first_mask_bit) {
4859             vext_set_elem_mask(vd, i, 0);
4860             continue;
4861         }
4862         if (vext_elem_mask(vs2, i)) {
4863             first_mask_bit = true;
4864             if (type == BEFORE_FIRST) {
4865                 vext_set_elem_mask(vd, i, 0);
4866             } else {
4867                 vext_set_elem_mask(vd, i, 1);
4868             }
4869         } else {
4870             if (type == ONLY_FIRST) {
4871                 vext_set_elem_mask(vd, i, 0);
4872             } else {
4873                 vext_set_elem_mask(vd, i, 1);
4874             }
4875         }
4876     }
4877     env->vstart = 0;
4878     /* mask destination register are always tail-agnostic */
4879     /* set tail elements to 1s */
4880     if (vta_all_1s) {
4881         for (; i < total_elems; i++) {
4882             vext_set_elem_mask(vd, i, 1);
4883         }
4884     }
4885 }
4886 
/*
 * vmsbf.m helper: forwards to vmsetm() with BEFORE_FIRST, i.e. active
 * elements before the first set vs2 bit receive 1, that element and the
 * active ones after it receive 0.
 */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4892 
/*
 * vmsif.m helper: forwards to vmsetm() with INCLUDE_FIRST, i.e. active
 * elements up to and including the first set vs2 bit receive 1, the
 * active ones after it receive 0.
 */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4898 
/*
 * vmsof.m helper: forwards to vmsetm() with ONLY_FIRST, i.e. only the
 * first active element whose vs2 bit is set receives 1, every other
 * active element receives 0.
 */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4904 
4905 /* Vector Iota Instruction */
4906 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4907 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4908                   uint32_t desc)                                          \
4909 {                                                                         \
4910     uint32_t vm = vext_vm(desc);                                          \
4911     uint32_t vl = env->vl;                                                \
4912     uint32_t esz = sizeof(ETYPE);                                         \
4913     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4914     uint32_t vta = vext_vta(desc);                                        \
4915     uint32_t sum = 0;                                                     \
4916     int i;                                                                \
4917                                                                           \
4918     for (i = env->vstart; i < vl; i++) {                                  \
4919         if (!vm && !vext_elem_mask(v0, i)) {                              \
4920             continue;                                                     \
4921         }                                                                 \
4922         *((ETYPE *)vd + H(i)) = sum;                                      \
4923         if (vext_elem_mask(vs2, i)) {                                     \
4924             sum++;                                                        \
4925         }                                                                 \
4926     }                                                                     \
4927     env->vstart = 0;                                                      \
4928     /* set tail elements to 1s */                                         \
4929     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4930 }
4931 
4932 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4933 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4934 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4935 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4936 
4937 /* Vector Element Index Instruction */
4938 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4939 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4940 {                                                                         \
4941     uint32_t vm = vext_vm(desc);                                          \
4942     uint32_t vl = env->vl;                                                \
4943     uint32_t esz = sizeof(ETYPE);                                         \
4944     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4945     uint32_t vta = vext_vta(desc);                                        \
4946     int i;                                                                \
4947                                                                           \
4948     for (i = env->vstart; i < vl; i++) {                                  \
4949         if (!vm && !vext_elem_mask(v0, i)) {                              \
4950             continue;                                                     \
4951         }                                                                 \
4952         *((ETYPE *)vd + H(i)) = i;                                        \
4953     }                                                                     \
4954     env->vstart = 0;                                                      \
4955     /* set tail elements to 1s */                                         \
4956     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4957 }
4958 
4959 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4960 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4961 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4962 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4963 
4964 /*
4965  *** Vector Permutation Instructions
4966  */
4967 
4968 /* Vector Slide Instructions */
4969 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4970 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4971                   CPURISCVState *env, uint32_t desc)                      \
4972 {                                                                         \
4973     uint32_t vm = vext_vm(desc);                                          \
4974     uint32_t vl = env->vl;                                                \
4975     uint32_t esz = sizeof(ETYPE);                                         \
4976     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4977     uint32_t vta = vext_vta(desc);                                        \
4978     target_ulong offset = s1, i_min, i;                                   \
4979                                                                           \
4980     i_min = MAX(env->vstart, offset);                                     \
4981     for (i = i_min; i < vl; i++) {                                        \
4982         if (!vm && !vext_elem_mask(v0, i)) {                              \
4983             continue;                                                     \
4984         }                                                                 \
4985         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4986     }                                                                     \
4987     /* set tail elements to 1s */                                         \
4988     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4989 }
4990 
4991 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4992 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4993 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4994 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4995 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4996 
4997 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4998 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4999                   CPURISCVState *env, uint32_t desc)                      \
5000 {                                                                         \
5001     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5002     uint32_t vm = vext_vm(desc);                                          \
5003     uint32_t vl = env->vl;                                                \
5004     uint32_t esz = sizeof(ETYPE);                                         \
5005     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5006     uint32_t vta = vext_vta(desc);                                        \
5007     target_ulong i_max, i;                                                \
5008                                                                           \
5009     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5010     for (i = env->vstart; i < i_max; ++i) {                               \
5011         if (vm || vext_elem_mask(v0, i)) {                                \
5012             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
5013         }                                                                 \
5014     }                                                                     \
5015                                                                           \
5016     for (i = i_max; i < vl; ++i) {                                        \
5017         if (vm || vext_elem_mask(v0, i)) {                                \
5018             *((ETYPE *)vd + H(i)) = 0;                                    \
5019         }                                                                 \
5020     }                                                                     \
5021                                                                           \
5022     env->vstart = 0;                                                      \
5023     /* set tail elements to 1s */                                         \
5024     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5025 }
5026 
5027 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5028 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5029 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5030 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5031 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5032 
5033 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5034 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5035                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5036 {                                                                           \
5037     typedef uint##BITWIDTH##_t ETYPE;                                       \
5038     uint32_t vm = vext_vm(desc);                                            \
5039     uint32_t vl = env->vl;                                                  \
5040     uint32_t esz = sizeof(ETYPE);                                           \
5041     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5042     uint32_t vta = vext_vta(desc);                                          \
5043     uint32_t i;                                                             \
5044                                                                             \
5045     for (i = env->vstart; i < vl; i++) {                                    \
5046         if (!vm && !vext_elem_mask(v0, i)) {                                \
5047             continue;                                                       \
5048         }                                                                   \
5049         if (i == 0) {                                                       \
5050             *((ETYPE *)vd + H(i)) = s1;                                     \
5051         } else {                                                            \
5052             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5053         }                                                                   \
5054     }                                                                       \
5055     env->vstart = 0;                                                        \
5056     /* set tail elements to 1s */                                           \
5057     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5058 }
5059 
5060 GEN_VEXT_VSLIE1UP(8,  H1)
5061 GEN_VEXT_VSLIE1UP(16, H2)
5062 GEN_VEXT_VSLIE1UP(32, H4)
5063 GEN_VEXT_VSLIE1UP(64, H8)
5064 
/*
 * GEN_VEXT_VSLIDE1UP_VX - emit the integer slide1up helper NAME.
 * It takes the scalar as target_ulong and forwards every argument
 * unchanged to the static worker vslide1up_<BITWIDTH>().
 */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5077 
5078 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5079 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5080                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5081 {                                                                             \
5082     typedef uint##BITWIDTH##_t ETYPE;                                         \
5083     uint32_t vm = vext_vm(desc);                                              \
5084     uint32_t vl = env->vl;                                                    \
5085     uint32_t esz = sizeof(ETYPE);                                             \
5086     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5087     uint32_t vta = vext_vta(desc);                                            \
5088     uint32_t i;                                                               \
5089                                                                               \
5090     for (i = env->vstart; i < vl; i++) {                                      \
5091         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5092             continue;                                                         \
5093         }                                                                     \
5094         if (i == vl - 1) {                                                    \
5095             *((ETYPE *)vd + H(i)) = s1;                                       \
5096         } else {                                                              \
5097             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5098         }                                                                     \
5099     }                                                                         \
5100     env->vstart = 0;                                                          \
5101     /* set tail elements to 1s */                                             \
5102     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5103 }
5104 
5105 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5106 GEN_VEXT_VSLIDE1DOWN(16, H2)
5107 GEN_VEXT_VSLIDE1DOWN(32, H4)
5108 GEN_VEXT_VSLIDE1DOWN(64, H8)
5109 
/*
 * GEN_VEXT_VSLIDE1DOWN_VX - emit the integer slide1down helper NAME.
 * It takes the scalar as target_ulong and forwards every argument
 * unchanged to the static worker vslide1down_<BITWIDTH>().
 */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5122 
/* Vector Floating-Point Slide Instructions */
/*
 * GEN_VEXT_VFSLIDE1UP_VF - emit the floating-point slide1up helper NAME.
 * The scalar arrives as a uint64_t and is passed on to the same static
 * worker vslide1up_<BITWIDTH>() that the integer variant uses; at that
 * call it is converted to the worker's s1 parameter type.
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5135 
5136 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5137 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5138                   CPURISCVState *env, uint32_t desc)          \
5139 {                                                             \
5140     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5141 }
5142 
5143 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5144 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5145 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5146 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5147 
5148 /* Vector Register Gather Instruction */
5149 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5150 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5151                   CPURISCVState *env, uint32_t desc)                      \
5152 {                                                                         \
5153     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5154     uint32_t vm = vext_vm(desc);                                          \
5155     uint32_t vl = env->vl;                                                \
5156     uint32_t esz = sizeof(TS2);                                           \
5157     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5158     uint32_t vta = vext_vta(desc);                                        \
5159     uint64_t index;                                                       \
5160     uint32_t i;                                                           \
5161                                                                           \
5162     for (i = env->vstart; i < vl; i++) {                                  \
5163         if (!vm && !vext_elem_mask(v0, i)) {                              \
5164             continue;                                                     \
5165         }                                                                 \
5166         index = *((TS1 *)vs1 + HS1(i));                                   \
5167         if (index >= vlmax) {                                             \
5168             *((TS2 *)vd + HS2(i)) = 0;                                    \
5169         } else {                                                          \
5170             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5171         }                                                                 \
5172     }                                                                     \
5173     env->vstart = 0;                                                      \
5174     /* set tail elements to 1s */                                         \
5175     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5176 }
5177 
5178 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5179 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5180 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5181 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5182 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5183 
5184 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5185 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5186 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5187 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5188 
5189 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5190 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5191                   CPURISCVState *env, uint32_t desc)                      \
5192 {                                                                         \
5193     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5194     uint32_t vm = vext_vm(desc);                                          \
5195     uint32_t vl = env->vl;                                                \
5196     uint32_t esz = sizeof(ETYPE);                                         \
5197     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5198     uint32_t vta = vext_vta(desc);                                        \
5199     uint64_t index = s1;                                                  \
5200     uint32_t i;                                                           \
5201                                                                           \
5202     for (i = env->vstart; i < vl; i++) {                                  \
5203         if (!vm && !vext_elem_mask(v0, i)) {                              \
5204             continue;                                                     \
5205         }                                                                 \
5206         if (index >= vlmax) {                                             \
5207             *((ETYPE *)vd + H(i)) = 0;                                    \
5208         } else {                                                          \
5209             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5210         }                                                                 \
5211     }                                                                     \
5212     env->vstart = 0;                                                      \
5213     /* set tail elements to 1s */                                         \
5214     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5215 }
5216 
5217 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5218 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5219 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5220 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5221 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5222 
5223 /* Vector Compress Instruction */
5224 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5225 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5226                   CPURISCVState *env, uint32_t desc)                      \
5227 {                                                                         \
5228     uint32_t vl = env->vl;                                                \
5229     uint32_t esz = sizeof(ETYPE);                                         \
5230     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5231     uint32_t vta = vext_vta(desc);                                        \
5232     uint32_t num = 0, i;                                                  \
5233                                                                           \
5234     for (i = env->vstart; i < vl; i++) {                                  \
5235         if (!vext_elem_mask(vs1, i)) {                                    \
5236             continue;                                                     \
5237         }                                                                 \
5238         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5239         num++;                                                            \
5240     }                                                                     \
5241     env->vstart = 0;                                                      \
5242     /* set tail elements to 1s */                                         \
5243     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5244 }
5245 
5246 /* Compress into vd elements of vs2 where vs1 is enabled */
5247 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5248 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5249 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5250 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5251 
5252 /* Vector Whole Register Move */
5253 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5254 {
5255     /* EEW = SEW */
5256     uint32_t maxsz = simd_maxsz(desc);
5257     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5258     uint32_t startb = env->vstart * sewb;
5259     uint32_t i = startb;
5260 
5261     memcpy((uint8_t *)vd + H1(i),
5262            (uint8_t *)vs2 + H1(i),
5263            maxsz - startb);
5264 
5265     env->vstart = 0;
5266 }
5267 
5268 /* Vector Integer Extension */
5269 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5270 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5271                   CPURISCVState *env, uint32_t desc)             \
5272 {                                                                \
5273     uint32_t vl = env->vl;                                       \
5274     uint32_t vm = vext_vm(desc);                                 \
5275     uint32_t esz = sizeof(ETYPE);                                \
5276     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5277     uint32_t vta = vext_vta(desc);                               \
5278     uint32_t i;                                                  \
5279                                                                  \
5280     for (i = env->vstart; i < vl; i++) {                         \
5281         if (!vm && !vext_elem_mask(v0, i)) {                     \
5282             continue;                                            \
5283         }                                                        \
5284         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5285     }                                                            \
5286     env->vstart = 0;                                             \
5287     /* set tail elements to 1s */                                \
5288     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5289 }
5290 
5291 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5292 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5293 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5294 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5295 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5296 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5297 
5298 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5299 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5300 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5301 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5302 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5303 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5304