xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 72e17a9f)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
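
/*
 * Worked example (illustrative numbers): with VLEN = 128 and a vtype
 * requesting SEW = 32, LMUL = 2, vext_get_vlmax() yields
 * VLMAX = VLEN / SEW * LMUL = 8.  A requested AVL (s1) of 10 is then
 * clamped to vl = 8, while an AVL of 5 is granted in full.
 */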
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that need a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
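
/*
 * For example, on a big-endian host the least-significant byte of a
 * 64-bit chunk (vector element 0 for SEW = 8) lives at byte offset 7,
 * which is H1(0) = 0 ^ 7.  The same XOR maps halfword 0 to offset 3
 * (H2) and word 0 to offset 1 (H4); H8 is the identity because a
 * 64-bit element fills the whole chunk.
 */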
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134 
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139 
140 /*
141  * Get the maximum number of elements that can be operated on.
142  *
143  * log2_esz: log2 of element size in bytes.
144  */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147     /*
148      * As simd_desc supports at most 2048 bytes, the maximum VLEN is 1024 bits,
149      * so the vector length in bytes (vlenb) is encoded as maxsz.
150      */
151     uint32_t vlenb = simd_maxsz(desc);
152 
153     /* Return VLMAX */
154     int scale = vext_lmul(desc) - log2_esz;
155     return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
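
/*
 * For example, assuming VLEN = 128 (vlenb = 16): an access with
 * SEW = 32 (log2_esz = 2) under LMUL = 2 (vext_lmul() = 1) gives
 * scale = 1 - 2 = -1, hence VLMAX = 16 >> 1 = 8, which matches
 * VLEN / SEW * LMUL.
 */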
157 
158 /*
159  * Get the total number of elements, including prestart, body and tail elements.
160  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161  * are held in the same vector register.
162  */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164                                             uint32_t esz)
165 {
166     uint32_t vlenb = simd_maxsz(desc);
167     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170     return (vlenb << emul) / esz;
171 }
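
/*
 * For example, with VLEN = 128 (vlenb = 16), SEW = 32 and LMUL = 1/2,
 * emul evaluates to 2 - 2 - 1 = -1 and is clamped to 0, so the result
 * is 16 / 4 = 4 elements: the whole single register, even though
 * VLMAX is only 2.  This is the "tail past VLMAX" case noted above.
 */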
172 
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175     return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177 
178 /*
179  * This function checks the watchpoint before the real load operation.
180  *
181  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
182  * In user mode, there is no watchpoint support now.
183  *
184  * It will trigger an exception if there is no mapping in the TLB
185  * and the page table walk can't fill the TLB entry. The guest
186  * software can then return here after processing the exception, or never return.
187  */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189                         target_ulong len, uintptr_t ra,
190                         MMUAccessType access_type)
191 {
192     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193     target_ulong curlen = MIN(pagelen, len);
194 
195     probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                  cpu_mmu_index(env, false), ra);
197     if (len > curlen) {
198         addr += curlen;
199         curlen = len - curlen;
200         probe_access(env, adjust_addr(env, addr), curlen, access_type,
201                      cpu_mmu_index(env, false), ra);
202     }
203 }
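
/*
 * For example, with 4 KiB pages, probing addr = 0x1ff8 for len = 16
 * issues two probe_access() calls: one for the 8 bytes up to the page
 * boundary at 0x2000 and one for the remaining 8 bytes on the next
 * page, so both pages are checked up front.
 */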
204 
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207                               uint32_t tot)
208 {
209     if (is_agnostic == 0) {
210         /* policy undisturbed */
211         return;
212     }
213     if (tot - cnt == 0) {
214         return;
215     }
216     memset(base + cnt, -1, tot - cnt);
217 }
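
/*
 * For example, with 32-bit elements, vl = 5 and a register group of
 * 8 elements, a tail-agnostic caller passes cnt = 5 * 4 and
 * tot = 8 * 4, so bytes 20..31 of vd are overwritten with all-ones;
 * with the undisturbed policy (is_agnostic == 0) the old bytes are
 * kept.
 */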
218 
219 static inline void vext_set_elem_mask(void *v0, int index,
220                                       uint8_t value)
221 {
222     int idx = index / 64;
223     int pos = index % 64;
224     uint64_t old = ((uint64_t *)v0)[idx];
225     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227 
228 /*
229  * Earlier designs (pre-0.9) had a varying number of bits
230  * per mask value (MLEN). In the 0.9 design, MLEN=1.
231  * (Section 4.5)
232  */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235     int idx = index / 64;
236     int pos = index  % 64;
237     return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
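
/*
 * For example, the mask bit for element 70 is bit 6 of the second
 * host 64-bit word of v0 (idx = 70 / 64 = 1, pos = 70 % 64 = 6).
 */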
239 
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242                                uint32_t idx, void *vd, uintptr_t retaddr);
243 
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
245 static void NAME(CPURISCVState *env, abi_ptr addr,         \
246                  uint32_t idx, void *vd, uintptr_t retaddr)\
247 {                                                          \
248     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
249     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
250 }
251 
252 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256 
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
258 static void NAME(CPURISCVState *env, abi_ptr addr,         \
259                  uint32_t idx, void *vd, uintptr_t retaddr)\
260 {                                                          \
261     ETYPE data = *((ETYPE *)vd + H(idx));                  \
262     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
263 }
264 
265 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269 
270 /*
271  *** stride: access vector element from strided memory
272  */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275                  target_ulong stride, CPURISCVState *env,
276                  uint32_t desc, uint32_t vm,
277                  vext_ldst_elem_fn *ldst_elem,
278                  uint32_t log2_esz, uintptr_t ra)
279 {
280     uint32_t i, k;
281     uint32_t nf = vext_nf(desc);
282     uint32_t max_elems = vext_max_elems(desc, log2_esz);
283     uint32_t esz = 1 << log2_esz;
284     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285     uint32_t vta = vext_vta(desc);
286     uint32_t vma = vext_vma(desc);
287 
288     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289         k = 0;
290         while (k < nf) {
291             if (!vm && !vext_elem_mask(v0, i)) {
292                 /* set masked-off elements to 1s */
293                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294                                   (i + k * max_elems + 1) * esz);
295                 k++;
296                 continue;
297             }
298             target_ulong addr = base + stride * i + (k << log2_esz);
299             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300             k++;
301         }
302     }
303     env->vstart = 0;
304     /* set tail elements to 1s */
305     for (k = 0; k < nf; ++k) {
306         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307                           (k * max_elems + max_elems) * esz);
308     }
309     if (nf * max_elems % total_elems != 0) {
310         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311         uint32_t registers_used =
312             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314                           registers_used * vlenb);
315     }
316 }
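
/*
 * For example (illustrative values), a strided 32-bit segment load
 * with nf = 2 and stride = 64 fetches field k of segment i from
 * base + 64 * i + 4 * k and writes it to element index
 * i + k * max_elems, so each field lands in its own register group of
 * vd.
 */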
317 
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
319 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
320                   target_ulong stride, CPURISCVState *env,              \
321                   uint32_t desc)                                        \
322 {                                                                       \
323     uint32_t vm = vext_vm(desc);                                        \
324     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
325                      ctzl(sizeof(ETYPE)), GETPC());                     \
326 }
327 
328 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332 
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
335                   target_ulong stride, CPURISCVState *env,              \
336                   uint32_t desc)                                        \
337 {                                                                       \
338     uint32_t vm = vext_vm(desc);                                        \
339     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
340                      ctzl(sizeof(ETYPE)), GETPC());                     \
341 }
342 
343 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347 
348 /*
349  *** unit-stride: access elements stored contiguously in memory
350  */
351 
352 /* unmasked unit-stride load and store operation */
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356              uintptr_t ra)
357 {
358     uint32_t i, k;
359     uint32_t nf = vext_nf(desc);
360     uint32_t max_elems = vext_max_elems(desc, log2_esz);
361     uint32_t esz = 1 << log2_esz;
362     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363     uint32_t vta = vext_vta(desc);
364 
365     /* load/store elements from/to guest memory */
366     for (i = env->vstart; i < evl; i++, env->vstart++) {
367         k = 0;
368         while (k < nf) {
369             target_ulong addr = base + ((i * nf + k) << log2_esz);
370             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371             k++;
372         }
373     }
374     env->vstart = 0;
375     /* set tail elements to 1s */
376     for (k = 0; k < nf; ++k) {
377         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378                           (k * max_elems + max_elems) * esz);
379     }
380     if (nf * max_elems % total_elems != 0) {
381         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382         uint32_t registers_used =
383             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385                           registers_used * vlenb);
386     }
387 }
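
/*
 * For example, a unit-stride 32-bit segment access with nf = 3 uses
 * address base + (3 * i + k) * 4 for field k of segment i: the three
 * fields of a segment are contiguous in memory but are de-interleaved
 * into three separate register groups of vd (index i + k * max_elems).
 */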
388 
389 /*
390  * A masked unit-stride load or store operation is a special case of a strided
391  * operation with stride = NF * sizeof(ETYPE).
392  */
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  *** unit stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
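
/*
 * For example, with vl = 17 the mask occupies ceil(17 / 8) = 3 bytes,
 * so evl = 3 and exactly three bytes are transferred with EEW = 8
 * (lde_b/ste_b), independent of the SEW currently set in vtype.
 */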
456 
457 /*
458  *** index: access vector element from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
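
/*
 * For example, with 16-bit indices (idx_h) the address of element i is
 * base + (uint16_t)offset[i] taken from vs2; vext_ldst_index() below
 * additionally adds (k << log2_esz) for field k of a segment access.
 */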
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488     uint32_t vta = vext_vta(desc);
489     uint32_t vma = vext_vma(desc);
490 
491     /* load/store elements from/to guest memory */
492     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493         k = 0;
494         while (k < nf) {
495             if (!vm && !vext_elem_mask(v0, i)) {
496                 /* set masked-off elements to 1s */
497                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498                                   (i + k * max_elems + 1) * esz);
499                 k++;
500                 continue;
501             }
502             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504             k++;
505         }
506     }
507     env->vstart = 0;
508     /* set tail elements to 1s */
509     for (k = 0; k < nf; ++k) {
510         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511                           (k * max_elems + max_elems) * esz);
512     }
513     if (nf * max_elems % total_elems != 0) {
514         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515         uint32_t registers_used =
516             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518                           registers_used * vlenb);
519     }
520 }
521 
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
524                   void *vs2, CPURISCVState *env, uint32_t desc)            \
525 {                                                                          \
526     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
527                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
528 }
529 
530 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546 
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
549                   void *vs2, CPURISCVState *env, uint32_t desc)  \
550 {                                                                \
551     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
552                     STORE_FN, ctzl(sizeof(ETYPE)),               \
553                     GETPC());                                    \
554 }
555 
556 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572 
573 /*
574  *** unit-stride fault-only-first load instructions
575  */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578           CPURISCVState *env, uint32_t desc,
579           vext_ldst_elem_fn *ldst_elem,
580           uint32_t log2_esz, uintptr_t ra)
581 {
582     void *host;
583     uint32_t i, k, vl = 0;
584     uint32_t nf = vext_nf(desc);
585     uint32_t vm = vext_vm(desc);
586     uint32_t max_elems = vext_max_elems(desc, log2_esz);
587     uint32_t esz = 1 << log2_esz;
588     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589     uint32_t vta = vext_vta(desc);
590     uint32_t vma = vext_vma(desc);
591     target_ulong addr, offset, remain;
592 
593     /* probe every access */
594     for (i = env->vstart; i < env->vl; i++) {
595         if (!vm && !vext_elem_mask(v0, i)) {
596             continue;
597         }
598         addr = adjust_addr(env, base + i * (nf << log2_esz));
599         if (i == 0) {
600             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601         } else {
602             /* if it triggers an exception, no need to check watchpoint */
603             remain = nf << log2_esz;
604             while (remain > 0) {
605                 offset = -(addr | TARGET_PAGE_MASK);
606                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607                                          cpu_mmu_index(env, false));
608                 if (host) {
609 #ifdef CONFIG_USER_ONLY
610                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
611                         vl = i;
612                         goto ProbeSuccess;
613                     }
614 #else
615                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617                 } else {
618                     vl = i;
619                     goto ProbeSuccess;
620                 }
621                 if (remain <= offset) {
622                     break;
623                 }
624                 remain -= offset;
625                 addr = adjust_addr(env, addr + offset);
626             }
627         }
628     }
629 ProbeSuccess:
630     /* load bytes from guest memory */
631     if (vl != 0) {
632         env->vl = vl;
633     }
634     for (i = env->vstart; i < env->vl; i++) {
635         k = 0;
636         while (k < nf) {
637             if (!vm && !vext_elem_mask(v0, i)) {
638                 /* set masked-off elements to 1s */
639                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640                                   (i + k * max_elems + 1) * esz);
641                 k++;
642                 continue;
643             }
644             target_ulong addr = base + ((i * nf + k) << log2_esz);
645             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646             k++;
647         }
648     }
649     env->vstart = 0;
650     /* set tail elements to 1s */
651     for (k = 0; k < nf; ++k) {
652         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653                           (k * max_elems + max_elems) * esz);
654     }
655     if (nf * max_elems % total_elems != 0) {
656         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657         uint32_t registers_used =
658             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660                           registers_used * vlenb);
661     }
662 }
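
/*
 * For example, executing vle32ff.v with vl = 8 when active element 5
 * touches an unmapped page truncates env->vl to 5 and loads only
 * elements 0..4; a fault on element 0 is still taken normally through
 * probe_pages().
 */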
663 
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
666                   CPURISCVState *env, uint32_t desc)      \
667 {                                                         \
668     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
669               ctzl(sizeof(ETYPE)), GETPC());              \
670 }
671 
672 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676 
677 #define DO_SWAP(N, M) (M)
678 #define DO_AND(N, M)  (N & M)
679 #define DO_XOR(N, M)  (N ^ M)
680 #define DO_OR(N, M)   (N | M)
681 #define DO_ADD(N, M)  (N + M)
682 
683 /* Signed min/max */
684 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
685 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
686 
687 /* Unsigned min/max */
688 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
689 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
690 
691 /*
692  *** load and store whole register instructions
693  */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698     uint32_t i, k, off, pos;
699     uint32_t nf = vext_nf(desc);
700     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701     uint32_t max_elems = vlenb >> log2_esz;
702 
703     k = env->vstart / max_elems;
704     off = env->vstart % max_elems;
705 
706     if (off) {
707         /* load/store the rest of the current segment pointed to by vstart */
708         for (pos = off; pos < max_elems; pos++, env->vstart++) {
709             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711         }
712         k++;
713     }
714 
715     /* load/store elements for rest of segments */
716     for (; k < nf; k++) {
717         for (i = 0; i < max_elems; i++, env->vstart++) {
718             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720         }
721     }
722 
723     env->vstart = 0;
724 }
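
/*
 * For example, vl2re32.v with VLEN = 128 has max_elems = 4 and nf = 2;
 * if the access was interrupted at vstart = 6 it resumes at element
 * off = 2 of register k = 1 and finishes the remaining two elements.
 */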
725 
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
727 void HELPER(NAME)(void *vd, target_ulong base,       \
728                   CPURISCVState *env, uint32_t desc) \
729 {                                                    \
730     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
731                     ctzl(sizeof(ETYPE)), GETPC());   \
732 }
733 
734 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750 
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
752 void HELPER(NAME)(void *vd, target_ulong base,       \
753                   CPURISCVState *env, uint32_t desc) \
754 {                                                    \
755     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
756                     ctzl(sizeof(ETYPE)), GETPC());   \
757 }
758 
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763 
764 /*
765  *** Vector Integer Arithmetic Instructions
766  */
767 
768 /* expand macro args before macro */
769 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
770 
771 /* (TD, T1, T2, TX1, TX2) */
772 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
773 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
774 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
775 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
776 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
777 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
778 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
779 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
780 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
781 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
782 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
783 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
784 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
785 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
786 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
787 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
788 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
789 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
790 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
791 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
792 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
793 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
794 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
795 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
796 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
797 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
798 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
799 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
800 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
801 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802 
803 /* operation of two vector elements */
804 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
805 
806 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
807 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
808 {                                                               \
809     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
810     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
811     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
812 }
813 #define DO_SUB(N, M) (N - M)
814 #define DO_RSUB(N, M) (M - N)
815 
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
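
/*
 * For reference, RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
 * expands to roughly:
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = s2 + s1;
 *   }
 */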
824 
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826                        CPURISCVState *env, uint32_t desc,
827                        opivv2_fn *fn, uint32_t esz)
828 {
829     uint32_t vm = vext_vm(desc);
830     uint32_t vl = env->vl;
831     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832     uint32_t vta = vext_vta(desc);
833     uint32_t vma = vext_vma(desc);
834     uint32_t i;
835 
836     for (i = env->vstart; i < vl; i++) {
837         if (!vm && !vext_elem_mask(v0, i)) {
838             /* set masked-off elements to 1s */
839             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840             continue;
841         }
842         fn(vd, vs1, vs2, i);
843     }
844     env->vstart = 0;
845     /* set tail elements to 1s */
846     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848 
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ)                            \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
852                   void *vs2, CPURISCVState *env,          \
853                   uint32_t desc)                          \
854 {                                                         \
855     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
856                do_##NAME, ESZ);                           \
857 }
858 
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867 
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869 
870 /*
871  * (T1)s1 gives the real operand type.
872  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
873  */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
876 {                                                                   \
877     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
878     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
879 }
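
/*
 * For example, in the widening case do_vwadd_vx_b (instantiated below
 * with WOP_SSS_B) has T1 = int8_t and TX1 = int16_t, so (TX1)(T1)s1
 * first truncates the scalar to 8 bits and then sign-extends it to
 * 16 bits before the add.
 */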
880 
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893 
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895                        CPURISCVState *env, uint32_t desc,
896                        opivx2_fn fn, uint32_t esz)
897 {
898     uint32_t vm = vext_vm(desc);
899     uint32_t vl = env->vl;
900     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901     uint32_t vta = vext_vta(desc);
902     uint32_t vma = vext_vma(desc);
903     uint32_t i;
904 
905     for (i = env->vstart; i < vl; i++) {
906         if (!vm && !vext_elem_mask(v0, i)) {
907             /* set masked-off elements to 1s */
908             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
909             continue;
910         }
911         fn(vd, s1, vs2, i);
912     }
913     env->vstart = 0;
914     /* set tail elements to 1s */
915     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
916 }
917 
918 /* generate the helpers for OPIVX */
919 #define GEN_VEXT_VX(NAME, ESZ)                            \
920 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
921                   void *vs2, CPURISCVState *env,          \
922                   uint32_t desc)                          \
923 {                                                         \
924     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
925                do_##NAME, ESZ);                           \
926 }
927 
928 GEN_VEXT_VX(vadd_vx_b, 1)
929 GEN_VEXT_VX(vadd_vx_h, 2)
930 GEN_VEXT_VX(vadd_vx_w, 4)
931 GEN_VEXT_VX(vadd_vx_d, 8)
932 GEN_VEXT_VX(vsub_vx_b, 1)
933 GEN_VEXT_VX(vsub_vx_h, 2)
934 GEN_VEXT_VX(vsub_vx_w, 4)
935 GEN_VEXT_VX(vsub_vx_d, 8)
936 GEN_VEXT_VX(vrsub_vx_b, 1)
937 GEN_VEXT_VX(vrsub_vx_h, 2)
938 GEN_VEXT_VX(vrsub_vx_w, 4)
939 GEN_VEXT_VX(vrsub_vx_d, 8)
940 
941 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
942 {
943     intptr_t oprsz = simd_oprsz(desc);
944     intptr_t i;
945 
946     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
947         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
948     }
949 }
950 
951 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
952 {
953     intptr_t oprsz = simd_oprsz(desc);
954     intptr_t i;
955 
956     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
957         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
958     }
959 }
960 
961 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
962 {
963     intptr_t oprsz = simd_oprsz(desc);
964     intptr_t i;
965 
966     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
967         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
968     }
969 }
970 
971 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
972 {
973     intptr_t oprsz = simd_oprsz(desc);
974     intptr_t i;
975 
976     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
977         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
978     }
979 }
980 
981 /* Vector Widening Integer Add/Subtract */
982 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
983 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
984 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
985 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
986 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
987 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
988 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
989 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
990 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
991 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
992 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
993 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
994 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
995 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
996 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
997 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
998 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
999 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1000 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1001 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1002 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1003 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1004 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1005 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1006 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1007 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1008 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1009 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1010 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1011 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1012 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1013 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1014 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1015 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1016 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1017 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1018 GEN_VEXT_VV(vwaddu_vv_b, 2)
1019 GEN_VEXT_VV(vwaddu_vv_h, 4)
1020 GEN_VEXT_VV(vwaddu_vv_w, 8)
1021 GEN_VEXT_VV(vwsubu_vv_b, 2)
1022 GEN_VEXT_VV(vwsubu_vv_h, 4)
1023 GEN_VEXT_VV(vwsubu_vv_w, 8)
1024 GEN_VEXT_VV(vwadd_vv_b, 2)
1025 GEN_VEXT_VV(vwadd_vv_h, 4)
1026 GEN_VEXT_VV(vwadd_vv_w, 8)
1027 GEN_VEXT_VV(vwsub_vv_b, 2)
1028 GEN_VEXT_VV(vwsub_vv_h, 4)
1029 GEN_VEXT_VV(vwsub_vv_w, 8)
1030 GEN_VEXT_VV(vwaddu_wv_b, 2)
1031 GEN_VEXT_VV(vwaddu_wv_h, 4)
1032 GEN_VEXT_VV(vwaddu_wv_w, 8)
1033 GEN_VEXT_VV(vwsubu_wv_b, 2)
1034 GEN_VEXT_VV(vwsubu_wv_h, 4)
1035 GEN_VEXT_VV(vwsubu_wv_w, 8)
1036 GEN_VEXT_VV(vwadd_wv_b, 2)
1037 GEN_VEXT_VV(vwadd_wv_h, 4)
1038 GEN_VEXT_VV(vwadd_wv_w, 8)
1039 GEN_VEXT_VV(vwsub_wv_b, 2)
1040 GEN_VEXT_VV(vwsub_wv_h, 4)
1041 GEN_VEXT_VV(vwsub_wv_w, 8)
1042 
1043 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1044 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1045 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1046 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1047 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1048 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1049 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1050 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1051 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1052 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1053 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1054 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1055 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1056 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1057 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1058 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1059 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1060 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1061 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1062 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1063 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1064 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1065 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1066 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1067 GEN_VEXT_VX(vwaddu_vx_b, 2)
1068 GEN_VEXT_VX(vwaddu_vx_h, 4)
1069 GEN_VEXT_VX(vwaddu_vx_w, 8)
1070 GEN_VEXT_VX(vwsubu_vx_b, 2)
1071 GEN_VEXT_VX(vwsubu_vx_h, 4)
1072 GEN_VEXT_VX(vwsubu_vx_w, 8)
1073 GEN_VEXT_VX(vwadd_vx_b, 2)
1074 GEN_VEXT_VX(vwadd_vx_h, 4)
1075 GEN_VEXT_VX(vwadd_vx_w, 8)
1076 GEN_VEXT_VX(vwsub_vx_b, 2)
1077 GEN_VEXT_VX(vwsub_vx_h, 4)
1078 GEN_VEXT_VX(vwsub_vx_w, 8)
1079 GEN_VEXT_VX(vwaddu_wx_b, 2)
1080 GEN_VEXT_VX(vwaddu_wx_h, 4)
1081 GEN_VEXT_VX(vwaddu_wx_w, 8)
1082 GEN_VEXT_VX(vwsubu_wx_b, 2)
1083 GEN_VEXT_VX(vwsubu_wx_h, 4)
1084 GEN_VEXT_VX(vwsubu_wx_w, 8)
1085 GEN_VEXT_VX(vwadd_wx_b, 2)
1086 GEN_VEXT_VX(vwadd_wx_h, 4)
1087 GEN_VEXT_VX(vwadd_wx_w, 8)
1088 GEN_VEXT_VX(vwsub_wx_b, 2)
1089 GEN_VEXT_VX(vwsub_wx_h, 4)
1090 GEN_VEXT_VX(vwsub_wx_w, 8)
1091 
1092 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1093 #define DO_VADC(N, M, C) (N + M + C)
1094 #define DO_VSBC(N, M, C) (N - M - C)
1095 
1096 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1097 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1098                   CPURISCVState *env, uint32_t desc)          \
1099 {                                                             \
1100     uint32_t vl = env->vl;                                    \
1101     uint32_t esz = sizeof(ETYPE);                             \
1102     uint32_t total_elems =                                    \
1103         vext_get_total_elems(env, desc, esz);                 \
1104     uint32_t vta = vext_vta(desc);                            \
1105     uint32_t i;                                               \
1106                                                               \
1107     for (i = env->vstart; i < vl; i++) {                      \
1108         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1109         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1110         ETYPE carry = vext_elem_mask(v0, i);                  \
1111                                                               \
1112         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1113     }                                                         \
1114     env->vstart = 0;                                          \
1115     /* set tail elements to 1s */                             \
1116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1117 }
1118 
1119 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1120 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1121 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1122 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1123 
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1125 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1126 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1127 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1128 
1129 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1130 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1131                   CPURISCVState *env, uint32_t desc)                     \
1132 {                                                                        \
1133     uint32_t vl = env->vl;                                               \
1134     uint32_t esz = sizeof(ETYPE);                                        \
1135     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1136     uint32_t vta = vext_vta(desc);                                       \
1137     uint32_t i;                                                          \
1138                                                                          \
1139     for (i = env->vstart; i < vl; i++) {                                 \
1140         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1141         ETYPE carry = vext_elem_mask(v0, i);                             \
1142                                                                          \
1143         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1144     }                                                                    \
1145     env->vstart = 0;                                          \
1146     /* set tail elements to 1s */                                        \
1147     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1148 }
1149 
1150 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1151 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1152 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1153 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1154 
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1156 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1157 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1158 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1159 
1160 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1161                           (__typeof(N))(N + M) < N)
1162 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
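
/*
 * DO_MADC detects the carry-out of an unsigned add from the truncated
 * result alone: e.g. for 8-bit operands, 200 + 100 wraps to 44, and
 * 44 < 200 signals a carry.  The <= variant covers an incoming carry,
 * where N + M + 1 can equal N only by wrapping all the way around
 * (M == ~0).  DO_MSBC is the matching borrow-out test for subtraction.
 */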
1163 
1164 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1165 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1166                   CPURISCVState *env, uint32_t desc)          \
1167 {                                                             \
1168     uint32_t vl = env->vl;                                    \
1169     uint32_t vm = vext_vm(desc);                              \
1170     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1171     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1172     uint32_t i;                                               \
1173                                                               \
1174     for (i = env->vstart; i < vl; i++) {                      \
1175         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1176         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1177         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1178         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1179     }                                                         \
1180     env->vstart = 0;                                          \
1181     /* mask destination register is always tail-agnostic */   \
1182     /* set tail elements to 1s */                             \
1183     if (vta_all_1s) {                                         \
1184         for (; i < total_elems; i++) {                        \
1185             vext_set_elem_mask(vd, i, 1);                     \
1186         }                                                     \
1187     }                                                         \
1188 }
1189 
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1191 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1192 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1193 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1194 
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1196 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1197 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1198 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1199 
1200 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1201 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1202                   void *vs2, CPURISCVState *env, uint32_t desc) \
1203 {                                                               \
1204     uint32_t vl = env->vl;                                      \
1205     uint32_t vm = vext_vm(desc);                                \
1206     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1207     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1208     uint32_t i;                                                 \
1209                                                                 \
1210     for (i = env->vstart; i < vl; i++) {                        \
1211         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1212         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1213         vext_set_elem_mask(vd, i,                               \
1214                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1215     }                                                           \
1216     env->vstart = 0;                                            \
1217     /* mask destination register is always tail-agnostic */     \
1218     /* set tail elements to 1s */                               \
1219     if (vta_all_1s) {                                           \
1220         for (; i < total_elems; i++) {                          \
1221             vext_set_elem_mask(vd, i, 1);                       \
1222         }                                                       \
1223     }                                                           \
1224 }
1225 
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230 
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235 
1236 /* Vector Bitwise Logical Instructions */
1237 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249 GEN_VEXT_VV(vand_vv_b, 1)
1250 GEN_VEXT_VV(vand_vv_h, 2)
1251 GEN_VEXT_VV(vand_vv_w, 4)
1252 GEN_VEXT_VV(vand_vv_d, 8)
1253 GEN_VEXT_VV(vor_vv_b, 1)
1254 GEN_VEXT_VV(vor_vv_h, 2)
1255 GEN_VEXT_VV(vor_vv_w, 4)
1256 GEN_VEXT_VV(vor_vv_d, 8)
1257 GEN_VEXT_VV(vxor_vv_b, 1)
1258 GEN_VEXT_VV(vxor_vv_h, 2)
1259 GEN_VEXT_VV(vxor_vv_w, 4)
1260 GEN_VEXT_VV(vxor_vv_d, 8)
1261 
1262 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274 GEN_VEXT_VX(vand_vx_b, 1)
1275 GEN_VEXT_VX(vand_vx_h, 2)
1276 GEN_VEXT_VX(vand_vx_w, 4)
1277 GEN_VEXT_VX(vand_vx_d, 8)
1278 GEN_VEXT_VX(vor_vx_b, 1)
1279 GEN_VEXT_VX(vor_vx_h, 2)
1280 GEN_VEXT_VX(vor_vx_w, 4)
1281 GEN_VEXT_VX(vor_vx_d, 8)
1282 GEN_VEXT_VX(vxor_vx_b, 1)
1283 GEN_VEXT_VX(vxor_vx_h, 2)
1284 GEN_VEXT_VX(vxor_vx_w, 4)
1285 GEN_VEXT_VX(vxor_vx_d, 8)
1286 
1287 /* Vector Single-Width Bit Shift Instructions */
1288 #define DO_SLL(N, M)  (N << (M))
1289 #define DO_SRL(N, M)  (N >> (M))
1290 
1291 /* generate the helpers for shift instructions with two vector operands */
1292 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1293 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1295 {                                                                         \
1296     uint32_t vm = vext_vm(desc);                                          \
1297     uint32_t vl = env->vl;                                                \
1298     uint32_t esz = sizeof(TS1);                                           \
1299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300     uint32_t vta = vext_vta(desc);                                        \
1301     uint32_t vma = vext_vma(desc);                                        \
1302     uint32_t i;                                                           \
1303                                                                           \
1304     for (i = env->vstart; i < vl; i++) {                                  \
1305         if (!vm && !vext_elem_mask(v0, i)) {                              \
1306             /* set masked-off elements to 1s */                           \
1307             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1308             continue;                                                     \
1309         }                                                                 \
1310         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1311         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1312         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1313     }                                                                     \
1314     env->vstart = 0;                                                      \
1315     /* set tail elements to 1s */                                         \
1316     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1317 }
1318 
1319 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1320 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1321 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1322 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1323 
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1325 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1326 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1327 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1328 
1329 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1330 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1331 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1332 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
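
/*
 * Note on the instantiations above: the vs1 shift amount is masked down to
 * its low log2(SEW) bits (masks 0x7/0xf/0x1f/0x3f), matching the rule that
 * only those bits of the shift operand are used.  vsra can reuse DO_SRL
 * because the signedness comes from the source element type TS2: with
 * TS2 = int8_t, s2 >> n is an arithmetic shift on the toolchains QEMU
 * supports (e.g. (int8_t)-8 >> 1 == -4), while vsrl's unsigned TS2 gives a
 * logical shift.
 */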
1333 
1334 /* generate the helpers for shift instructions with one vector and one scalar */
1335 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1336 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1337         void *vs2, CPURISCVState *env, uint32_t desc)       \
1338 {                                                           \
1339     uint32_t vm = vext_vm(desc);                            \
1340     uint32_t vl = env->vl;                                  \
1341     uint32_t esz = sizeof(TD);                              \
1342     uint32_t total_elems =                                  \
1343         vext_get_total_elems(env, desc, esz);               \
1344     uint32_t vta = vext_vta(desc);                          \
1345     uint32_t vma = vext_vma(desc);                          \
1346     uint32_t i;                                             \
1347                                                             \
1348     for (i = env->vstart; i < vl; i++) {                    \
1349         if (!vm && !vext_elem_mask(v0, i)) {                \
1350             /* set masked-off elements to 1s */             \
1351             vext_set_elems_1s(vd, vma, i * esz,             \
1352                               (i + 1) * esz);               \
1353             continue;                                       \
1354         }                                                   \
1355         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1356         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1357     }                                                       \
1358     env->vstart = 0;                                        \
1359     /* set tail elements to 1s */                           \
1360     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1361 }
1362 
1363 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1367 
1368 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1369 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1372 
1373 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1374 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1375 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1376 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1377 
1378 /* Vector Narrowing Integer Right Shift Instructions */
1379 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1380 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1381 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1382 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1383 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1384 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1385 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1386 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1387 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1388 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1389 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1390 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
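
/*
 * For the narrowing shifts above, the source elements (TS2) are 2*SEW wide
 * while the destination elements are SEW wide, so the shift amount is
 * masked to log2(2*SEW) bits: 0xf for byte results from 16-bit sources,
 * 0x1f for halfword results, and 0x3f for word results.
 */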
1391 
1392 /* Vector Integer Comparison Instructions */
1393 #define DO_MSEQ(N, M) (N == M)
1394 #define DO_MSNE(N, M) (N != M)
1395 #define DO_MSLT(N, M) (N < M)
1396 #define DO_MSLE(N, M) (N <= M)
1397 #define DO_MSGT(N, M) (N > M)
1398 
1399 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1400 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1401                   CPURISCVState *env, uint32_t desc)          \
1402 {                                                             \
1403     uint32_t vm = vext_vm(desc);                              \
1404     uint32_t vl = env->vl;                                    \
1405     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1406     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1407     uint32_t vma = vext_vma(desc);                            \
1408     uint32_t i;                                               \
1409                                                               \
1410     for (i = env->vstart; i < vl; i++) {                      \
1411         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1412         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1413         if (!vm && !vext_elem_mask(v0, i)) {                  \
1414             /* set masked-off elements to 1s */               \
1415             if (vma) {                                        \
1416                 vext_set_elem_mask(vd, i, 1);                 \
1417             }                                                 \
1418             continue;                                         \
1419         }                                                     \
1420         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1421     }                                                         \
1422     env->vstart = 0;                                          \
1423     /* mask destination registers are always tail-agnostic */ \
1424     /* set tail elements to 1s */                             \
1425     if (vta_all_1s) {                                         \
1426         for (; i < total_elems; i++) {                        \
1427             vext_set_elem_mask(vd, i, 1);                     \
1428         }                                                     \
1429     }                                                         \
1430 }
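
/*
 * The comparisons above produce a mask: one result bit per element, written
 * with vext_set_elem_mask().  A mask register holds at most VLEN bits
 * (reached with SEW=8 and LMUL=8), which is why total_elems is taken
 * straight from cfg.vlen rather than from vext_get_total_elems().
 */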
1431 
1432 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1433 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1434 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1435 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1436 
1437 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1438 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1439 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1440 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1441 
1442 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1443 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1444 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1445 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1446 
1447 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1448 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1449 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1450 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1451 
1452 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1453 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1454 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1455 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1456 
1457 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1458 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1459 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1460 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1461 
1462 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1463 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1464                   CPURISCVState *env, uint32_t desc)                \
1465 {                                                                   \
1466     uint32_t vm = vext_vm(desc);                                    \
1467     uint32_t vl = env->vl;                                          \
1468     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1469     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1470     uint32_t vma = vext_vma(desc);                                  \
1471     uint32_t i;                                                     \
1472                                                                     \
1473     for (i = env->vstart; i < vl; i++) {                            \
1474         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1475         if (!vm && !vext_elem_mask(v0, i)) {                        \
1476             /* set masked-off elements to 1s */                     \
1477             if (vma) {                                              \
1478                 vext_set_elem_mask(vd, i, 1);                       \
1479             }                                                       \
1480             continue;                                               \
1481         }                                                           \
1482         vext_set_elem_mask(vd, i,                                   \
1483                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1484     }                                                               \
1485     env->vstart = 0;                                                \
1486     /* mask destination registers are always tail-agnostic */       \
1487     /* set tail elements to 1s */                                   \
1488     if (vta_all_1s) {                                               \
1489         for (; i < total_elems; i++) {                              \
1490             vext_set_elem_mask(vd, i, 1);                           \
1491         }                                                           \
1492     }                                                               \
1493 }
1494 
1495 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1496 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1497 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1498 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1499 
1500 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1501 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1502 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1503 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1504 
1505 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1506 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1507 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1508 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1509 
1510 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1511 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1512 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1513 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1514 
1515 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1516 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1517 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1518 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1519 
1520 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1521 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1522 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1523 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1524 
1525 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1526 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1527 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1528 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1529 
1530 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1531 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1532 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1533 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1534 
1535 /* Vector Integer Min/Max Instructions */
1536 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1537 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1538 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1539 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1540 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1541 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1542 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1543 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1544 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1545 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1546 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1547 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1548 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1549 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1550 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1551 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1552 GEN_VEXT_VV(vminu_vv_b, 1)
1553 GEN_VEXT_VV(vminu_vv_h, 2)
1554 GEN_VEXT_VV(vminu_vv_w, 4)
1555 GEN_VEXT_VV(vminu_vv_d, 8)
1556 GEN_VEXT_VV(vmin_vv_b, 1)
1557 GEN_VEXT_VV(vmin_vv_h, 2)
1558 GEN_VEXT_VV(vmin_vv_w, 4)
1559 GEN_VEXT_VV(vmin_vv_d, 8)
1560 GEN_VEXT_VV(vmaxu_vv_b, 1)
1561 GEN_VEXT_VV(vmaxu_vv_h, 2)
1562 GEN_VEXT_VV(vmaxu_vv_w, 4)
1563 GEN_VEXT_VV(vmaxu_vv_d, 8)
1564 GEN_VEXT_VV(vmax_vv_b, 1)
1565 GEN_VEXT_VV(vmax_vv_h, 2)
1566 GEN_VEXT_VV(vmax_vv_w, 4)
1567 GEN_VEXT_VV(vmax_vv_d, 8)
1568 
1569 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1570 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1571 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1572 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1573 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1574 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1575 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1576 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1577 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1578 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1579 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1580 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1581 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1582 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1583 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1584 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1585 GEN_VEXT_VX(vminu_vx_b, 1)
1586 GEN_VEXT_VX(vminu_vx_h, 2)
1587 GEN_VEXT_VX(vminu_vx_w, 4)
1588 GEN_VEXT_VX(vminu_vx_d, 8)
1589 GEN_VEXT_VX(vmin_vx_b, 1)
1590 GEN_VEXT_VX(vmin_vx_h, 2)
1591 GEN_VEXT_VX(vmin_vx_w, 4)
1592 GEN_VEXT_VX(vmin_vx_d, 8)
1593 GEN_VEXT_VX(vmaxu_vx_b, 1)
1594 GEN_VEXT_VX(vmaxu_vx_h, 2)
1595 GEN_VEXT_VX(vmaxu_vx_w, 4)
1596 GEN_VEXT_VX(vmaxu_vx_d, 8)
1597 GEN_VEXT_VX(vmax_vx_b, 1)
1598 GEN_VEXT_VX(vmax_vx_h, 2)
1599 GEN_VEXT_VX(vmax_vx_w, 4)
1600 GEN_VEXT_VX(vmax_vx_d, 8)
1601 
1602 /* Vector Single-Width Integer Multiply Instructions */
1603 #define DO_MUL(N, M) (N * M)
1604 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1605 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1606 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1607 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1608 GEN_VEXT_VV(vmul_vv_b, 1)
1609 GEN_VEXT_VV(vmul_vv_h, 2)
1610 GEN_VEXT_VV(vmul_vv_w, 4)
1611 GEN_VEXT_VV(vmul_vv_d, 8)
1612 
1613 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1614 {
1615     return (int16_t)s2 * (int16_t)s1 >> 8;
1616 }
1617 
1618 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1619 {
1620     return (int32_t)s2 * (int32_t)s1 >> 16;
1621 }
1622 
1623 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1624 {
1625     return (int64_t)s2 * (int64_t)s1 >> 32;
1626 }
1627 
1628 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1629 {
1630     uint64_t hi_64, lo_64;
1631 
1632     muls64(&lo_64, &hi_64, s1, s2);
1633     return hi_64;
1634 }
1635 
1636 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1637 {
1638     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1639 }
1640 
1641 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1642 {
1643     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1644 }
1645 
1646 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1647 {
1648     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1649 }
1650 
1651 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1652 {
1653     uint64_t hi_64, lo_64;
1654 
1655     mulu64(&lo_64, &hi_64, s2, s1);
1656     return hi_64;
1657 }
1658 
1659 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1660 {
1661     return (int16_t)s2 * (uint16_t)s1 >> 8;
1662 }
1663 
1664 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1665 {
1666     return (int32_t)s2 * (uint32_t)s1 >> 16;
1667 }
1668 
1669 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1670 {
1671     return (int64_t)s2 * (uint64_t)s1 >> 32;
1672 }
1673 
1674 /*
1675  * Let  A = the raw 64-bit pattern of the signed operand (s2),
1676  *      B = unsigned operand (s1),
1677  *      P = mulu64(A, B), the full unsigned product.
1678  *
1679  * Let  X = 2 ** 64 - A, the 2's complement magnitude of A,
1680  *      SP = the signed product.
1681  * THEN
1682  *      IF the sign bit of A is set (A < 0 when viewed as signed)
1683  *          SP = -X * B
1684  *             = -(2 ** 64 - A) * B
1685  *             = A * B - 2 ** 64 * B
1686  *             = P - 2 ** 64 * B
1687  *      ELSE
1688  *          SP = P
1689  * THEN the high half only needs the correction
1690  *      HI_P -= (A < 0 ? B : 0)
1691  */
1692 
1693 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1694 {
1695     uint64_t hi_64, lo_64;
1696 
1697     mulu64(&lo_64, &hi_64, s2, s1);
1698 
1699     hi_64 -= s2 < 0 ? s1 : 0;
1700     return hi_64;
1701 }
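
/*
 * Worked example for do_mulhsu_d(): s2 = -1 (raw bits 2^64 - 1), s1 = 2.
 * mulu64() gives P = (2^64 - 1) * 2 = 2^65 - 2, so hi_64 = 1.  Since
 * s2 < 0, hi_64 -= s1 leaves hi_64 = -1 = 0xffff...ffff, which is indeed
 * the upper half of the signed product -2.
 */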
1702 
1703 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1704 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1705 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1706 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1707 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1708 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1709 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1710 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1711 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1712 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1713 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1714 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1715 GEN_VEXT_VV(vmulh_vv_b, 1)
1716 GEN_VEXT_VV(vmulh_vv_h, 2)
1717 GEN_VEXT_VV(vmulh_vv_w, 4)
1718 GEN_VEXT_VV(vmulh_vv_d, 8)
1719 GEN_VEXT_VV(vmulhu_vv_b, 1)
1720 GEN_VEXT_VV(vmulhu_vv_h, 2)
1721 GEN_VEXT_VV(vmulhu_vv_w, 4)
1722 GEN_VEXT_VV(vmulhu_vv_d, 8)
1723 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1724 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1725 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1726 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1727 
1728 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1729 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1730 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1731 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1732 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1733 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1734 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1735 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1736 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1737 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1738 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1739 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1740 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1741 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1742 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1743 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1744 GEN_VEXT_VX(vmul_vx_b, 1)
1745 GEN_VEXT_VX(vmul_vx_h, 2)
1746 GEN_VEXT_VX(vmul_vx_w, 4)
1747 GEN_VEXT_VX(vmul_vx_d, 8)
1748 GEN_VEXT_VX(vmulh_vx_b, 1)
1749 GEN_VEXT_VX(vmulh_vx_h, 2)
1750 GEN_VEXT_VX(vmulh_vx_w, 4)
1751 GEN_VEXT_VX(vmulh_vx_d, 8)
1752 GEN_VEXT_VX(vmulhu_vx_b, 1)
1753 GEN_VEXT_VX(vmulhu_vx_h, 2)
1754 GEN_VEXT_VX(vmulhu_vx_w, 4)
1755 GEN_VEXT_VX(vmulhu_vx_d, 8)
1756 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1757 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1758 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1759 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1760 
1761 /* Vector Integer Divide Instructions */
1762 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1763 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1764 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1765         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1766 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1767         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
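
/*
 * These macros encode the divide corner cases, which mirror the base ISA:
 * division by zero yields an all-ones quotient and leaves the dividend as
 * the remainder; signed overflow (the most negative value divided by -1)
 * yields the most negative value as quotient and 0 as remainder.  The
 * (N == -N) test identifies that most negative value, the only nonzero
 * value equal to its own negation in two's complement (N == 0 also matches
 * but then produces the correct result anyway).
 */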
1768 
1769 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1770 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1771 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1772 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1773 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1774 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1775 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1776 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1777 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1778 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1779 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1780 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1781 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1782 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1783 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1784 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1785 GEN_VEXT_VV(vdivu_vv_b, 1)
1786 GEN_VEXT_VV(vdivu_vv_h, 2)
1787 GEN_VEXT_VV(vdivu_vv_w, 4)
1788 GEN_VEXT_VV(vdivu_vv_d, 8)
1789 GEN_VEXT_VV(vdiv_vv_b, 1)
1790 GEN_VEXT_VV(vdiv_vv_h, 2)
1791 GEN_VEXT_VV(vdiv_vv_w, 4)
1792 GEN_VEXT_VV(vdiv_vv_d, 8)
1793 GEN_VEXT_VV(vremu_vv_b, 1)
1794 GEN_VEXT_VV(vremu_vv_h, 2)
1795 GEN_VEXT_VV(vremu_vv_w, 4)
1796 GEN_VEXT_VV(vremu_vv_d, 8)
1797 GEN_VEXT_VV(vrem_vv_b, 1)
1798 GEN_VEXT_VV(vrem_vv_h, 2)
1799 GEN_VEXT_VV(vrem_vv_w, 4)
1800 GEN_VEXT_VV(vrem_vv_d, 8)
1801 
1802 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1803 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1804 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1805 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1806 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1807 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1808 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1809 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1810 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1811 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1812 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1813 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1814 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1815 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1816 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1817 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1818 GEN_VEXT_VX(vdivu_vx_b, 1)
1819 GEN_VEXT_VX(vdivu_vx_h, 2)
1820 GEN_VEXT_VX(vdivu_vx_w, 4)
1821 GEN_VEXT_VX(vdivu_vx_d, 8)
1822 GEN_VEXT_VX(vdiv_vx_b, 1)
1823 GEN_VEXT_VX(vdiv_vx_h, 2)
1824 GEN_VEXT_VX(vdiv_vx_w, 4)
1825 GEN_VEXT_VX(vdiv_vx_d, 8)
1826 GEN_VEXT_VX(vremu_vx_b, 1)
1827 GEN_VEXT_VX(vremu_vx_h, 2)
1828 GEN_VEXT_VX(vremu_vx_w, 4)
1829 GEN_VEXT_VX(vremu_vx_d, 8)
1830 GEN_VEXT_VX(vrem_vx_b, 1)
1831 GEN_VEXT_VX(vrem_vx_h, 2)
1832 GEN_VEXT_VX(vrem_vx_w, 4)
1833 GEN_VEXT_VX(vrem_vx_d, 8)
1834 
1835 /* Vector Widening Integer Multiply Instructions */
1836 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1837 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1838 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1839 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1840 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1841 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1842 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1843 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1844 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1845 GEN_VEXT_VV(vwmul_vv_b, 2)
1846 GEN_VEXT_VV(vwmul_vv_h, 4)
1847 GEN_VEXT_VV(vwmul_vv_w, 8)
1848 GEN_VEXT_VV(vwmulu_vv_b, 2)
1849 GEN_VEXT_VV(vwmulu_vv_h, 4)
1850 GEN_VEXT_VV(vwmulu_vv_w, 8)
1851 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1852 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1853 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1854 
1855 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1856 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1857 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1858 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1859 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1860 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1861 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1862 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1863 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1864 GEN_VEXT_VX(vwmul_vx_b, 2)
1865 GEN_VEXT_VX(vwmul_vx_h, 4)
1866 GEN_VEXT_VX(vwmul_vx_w, 8)
1867 GEN_VEXT_VX(vwmulu_vx_b, 2)
1868 GEN_VEXT_VX(vwmulu_vx_h, 4)
1869 GEN_VEXT_VX(vwmulu_vx_w, 8)
1870 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1871 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1872 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1873 
1874 /* Vector Single-Width Integer Multiply-Add Instructions */
1875 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1876 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1877 {                                                                  \
1878     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1879     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1880     TD d = *((TD *)vd + HD(i));                                    \
1881     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1882 }
1883 
1884 #define DO_MACC(N, M, D) (M * N + D)
1885 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1886 #define DO_MADD(N, M, D) (M * D + N)
1887 #define DO_NMSUB(N, M, D) (-(M * D) + N)
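
/*
 * With OP(s2, s1, d), i.e. N = vs2[i], M = vs1[i] (or rs1) and D = vd[i],
 * these macros give the usual multiply-add forms:
 *   vmacc:  vd[i] =  vs1[i] * vs2[i] + vd[i]
 *   vnmsac: vd[i] = -(vs1[i] * vs2[i]) + vd[i]
 *   vmadd:  vd[i] =  vs1[i] * vd[i] + vs2[i]
 *   vnmsub: vd[i] = -(vs1[i] * vd[i]) + vs2[i]
 */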
1888 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1889 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1890 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1891 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1892 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1897 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1898 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1899 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1900 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VV(vmacc_vv_b, 1)
1905 GEN_VEXT_VV(vmacc_vv_h, 2)
1906 GEN_VEXT_VV(vmacc_vv_w, 4)
1907 GEN_VEXT_VV(vmacc_vv_d, 8)
1908 GEN_VEXT_VV(vnmsac_vv_b, 1)
1909 GEN_VEXT_VV(vnmsac_vv_h, 2)
1910 GEN_VEXT_VV(vnmsac_vv_w, 4)
1911 GEN_VEXT_VV(vnmsac_vv_d, 8)
1912 GEN_VEXT_VV(vmadd_vv_b, 1)
1913 GEN_VEXT_VV(vmadd_vv_h, 2)
1914 GEN_VEXT_VV(vmadd_vv_w, 4)
1915 GEN_VEXT_VV(vmadd_vv_d, 8)
1916 GEN_VEXT_VV(vnmsub_vv_b, 1)
1917 GEN_VEXT_VV(vnmsub_vv_h, 2)
1918 GEN_VEXT_VV(vnmsub_vv_w, 4)
1919 GEN_VEXT_VV(vnmsub_vv_d, 8)
1920 
1921 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1922 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1923 {                                                                   \
1924     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1925     TD d = *((TD *)vd + HD(i));                                     \
1926     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1927 }
1928 
1929 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1930 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1931 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1932 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1933 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1934 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1935 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1936 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1937 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1938 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1939 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1940 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1941 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1942 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1943 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1944 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1945 GEN_VEXT_VX(vmacc_vx_b, 1)
1946 GEN_VEXT_VX(vmacc_vx_h, 2)
1947 GEN_VEXT_VX(vmacc_vx_w, 4)
1948 GEN_VEXT_VX(vmacc_vx_d, 8)
1949 GEN_VEXT_VX(vnmsac_vx_b, 1)
1950 GEN_VEXT_VX(vnmsac_vx_h, 2)
1951 GEN_VEXT_VX(vnmsac_vx_w, 4)
1952 GEN_VEXT_VX(vnmsac_vx_d, 8)
1953 GEN_VEXT_VX(vmadd_vx_b, 1)
1954 GEN_VEXT_VX(vmadd_vx_h, 2)
1955 GEN_VEXT_VX(vmadd_vx_w, 4)
1956 GEN_VEXT_VX(vmadd_vx_d, 8)
1957 GEN_VEXT_VX(vnmsub_vx_b, 1)
1958 GEN_VEXT_VX(vnmsub_vx_h, 2)
1959 GEN_VEXT_VX(vnmsub_vx_w, 4)
1960 GEN_VEXT_VX(vnmsub_vx_d, 8)
1961 
1962 /* Vector Widening Integer Multiply-Add Instructions */
1963 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1964 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1965 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1966 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1967 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1968 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1969 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1970 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1971 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1972 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1973 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1974 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1975 GEN_VEXT_VV(vwmacc_vv_b, 2)
1976 GEN_VEXT_VV(vwmacc_vv_h, 4)
1977 GEN_VEXT_VV(vwmacc_vv_w, 8)
1978 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1979 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1980 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1981 
1982 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1983 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1984 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1985 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1986 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1987 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1988 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1989 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1990 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1991 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1992 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1993 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1994 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1995 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1996 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1997 GEN_VEXT_VX(vwmacc_vx_b, 2)
1998 GEN_VEXT_VX(vwmacc_vx_h, 4)
1999 GEN_VEXT_VX(vwmacc_vx_w, 8)
2000 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2001 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2002 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2003 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2004 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2005 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2006 
2007 /* Vector Integer Merge and Move Instructions */
2008 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2009 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2010                   uint32_t desc)                                     \
2011 {                                                                    \
2012     uint32_t vl = env->vl;                                           \
2013     uint32_t esz = sizeof(ETYPE);                                    \
2014     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2015     uint32_t vta = vext_vta(desc);                                   \
2016     uint32_t i;                                                      \
2017                                                                      \
2018     for (i = env->vstart; i < vl; i++) {                             \
2019         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2020         *((ETYPE *)vd + H(i)) = s1;                                  \
2021     }                                                                \
2022     env->vstart = 0;                                                 \
2023     /* set tail elements to 1s */                                    \
2024     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2025 }
2026 
2027 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2028 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2029 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2030 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2031 
2032 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2033 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2034                   uint32_t desc)                                     \
2035 {                                                                    \
2036     uint32_t vl = env->vl;                                           \
2037     uint32_t esz = sizeof(ETYPE);                                    \
2038     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2039     uint32_t vta = vext_vta(desc);                                   \
2040     uint32_t i;                                                      \
2041                                                                      \
2042     for (i = env->vstart; i < vl; i++) {                             \
2043         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2044     }                                                                \
2045     env->vstart = 0;                                                 \
2046     /* set tail elements to 1s */                                    \
2047     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2048 }
2049 
2050 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2051 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2052 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2053 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2054 
2055 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2056 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2057                   CPURISCVState *env, uint32_t desc)                 \
2058 {                                                                    \
2059     uint32_t vl = env->vl;                                           \
2060     uint32_t esz = sizeof(ETYPE);                                    \
2061     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2062     uint32_t vta = vext_vta(desc);                                   \
2063     uint32_t i;                                                      \
2064                                                                      \
2065     for (i = env->vstart; i < vl; i++) {                             \
2066         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2067         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2068     }                                                                \
2069     env->vstart = 0;                                                 \
2070     /* set tail elements to 1s */                                    \
2071     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2072 }
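
/*
 * For vmerge the v0 register is a selector rather than a mask in the usual
 * sense: element i comes from vs1 when v0.mask[i] is set and from vs2
 * otherwise, so no body element is left unwritten.
 */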
2073 
2074 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2075 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2078 
2079 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2080 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2081                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2082 {                                                                    \
2083     uint32_t vl = env->vl;                                           \
2084     uint32_t esz = sizeof(ETYPE);                                    \
2085     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2086     uint32_t vta = vext_vta(desc);                                   \
2087     uint32_t i;                                                      \
2088                                                                      \
2089     for (i = env->vstart; i < vl; i++) {                             \
2090         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2091         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2092                    (ETYPE)(target_long)s1);                          \
2093         *((ETYPE *)vd + H(i)) = d;                                   \
2094     }                                                                \
2095     env->vstart = 0;                                                 \
2096     /* set tail elements to 1s */                                    \
2097     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2098 }
2099 
2100 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2101 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2102 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2103 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2104 
2105 /*
2106  *** Vector Fixed-Point Arithmetic Instructions
2107  */
2108 
2109 /* Vector Single-Width Saturating Add and Subtract */
2110 
2111 /*
2112  * Fixed-point instructions have a rounding mode and saturation, so we
2113  * define common macros for fixed point here.
2114  */
2115 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2116                           CPURISCVState *env, int vxrm);
2117 
2118 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2119 static inline void                                                  \
2120 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2121           CPURISCVState *env, int vxrm)                             \
2122 {                                                                   \
2123     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2124     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2125     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2126 }
2127 
2128 static inline void
2129 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2130              CPURISCVState *env,
2131              uint32_t vl, uint32_t vm, int vxrm,
2132              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2133 {
2134     for (uint32_t i = env->vstart; i < vl; i++) {
2135         if (!vm && !vext_elem_mask(v0, i)) {
2136             /* set masked-off elements to 1s */
2137             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2138             continue;
2139         }
2140         fn(vd, vs1, vs2, i, env, vxrm);
2141     }
2142     env->vstart = 0;
2143 }
2144 
2145 static inline void
2146 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2147              CPURISCVState *env,
2148              uint32_t desc,
2149              opivv2_rm_fn *fn, uint32_t esz)
2150 {
2151     uint32_t vm = vext_vm(desc);
2152     uint32_t vl = env->vl;
2153     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2154     uint32_t vta = vext_vta(desc);
2155     uint32_t vma = vext_vma(desc);
2156 
2157     switch (env->vxrm) {
2158     case 0: /* rnu */
2159         vext_vv_rm_1(vd, v0, vs1, vs2,
2160                      env, vl, vm, 0, fn, vma, esz);
2161         break;
2162     case 1: /* rne */
2163         vext_vv_rm_1(vd, v0, vs1, vs2,
2164                      env, vl, vm, 1, fn, vma, esz);
2165         break;
2166     case 2: /* rdn */
2167         vext_vv_rm_1(vd, v0, vs1, vs2,
2168                      env, vl, vm, 2, fn, vma, esz);
2169         break;
2170     default: /* rod */
2171         vext_vv_rm_1(vd, v0, vs1, vs2,
2172                      env, vl, vm, 3, fn, vma, esz);
2173         break;
2174     }
2175     /* set tail elements to 1s */
2176     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2177 }
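
/*
 * The switch above passes the rounding mode to vext_vv_rm_1() as a literal
 * constant rather than forwarding env->vxrm directly, presumably so that
 * each inlined copy of the loop can specialize get_round() for a fixed
 * vxrm instead of re-reading the CSR per element.
 */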
2178 
2179 /* generate helpers for fixed point instructions with OPIVV format */
2180 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2181 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2182                   CPURISCVState *env, uint32_t desc)            \
2183 {                                                               \
2184     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2185                  do_##NAME, ESZ);                               \
2186 }
2187 
2188 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2189 {
2190     uint8_t res = a + b;
2191     if (res < a) {
2192         res = UINT8_MAX;
2193         env->vxsat = 0x1;
2194     }
2195     return res;
2196 }
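
/*
 * The (res < a) test detects unsigned wraparound: e.g. a = 200, b = 100
 * gives res = 44 (300 mod 256), which is smaller than a, so the result
 * saturates to UINT8_MAX and vxsat is set.
 */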
2197 
2198 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2199                                uint16_t b)
2200 {
2201     uint16_t res = a + b;
2202     if (res < a) {
2203         res = UINT16_MAX;
2204         env->vxsat = 0x1;
2205     }
2206     return res;
2207 }
2208 
2209 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2210                                uint32_t b)
2211 {
2212     uint32_t res = a + b;
2213     if (res < a) {
2214         res = UINT32_MAX;
2215         env->vxsat = 0x1;
2216     }
2217     return res;
2218 }
2219 
2220 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2221                                uint64_t b)
2222 {
2223     uint64_t res = a + b;
2224     if (res < a) {
2225         res = UINT64_MAX;
2226         env->vxsat = 0x1;
2227     }
2228     return res;
2229 }
2230 
2231 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2232 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2233 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2234 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2235 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2236 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2237 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2238 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2239 
2240 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2241                           CPURISCVState *env, int vxrm);
2242 
2243 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2244 static inline void                                                  \
2245 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2246           CPURISCVState *env, int vxrm)                             \
2247 {                                                                   \
2248     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2249     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2250 }
2251 
2252 static inline void
2253 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2254              CPURISCVState *env,
2255              uint32_t vl, uint32_t vm, int vxrm,
2256              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2257 {
2258     for (uint32_t i = env->vstart; i < vl; i++) {
2259         if (!vm && !vext_elem_mask(v0, i)) {
2260             /* set masked-off elements to 1s */
2261             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2262             continue;
2263         }
2264         fn(vd, s1, vs2, i, env, vxrm);
2265     }
2266     env->vstart = 0;
2267 }
2268 
2269 static inline void
2270 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2271              CPURISCVState *env,
2272              uint32_t desc,
2273              opivx2_rm_fn *fn, uint32_t esz)
2274 {
2275     uint32_t vm = vext_vm(desc);
2276     uint32_t vl = env->vl;
2277     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2278     uint32_t vta = vext_vta(desc);
2279     uint32_t vma = vext_vma(desc);
2280 
2281     switch (env->vxrm) {
2282     case 0: /* rnu */
2283         vext_vx_rm_1(vd, v0, s1, vs2,
2284                      env, vl, vm, 0, fn, vma, esz);
2285         break;
2286     case 1: /* rne */
2287         vext_vx_rm_1(vd, v0, s1, vs2,
2288                      env, vl, vm, 1, fn, vma, esz);
2289         break;
2290     case 2: /* rdn */
2291         vext_vx_rm_1(vd, v0, s1, vs2,
2292                      env, vl, vm, 2, fn, vma, esz);
2293         break;
2294     default: /* rod */
2295         vext_vx_rm_1(vd, v0, s1, vs2,
2296                      env, vl, vm, 3, fn, vma, esz);
2297         break;
2298     }
2299     /* set tail elements to 1s */
2300     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2301 }
2302 
2303 /* generate helpers for fixed point instructions with OPIVX format */
2304 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2305 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2306         void *vs2, CPURISCVState *env, uint32_t desc)     \
2307 {                                                         \
2308     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2309                  do_##NAME, ESZ);                         \
2310 }
2311 
2312 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2313 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2314 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2315 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2316 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2317 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2318 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2319 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2320 
2321 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2322 {
2323     int8_t res = a + b;
2324     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2325         res = a > 0 ? INT8_MAX : INT8_MIN;
2326         env->vxsat = 0x1;
2327     }
2328     return res;
2329 }
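
/*
 * The (res ^ a) & (res ^ b) & INT8_MIN test flags signed overflow: it is
 * nonzero exactly when a and b share a sign and the wrapped sum has the
 * opposite sign.  For example a = 100, b = 50 wraps to res = -106, so the
 * result saturates to INT8_MAX.  A minimal standalone sketch of the same
 * test (illustrative only, not built; sadd8_overflows is not part of
 * these helpers):
 */
#if 0
static inline bool sadd8_overflows(int8_t a, int8_t b)
{
    int8_t res = a + b;                 /* wraps modulo 2^8 */
    /* negative iff res disagrees in sign with both a and b */
    return ((res ^ a) & (res ^ b)) < 0;
}
#endif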
2330 
2331 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2332 {
2333     int16_t res = a + b;
2334     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2335         res = a > 0 ? INT16_MAX : INT16_MIN;
2336         env->vxsat = 0x1;
2337     }
2338     return res;
2339 }
2340 
2341 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2342 {
2343     int32_t res = a + b;
2344     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2345         res = a > 0 ? INT32_MAX : INT32_MIN;
2346         env->vxsat = 0x1;
2347     }
2348     return res;
2349 }
2350 
2351 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2352 {
2353     int64_t res = a + b;
2354     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2355         res = a > 0 ? INT64_MAX : INT64_MIN;
2356         env->vxsat = 0x1;
2357     }
2358     return res;
2359 }
2360 
2361 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2362 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2363 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2364 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2365 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2366 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2367 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2368 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2369 
2370 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2371 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2372 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2373 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2374 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2375 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2376 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2377 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2378 
2379 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2380 {
2381     uint8_t res = a - b;
2382     if (res > a) {
2383         res = 0;
2384         env->vxsat = 0x1;
2385     }
2386     return res;
2387 }
2388 
2389 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2390                                uint16_t b)
2391 {
2392     uint16_t res = a - b;
2393     if (res > a) {
2394         res = 0;
2395         env->vxsat = 0x1;
2396     }
2397     return res;
2398 }
2399 
2400 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2401                                uint32_t b)
2402 {
2403     uint32_t res = a - b;
2404     if (res > a) {
2405         res = 0;
2406         env->vxsat = 0x1;
2407     }
2408     return res;
2409 }
2410 
2411 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2412                                uint64_t b)
2413 {
2414     uint64_t res = a - b;
2415     if (res > a) {
2416         res = 0;
2417         env->vxsat = 0x1;
2418     }
2419     return res;
2420 }
2421 
2422 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2423 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2424 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2425 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2426 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2427 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2428 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2429 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2430 
2431 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2432 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2433 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2434 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2435 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2436 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2437 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2438 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2439 
2440 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2441 {
2442     int8_t res = a - b;
2443     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2444         res = a >= 0 ? INT8_MAX : INT8_MIN;
2445         env->vxsat = 0x1;
2446     }
2447     return res;
2448 }
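
/*
 * For subtraction the test is (res ^ a) & (a ^ b): overflow can only happen
 * when a and b have opposite signs, and then the wrapped difference takes
 * the sign of b rather than a.  For example a = -100, b = 50 wraps to
 * res = 106, so the result saturates to INT8_MIN.
 */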
2449 
2450 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2451 {
2452     int16_t res = a - b;
2453     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2454         res = a >= 0 ? INT16_MAX : INT16_MIN;
2455         env->vxsat = 0x1;
2456     }
2457     return res;
2458 }
2459 
2460 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2461 {
2462     int32_t res = a - b;
2463     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2464         res = a >= 0 ? INT32_MAX : INT32_MIN;
2465         env->vxsat = 0x1;
2466     }
2467     return res;
2468 }
2469 
2470 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2471 {
2472     int64_t res = a - b;
2473     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2474         res = a >= 0 ? INT64_MAX : INT64_MIN;
2475         env->vxsat = 0x1;
2476     }
2477     return res;
2478 }
2479 
2480 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2481 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2482 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2483 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2484 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2485 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2486 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2487 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2488 
2489 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2490 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2491 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2492 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2493 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2494 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2495 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2496 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2497 
2498 /* Vector Single-Width Averaging Add and Subtract */
2499 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2500 {
2501     uint8_t d = extract64(v, shift, 1);
2502     uint8_t d1;
2503     uint64_t D1, D2;
2504 
2505     if (shift == 0 || shift > 64) {
2506         return 0;
2507     }
2508 
2509     d1 = extract64(v, shift - 1, 1);
2510     D1 = extract64(v, 0, shift);
2511     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2512         return d1;
2513     } else if (vxrm == 1) { /* round-to-nearest-even */
2514         if (shift > 1) {
2515             D2 = extract64(v, 0, shift - 1);
2516             return d1 & ((D2 != 0) | d);
2517         } else {
2518             return d1 & d;
2519         }
2520     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2521         return !d & (D1 != 0);
2522     }
2523     return 0; /* round-down (truncate) */
2524 }
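
/*
 * Example: v = 0b1011 (11) shifted right by 2 truncates to 2, with a true
 * quotient of 2.75.  Here d = 0 (bit 2), d1 = 1 (bit 1), D1 = 3, D2 = 1,
 * so the rounding increment is:
 *   rnu (0): d1                    = 1  -> 3
 *   rne (1): d1 & ((D2 != 0) | d)  = 1  -> 3
 *   rdn (2): 0                          -> 2
 *   rod (3): !d & (D1 != 0)        = 1  -> 3
 */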
2525 
2526 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2527 {
2528     int64_t res = (int64_t)a + b;
2529     uint8_t round = get_round(vxrm, res, 1);
2530 
2531     return (res >> 1) + round;
2532 }
2533 
2534 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2535 {
2536     int64_t res = a + b;
2537     uint8_t round = get_round(vxrm, res, 1);
2538     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2539 
2540     /* With signed overflow, bit 64 is inverse of bit 63. */
2541     return ((res >> 1) ^ over) + round;
2542 }
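
/*
 * The XOR with 'over' repairs the lost 65th bit of the sum: e.g.
 * a = b = INT64_MAX wraps to res = -2 with 'over' set, and
 * ((res >> 1) ^ INT64_MIN) + round recovers INT64_MAX, the true average.
 */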
2543 
2544 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2545 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2546 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2547 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2548 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2549 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2550 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2551 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2552 
2553 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2554 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2555 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2556 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2557 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2558 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2559 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2560 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2561 
2562 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2563                                uint32_t a, uint32_t b)
2564 {
2565     uint64_t res = (uint64_t)a + b;
2566     uint8_t round = get_round(vxrm, res, 1);
2567 
2568     return (res >> 1) + round;
2569 }
2570 
2571 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2572                                uint64_t a, uint64_t b)
2573 {
2574     uint64_t res = a + b;
2575     uint8_t round = get_round(vxrm, res, 1);
2576     uint64_t over = (uint64_t)(res < a) << 63;
2577 
2578     return ((res >> 1) | over) + round;
2579 }
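/*
 * Editorial note: here the carry out of the 64-bit addition is recovered as
 * (res < a) and OR-ed in as bit 63 of the halved result, so the full 65-bit
 * unsigned sum is averaged without needing a wider type.
 */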
2580 
2581 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2582 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2583 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2584 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2585 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2586 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2587 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2588 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2589 
2590 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2591 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2592 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2593 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2594 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2595 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2596 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2597 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2598 
2599 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2600 {
2601     int64_t res = (int64_t)a - b;
2602     uint8_t round = get_round(vxrm, res, 1);
2603 
2604     return (res >> 1) + round;
2605 }
2606 
2607 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2608 {
2609     int64_t res = (int64_t)a - b;
2610     uint8_t round = get_round(vxrm, res, 1);
2611     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2612 
2613     /* With signed overflow, bit 64 is inverse of bit 63. */
2614     return ((res >> 1) ^ over) + round;
2615 }
2616 
2617 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2618 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2619 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2620 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2621 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2622 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2623 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2624 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2625 
2626 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2627 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2628 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2629 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2630 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2631 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2632 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2633 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2634 
2635 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2636                                uint32_t a, uint32_t b)
2637 {
2638     int64_t res = (int64_t)a - b;
2639     uint8_t round = get_round(vxrm, res, 1);
2640 
2641     return (res >> 1) + round;
2642 }
2643 
2644 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2645                                uint64_t a, uint64_t b)
2646 {
2647     uint64_t res = (uint64_t)a - b;
2648     uint8_t round = get_round(vxrm, res, 1);
2649     uint64_t over = (uint64_t)(res > a) << 63;
2650 
2651     return ((res >> 1) | over) + round;
2652 }
2653 
2654 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2655 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2656 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2657 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2658 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2659 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2660 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2661 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2662 
2663 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2664 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2665 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2666 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2667 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2668 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2669 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2670 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2671 
2672 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2673 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2674 {
2675     uint8_t round;
2676     int16_t res;
2677 
2678     res = (int16_t)a * (int16_t)b;
2679     round = get_round(vxrm, res, 7);
2680     res   = (res >> 7) + round;
2681 
2682     if (res > INT8_MAX) {
2683         env->vxsat = 0x1;
2684         return INT8_MAX;
2685     } else if (res < INT8_MIN) {
2686         env->vxsat = 0x1;
2687         return INT8_MIN;
2688     } else {
2689         return res;
2690     }
2691 }
2692 
2693 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2694 {
2695     uint8_t round;
2696     int32_t res;
2697 
2698     res = (int32_t)a * (int32_t)b;
2699     round = get_round(vxrm, res, 15);
2700     res   = (res >> 15) + round;
2701 
2702     if (res > INT16_MAX) {
2703         env->vxsat = 0x1;
2704         return INT16_MAX;
2705     } else if (res < INT16_MIN) {
2706         env->vxsat = 0x1;
2707         return INT16_MIN;
2708     } else {
2709         return res;
2710     }
2711 }
2712 
2713 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2714 {
2715     uint8_t round;
2716     int64_t res;
2717 
2718     res = (int64_t)a * (int64_t)b;
2719     round = get_round(vxrm, res, 31);
2720     res   = (res >> 31) + round;
2721 
2722     if (res > INT32_MAX) {
2723         env->vxsat = 0x1;
2724         return INT32_MAX;
2725     } else if (res < INT32_MIN) {
2726         env->vxsat = 0x1;
2727         return INT32_MIN;
2728     } else {
2729         return res;
2730     }
2731 }
2732 
2733 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2734 {
2735     uint8_t round;
2736     uint64_t hi_64, lo_64;
2737     int64_t res;
2738 
2739     if (a == INT64_MIN && b == INT64_MIN) {
2740         env->vxsat = 1;
2741         return INT64_MAX;
2742     }
2743 
2744     muls64(&lo_64, &hi_64, a, b);
2745     round = get_round(vxrm, lo_64, 63);
2746     /*
2747      * Cannot overflow: with INT64_MIN * INT64_MIN excluded above, the
2748      * 128-bit product always has at least two sign bits.
2749      */
2750     res = (hi_64 << 1) | (lo_64 >> 63);
2751     if (round) {
2752         if (res == INT64_MAX) {
2753             env->vxsat = 1;
2754         } else {
2755             res += 1;
2756         }
2757     }
2758     return res;
2759 }
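/*
 * Editorial note: vsmul treats its operands as signed fixed-point fractions
 * and returns (a * b) >> (SEW - 1) with vxrm rounding and saturation; only
 * (-1.0) * (-1.0), i.e. INT_MIN * INT_MIN, can overflow the destination.
 * Worked example in Q7: vsmul8(0x40, 0x40) gives 0x1000 >> 7 = 0x20,
 * i.e. 0.5 * 0.5 = 0.25.
 */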
2760 
2761 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2762 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2763 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2764 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2765 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2766 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2767 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2768 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2769 
2770 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2771 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2772 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2773 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2774 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2775 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2776 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2777 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2778 
2779 /* Vector Single-Width Scaling Shift Instructions */
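/*
 * Editorial note: the scaling shifts below compute a >> (b & (SEW - 1)) and
 * add the rounding increment chosen by get_round() for the current vxrm;
 * vssrl uses a logical (unsigned) shift, vssra an arithmetic (signed) one.
 */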
2780 static inline uint8_t
2781 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2782 {
2783     uint8_t round, shift = b & 0x7;
2784     uint8_t res;
2785 
2786     round = get_round(vxrm, a, shift);
2787     res   = (a >> shift)  + round;
2788     return res;
2789 }
2790 static inline uint16_t
2791 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2792 {
2793     uint8_t round, shift = b & 0xf;
2794     uint16_t res;
2795 
2796     round = get_round(vxrm, a, shift);
2797     res   = (a >> shift)  + round;
2798     return res;
2799 }
2800 static inline uint32_t
2801 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2802 {
2803     uint8_t round, shift = b & 0x1f;
2804     uint32_t res;
2805 
2806     round = get_round(vxrm, a, shift);
2807     res   = (a >> shift)  + round;
2808     return res;
2809 }
2810 static inline uint64_t
2811 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2812 {
2813     uint8_t round, shift = b & 0x3f;
2814     uint64_t res;
2815 
2816     round = get_round(vxrm, a, shift);
2817     res   = (a >> shift)  + round;
2818     return res;
2819 }
2820 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2821 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2822 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2823 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2824 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2825 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2826 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2827 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2828 
2829 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2830 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2831 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2832 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2833 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2834 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2835 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2836 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2837 
2838 static inline int8_t
2839 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2840 {
2841     uint8_t round, shift = b & 0x7;
2842     int8_t res;
2843 
2844     round = get_round(vxrm, a, shift);
2845     res   = (a >> shift)  + round;
2846     return res;
2847 }
2848 static inline int16_t
2849 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2850 {
2851     uint8_t round, shift = b & 0xf;
2852     int16_t res;
2853 
2854     round = get_round(vxrm, a, shift);
2855     res   = (a >> shift)  + round;
2856     return res;
2857 }
2858 static inline int32_t
2859 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2860 {
2861     uint8_t round, shift = b & 0x1f;
2862     int32_t res;
2863 
2864     round = get_round(vxrm, a, shift);
2865     res   = (a >> shift)  + round;
2866     return res;
2867 }
2868 static inline int64_t
2869 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2870 {
2871     uint8_t round, shift = b & 0x3f;
2872     int64_t res;
2873 
2874     round = get_round(vxrm, a, shift);
2875     res   = (a >> shift)  + round;
2876     return res;
2877 }
2878 
2879 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2880 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2881 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2882 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2883 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2884 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2885 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2886 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2887 
2888 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2889 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2890 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2891 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2892 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2893 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2894 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2895 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2896 
2897 /* Vector Narrowing Fixed-Point Clip Instructions */
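/*
 * Editorial note: the narrowing clips take a 2*SEW-wide source element,
 * apply a scaling right shift of b & (2*SEW - 1) with vxrm rounding, then
 * saturate to the SEW-wide signed (vnclip) or unsigned (vnclipu) range,
 * setting vxsat when the value is clipped.
 */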
2898 static inline int8_t
2899 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2900 {
2901     uint8_t round, shift = b & 0xf;
2902     int16_t res;
2903 
2904     round = get_round(vxrm, a, shift);
2905     res   = (a >> shift)  + round;
2906     if (res > INT8_MAX) {
2907         env->vxsat = 0x1;
2908         return INT8_MAX;
2909     } else if (res < INT8_MIN) {
2910         env->vxsat = 0x1;
2911         return INT8_MIN;
2912     } else {
2913         return res;
2914     }
2915 }
2916 
2917 static inline int16_t
2918 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2919 {
2920     uint8_t round, shift = b & 0x1f;
2921     int32_t res;
2922 
2923     round = get_round(vxrm, a, shift);
2924     res   = (a >> shift)  + round;
2925     if (res > INT16_MAX) {
2926         env->vxsat = 0x1;
2927         return INT16_MAX;
2928     } else if (res < INT16_MIN) {
2929         env->vxsat = 0x1;
2930         return INT16_MIN;
2931     } else {
2932         return res;
2933     }
2934 }
2935 
2936 static inline int32_t
2937 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2938 {
2939     uint8_t round, shift = b & 0x3f;
2940     int64_t res;
2941 
2942     round = get_round(vxrm, a, shift);
2943     res   = (a >> shift)  + round;
2944     if (res > INT32_MAX) {
2945         env->vxsat = 0x1;
2946         return INT32_MAX;
2947     } else if (res < INT32_MIN) {
2948         env->vxsat = 0x1;
2949         return INT32_MIN;
2950     } else {
2951         return res;
2952     }
2953 }
2954 
2955 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2956 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2957 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2958 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2959 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2960 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2961 
2962 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2963 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2964 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2965 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2966 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2967 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2968 
2969 static inline uint8_t
2970 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2971 {
2972     uint8_t round, shift = b & 0xf;
2973     uint16_t res;
2974 
2975     round = get_round(vxrm, a, shift);
2976     res   = (a >> shift)  + round;
2977     if (res > UINT8_MAX) {
2978         env->vxsat = 0x1;
2979         return UINT8_MAX;
2980     } else {
2981         return res;
2982     }
2983 }
2984 
2985 static inline uint16_t
2986 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2987 {
2988     uint8_t round, shift = b & 0x1f;
2989     uint32_t res;
2990 
2991     round = get_round(vxrm, a, shift);
2992     res   = (a >> shift)  + round;
2993     if (res > UINT16_MAX) {
2994         env->vxsat = 0x1;
2995         return UINT16_MAX;
2996     } else {
2997         return res;
2998     }
2999 }
3000 
3001 static inline uint32_t
3002 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3003 {
3004     uint8_t round, shift = b & 0x3f;
3005     uint64_t res;
3006 
3007     round = get_round(vxrm, a, shift);
3008     res   = (a >> shift)  + round;
3009     if (res > UINT32_MAX) {
3010         env->vxsat = 0x1;
3011         return UINT32_MAX;
3012     } else {
3013         return res;
3014     }
3015 }
3016 
3017 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3018 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3019 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3020 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3021 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3022 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3023 
3024 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3025 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3026 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3027 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3028 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3029 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3030 
3031 /*
3032  *** Vector Floating-Point Arithmetic Instructions
3033  */
3034 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3035 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3036 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3037                       CPURISCVState *env)                      \
3038 {                                                              \
3039     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3040     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3041     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3042 }
3043 
3044 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3045 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3046                   void *vs2, CPURISCVState *env,          \
3047                   uint32_t desc)                          \
3048 {                                                         \
3049     uint32_t vm = vext_vm(desc);                          \
3050     uint32_t vl = env->vl;                                \
3051     uint32_t total_elems =                                \
3052         vext_get_total_elems(env, desc, ESZ);             \
3053     uint32_t vta = vext_vta(desc);                        \
3054     uint32_t i;                                           \
3055                                                           \
3056     for (i = env->vstart; i < vl; i++) {                  \
3057         if (!vm && !vext_elem_mask(v0, i)) {              \
3058             continue;                                     \
3059         }                                                 \
3060         do_##NAME(vd, vs1, vs2, i, env);                  \
3061     }                                                     \
3062     env->vstart = 0;                                      \
3063     /* set tail elements to 1s */                         \
3064     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3065                       total_elems * ESZ);                 \
3066 }
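/*
 * Editorial note: this is the common skeleton of the floating-point
 * vector-vector helpers: loop from env->vstart to vl, skip elements whose
 * mask bit is clear when vm == 0, reset vstart, and finally set the tail
 * elements to all-1s as directed by the tail-agnostic bit (vta).
 */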
3067 
3068 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3069 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3070 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3071 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3072 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3073 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3074 
3075 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3076 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3077                       CPURISCVState *env)                      \
3078 {                                                              \
3079     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3080     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3081 }
3082 
3083 #define GEN_VEXT_VF(NAME, ESZ)                            \
3084 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3085                   void *vs2, CPURISCVState *env,          \
3086                   uint32_t desc)                          \
3087 {                                                         \
3088     uint32_t vm = vext_vm(desc);                          \
3089     uint32_t vl = env->vl;                                \
3090     uint32_t total_elems =                                \
3091         vext_get_total_elems(env, desc, ESZ);             \
3092     uint32_t vta = vext_vta(desc);                        \
3093     uint32_t i;                                           \
3094                                                           \
3095     for (i = env->vstart; i < vl; i++) {                  \
3096         if (!vm && !vext_elem_mask(v0, i)) {              \
3097             continue;                                     \
3098         }                                                 \
3099         do_##NAME(vd, s1, vs2, i, env);                   \
3100     }                                                     \
3101     env->vstart = 0;                                      \
3102     /* set tail elements to 1s */                         \
3103     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3104                       total_elems * ESZ);                 \
3105 }
3106 
3107 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3108 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3109 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3110 GEN_VEXT_VF(vfadd_vf_h, 2)
3111 GEN_VEXT_VF(vfadd_vf_w, 4)
3112 GEN_VEXT_VF(vfadd_vf_d, 8)
3113 
3114 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3115 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3116 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3117 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3118 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3119 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3120 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3121 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3122 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3123 GEN_VEXT_VF(vfsub_vf_h, 2)
3124 GEN_VEXT_VF(vfsub_vf_w, 4)
3125 GEN_VEXT_VF(vfsub_vf_d, 8)
3126 
3127 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3128 {
3129     return float16_sub(b, a, s);
3130 }
3131 
3132 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3133 {
3134     return float32_sub(b, a, s);
3135 }
3136 
3137 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3138 {
3139     return float64_sub(b, a, s);
3140 }
3141 
3142 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3143 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3144 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3145 GEN_VEXT_VF(vfrsub_vf_h, 2)
3146 GEN_VEXT_VF(vfrsub_vf_w, 4)
3147 GEN_VEXT_VF(vfrsub_vf_d, 8)
3148 
3149 /* Vector Widening Floating-Point Add/Subtract Instructions */
3150 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3151 {
3152     return float32_add(float16_to_float32(a, true, s),
3153             float16_to_float32(b, true, s), s);
3154 }
3155 
3156 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3157 {
3158     return float64_add(float32_to_float64(a, s),
3159             float32_to_float64(b, s), s);
3161 }
3162 
3163 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3164 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3165 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3166 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3167 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3168 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3169 GEN_VEXT_VF(vfwadd_vf_h, 4)
3170 GEN_VEXT_VF(vfwadd_vf_w, 8)
3171 
3172 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3173 {
3174     return float32_sub(float16_to_float32(a, true, s),
3175             float16_to_float32(b, true, s), s);
3176 }
3177 
3178 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3179 {
3180     return float64_sub(float32_to_float64(a, s),
3181             float32_to_float64(b, s), s);
3183 }
3184 
3185 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3186 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3187 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3188 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3189 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3190 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3191 GEN_VEXT_VF(vfwsub_vf_h, 4)
3192 GEN_VEXT_VF(vfwsub_vf_w, 8)
3193 
3194 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3195 {
3196     return float32_add(a, float16_to_float32(b, true, s), s);
3197 }
3198 
3199 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3200 {
3201     return float64_add(a, float32_to_float64(b, s), s);
3202 }
3203 
3204 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3205 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3206 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3207 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3208 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3209 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3210 GEN_VEXT_VF(vfwadd_wf_h, 4)
3211 GEN_VEXT_VF(vfwadd_wf_w, 8)
3212 
3213 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3214 {
3215     return float32_sub(a, float16_to_float32(b, true, s), s);
3216 }
3217 
3218 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3219 {
3220     return float64_sub(a, float32_to_float64(b, s), s);
3221 }
3222 
3223 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3224 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3225 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3226 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3227 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3228 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3229 GEN_VEXT_VF(vfwsub_wf_h, 4)
3230 GEN_VEXT_VF(vfwsub_wf_w, 8)
3231 
3232 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3233 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3234 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3235 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3236 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3237 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3238 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3239 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3240 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3241 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3242 GEN_VEXT_VF(vfmul_vf_h, 2)
3243 GEN_VEXT_VF(vfmul_vf_w, 4)
3244 GEN_VEXT_VF(vfmul_vf_d, 8)
3245 
3246 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3247 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3248 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3249 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3250 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3251 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3252 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3253 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3254 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3255 GEN_VEXT_VF(vfdiv_vf_h, 2)
3256 GEN_VEXT_VF(vfdiv_vf_w, 4)
3257 GEN_VEXT_VF(vfdiv_vf_d, 8)
3258 
3259 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3260 {
3261     return float16_div(b, a, s);
3262 }
3263 
3264 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3265 {
3266     return float32_div(b, a, s);
3267 }
3268 
3269 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3270 {
3271     return float64_div(b, a, s);
3272 }
3273 
3274 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3275 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3276 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3277 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3278 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3279 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3280 
3281 /* Vector Widening Floating-Point Multiply */
3282 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3283 {
3284     return float32_mul(float16_to_float32(a, true, s),
3285             float16_to_float32(b, true, s), s);
3286 }
3287 
3288 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3289 {
3290     return float64_mul(float32_to_float64(a, s),
3291             float32_to_float64(b, s), s);
3293 }
3294 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3295 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3296 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3297 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3298 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3299 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3300 GEN_VEXT_VF(vfwmul_vf_h, 4)
3301 GEN_VEXT_VF(vfwmul_vf_w, 8)
3302 
3303 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3304 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3305 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3306         CPURISCVState *env)                                        \
3307 {                                                                  \
3308     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3309     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3310     TD d = *((TD *)vd + HD(i));                                    \
3311     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3312 }
3313 
3314 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3315 {
3316     return float16_muladd(a, b, d, 0, s);
3317 }
3318 
3319 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3320 {
3321     return float32_muladd(a, b, d, 0, s);
3322 }
3323 
3324 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3325 {
3326     return float64_muladd(a, b, d, 0, s);
3327 }
3328 
3329 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3330 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3331 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3332 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3333 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3334 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3335 
3336 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3337 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3338         CPURISCVState *env)                                       \
3339 {                                                                 \
3340     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3341     TD d = *((TD *)vd + HD(i));                                   \
3342     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3343 }
3344 
3345 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3346 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3347 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3348 GEN_VEXT_VF(vfmacc_vf_h, 2)
3349 GEN_VEXT_VF(vfmacc_vf_w, 4)
3350 GEN_VEXT_VF(vfmacc_vf_d, 8)
3351 
3352 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3353 {
3354     return float16_muladd(a, b, d,
3355             float_muladd_negate_c | float_muladd_negate_product, s);
3356 }
3357 
3358 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3359 {
3360     return float32_muladd(a, b, d,
3361             float_muladd_negate_c | float_muladd_negate_product, s);
3362 }
3363 
3364 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3365 {
3366     return float64_muladd(a, b, d,
3367             float_muladd_negate_c | float_muladd_negate_product, s);
3368 }
3369 
3370 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3371 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3372 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3373 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3374 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3375 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3376 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3377 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3378 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3379 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3380 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3381 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3382 
3383 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3384 {
3385     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3386 }
3387 
3388 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3389 {
3390     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3391 }
3392 
3393 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3394 {
3395     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3396 }
3397 
3398 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3399 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3400 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3401 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3402 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3403 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3404 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3405 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3406 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3407 GEN_VEXT_VF(vfmsac_vf_h, 2)
3408 GEN_VEXT_VF(vfmsac_vf_w, 4)
3409 GEN_VEXT_VF(vfmsac_vf_d, 8)
3410 
3411 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3412 {
3413     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3414 }
3415 
3416 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3417 {
3418     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3419 }
3420 
3421 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3422 {
3423     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3424 }
3425 
3426 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3427 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3428 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3429 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3430 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3431 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3432 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3433 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3434 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3435 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3436 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3437 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3438 
3439 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3440 {
3441     return float16_muladd(d, b, a, 0, s);
3442 }
3443 
3444 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3445 {
3446     return float32_muladd(d, b, a, 0, s);
3447 }
3448 
3449 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3450 {
3451     return float64_muladd(d, b, a, 0, s);
3452 }
3453 
3454 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3455 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3456 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3457 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3458 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3459 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3460 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3461 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3462 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3463 GEN_VEXT_VF(vfmadd_vf_h, 2)
3464 GEN_VEXT_VF(vfmadd_vf_w, 4)
3465 GEN_VEXT_VF(vfmadd_vf_d, 8)
3466 
3467 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3468 {
3469     return float16_muladd(d, b, a,
3470             float_muladd_negate_c | float_muladd_negate_product, s);
3471 }
3472 
3473 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3474 {
3475     return float32_muladd(d, b, a,
3476             float_muladd_negate_c | float_muladd_negate_product, s);
3477 }
3478 
3479 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3480 {
3481     return float64_muladd(d, b, a,
3482             float_muladd_negate_c | float_muladd_negate_product, s);
3483 }
3484 
3485 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3486 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3487 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3488 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3489 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3490 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3491 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3492 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3493 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3494 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3495 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3496 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3497 
3498 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3499 {
3500     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3501 }
3502 
3503 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3504 {
3505     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3506 }
3507 
3508 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3509 {
3510     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3511 }
3512 
3513 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3514 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3515 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3516 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3517 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3518 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3519 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3520 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3521 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3522 GEN_VEXT_VF(vfmsub_vf_h, 2)
3523 GEN_VEXT_VF(vfmsub_vf_w, 4)
3524 GEN_VEXT_VF(vfmsub_vf_d, 8)
3525 
3526 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3527 {
3528     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3529 }
3530 
3531 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3532 {
3533     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3534 }
3535 
3536 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3537 {
3538     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3539 }
3540 
3541 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3542 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3543 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3544 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3545 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3546 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3547 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3548 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3549 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3550 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3551 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3552 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3553 
3554 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3555 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3556 {
3557     return float32_muladd(float16_to_float32(a, true, s),
3558                         float16_to_float32(b, true, s), d, 0, s);
3559 }
3560 
3561 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3562 {
3563     return float64_muladd(float32_to_float64(a, s),
3564                         float32_to_float64(b, s), d, 0, s);
3565 }
3566 
3567 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3568 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3569 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3570 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3571 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3572 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3573 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3574 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3575 
3576 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3577 {
3578     return float32_muladd(float16_to_float32(a, true, s),
3579                         float16_to_float32(b, true, s), d,
3580                         float_muladd_negate_c | float_muladd_negate_product, s);
3581 }
3582 
3583 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3584 {
3585     return float64_muladd(float32_to_float64(a, s),
3586                         float32_to_float64(b, s), d,
3587                         float_muladd_negate_c | float_muladd_negate_product, s);
3588 }
3589 
3590 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3591 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3592 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3593 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3594 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3595 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3596 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3597 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3598 
3599 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3600 {
3601     return float32_muladd(float16_to_float32(a, true, s),
3602                         float16_to_float32(b, true, s), d,
3603                         float_muladd_negate_c, s);
3604 }
3605 
3606 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3607 {
3608     return float64_muladd(float32_to_float64(a, s),
3609                         float32_to_float64(b, s), d,
3610                         float_muladd_negate_c, s);
3611 }
3612 
3613 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3614 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3615 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3616 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3617 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3618 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3619 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3620 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3621 
3622 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3623 {
3624     return float32_muladd(float16_to_float32(a, true, s),
3625                         float16_to_float32(b, true, s), d,
3626                         float_muladd_negate_product, s);
3627 }
3628 
3629 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3630 {
3631     return float64_muladd(float32_to_float64(a, s),
3632                         float32_to_float64(b, s), d,
3633                         float_muladd_negate_product, s);
3634 }
3635 
3636 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3637 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3638 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3639 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3640 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3641 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3642 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3643 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3644 
3645 /* Vector Floating-Point Square-Root Instruction */
3646 /* (TD, T2, TX2) */
3647 #define OP_UU_H uint16_t, uint16_t, uint16_t
3648 #define OP_UU_W uint32_t, uint32_t, uint32_t
3649 #define OP_UU_D uint64_t, uint64_t, uint64_t
3650 
3651 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3652 static void do_##NAME(void *vd, void *vs2, int i,      \
3653         CPURISCVState *env)                            \
3654 {                                                      \
3655     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3656     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3657 }
3658 
3659 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3660 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3661         CPURISCVState *env, uint32_t desc)             \
3662 {                                                      \
3663     uint32_t vm = vext_vm(desc);                       \
3664     uint32_t vl = env->vl;                             \
3665     uint32_t total_elems =                             \
3666         vext_get_total_elems(env, desc, ESZ);          \
3667     uint32_t vta = vext_vta(desc);                     \
3668     uint32_t i;                                        \
3669                                                        \
3670     if (vl == 0) {                                     \
3671         return;                                        \
3672     }                                                  \
3673     for (i = env->vstart; i < vl; i++) {               \
3674         if (!vm && !vext_elem_mask(v0, i)) {           \
3675             continue;                                  \
3676         }                                              \
3677         do_##NAME(vd, vs2, i, env);                    \
3678     }                                                  \
3679     env->vstart = 0;                                   \
3680     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3681                       total_elems * ESZ);              \
3682 }
3683 
3684 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3685 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3686 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3687 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3688 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3689 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3690 
3691 /*
3692  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3693  *
3694  * Adapted from riscv-v-spec recip.c:
3695  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3696  */
3697 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3698 {
3699     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3700     uint64_t exp = extract64(f, frac_size, exp_size);
3701     uint64_t frac = extract64(f, 0, frac_size);
3702 
3703     const uint8_t lookup_table[] = {
3704         52, 51, 50, 48, 47, 46, 44, 43,
3705         42, 41, 40, 39, 38, 36, 35, 34,
3706         33, 32, 31, 30, 30, 29, 28, 27,
3707         26, 25, 24, 23, 23, 22, 21, 20,
3708         19, 19, 18, 17, 16, 16, 15, 14,
3709         14, 13, 12, 12, 11, 10, 10, 9,
3710         9, 8, 7, 7, 6, 6, 5, 4,
3711         4, 3, 3, 2, 2, 1, 1, 0,
3712         127, 125, 123, 121, 119, 118, 116, 114,
3713         113, 111, 109, 108, 106, 105, 103, 102,
3714         100, 99, 97, 96, 95, 93, 92, 91,
3715         90, 88, 87, 86, 85, 84, 83, 82,
3716         80, 79, 78, 77, 76, 75, 74, 73,
3717         72, 71, 70, 70, 69, 68, 67, 66,
3718         65, 64, 63, 63, 62, 61, 60, 59,
3719         59, 58, 57, 56, 56, 55, 54, 53
3720     };
3721     const int precision = 7;
3722 
3723     if (exp == 0 && frac != 0) { /* subnormal */
3724         /* Normalize the subnormal. */
3725         while (extract64(frac, frac_size - 1, 1) == 0) {
3726             exp--;
3727             frac <<= 1;
3728         }
3729 
3730         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3731     }
3732 
3733     int idx = ((exp & 1) << (precision - 1)) |
3734                 (frac >> (frac_size - precision + 1));
3735     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3736                             (frac_size - precision);
3737     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3738 
3739     uint64_t val = 0;
3740     val = deposit64(val, 0, frac_size, out_frac);
3741     val = deposit64(val, frac_size, exp_size, out_exp);
3742     val = deposit64(val, frac_size + exp_size, 1, sign);
3743     return val;
3744 }
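/*
 * Editorial note: the estimate is looked up in a 128-entry table indexed by
 * the exponent's parity and the top six fraction bits; the table value fills
 * the top seven bits of the output fraction, and the output exponent works
 * out to (3 * bias - 1 - exp) / 2, as in the riscv-v-spec reference model
 * linked above.
 */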
3745 
3746 static float16 frsqrt7_h(float16 f, float_status *s)
3747 {
3748     int exp_size = 5, frac_size = 10;
3749     bool sign = float16_is_neg(f);
3750 
3751     /*
3752      * frsqrt7(sNaN) = canonical NaN
3753      * frsqrt7(-inf) = canonical NaN
3754      * frsqrt7(-normal) = canonical NaN
3755      * frsqrt7(-subnormal) = canonical NaN
3756      */
3757     if (float16_is_signaling_nan(f, s) ||
3758             (float16_is_infinity(f) && sign) ||
3759             (float16_is_normal(f) && sign) ||
3760             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3761         s->float_exception_flags |= float_flag_invalid;
3762         return float16_default_nan(s);
3763     }
3764 
3765     /* frsqrt7(qNaN) = canonical NaN */
3766     if (float16_is_quiet_nan(f, s)) {
3767         return float16_default_nan(s);
3768     }
3769 
3770     /* frsqrt7(+-0) = +-inf */
3771     if (float16_is_zero(f)) {
3772         s->float_exception_flags |= float_flag_divbyzero;
3773         return float16_set_sign(float16_infinity, sign);
3774     }
3775 
3776     /* frsqrt7(+inf) = +0 */
3777     if (float16_is_infinity(f) && !sign) {
3778         return float16_set_sign(float16_zero, sign);
3779     }
3780 
3781     /* +normal, +subnormal */
3782     uint64_t val = frsqrt7(f, exp_size, frac_size);
3783     return make_float16(val);
3784 }
3785 
3786 static float32 frsqrt7_s(float32 f, float_status *s)
3787 {
3788     int exp_size = 8, frac_size = 23;
3789     bool sign = float32_is_neg(f);
3790 
3791     /*
3792      * frsqrt7(sNaN) = canonical NaN
3793      * frsqrt7(-inf) = canonical NaN
3794      * frsqrt7(-normal) = canonical NaN
3795      * frsqrt7(-subnormal) = canonical NaN
3796      */
3797     if (float32_is_signaling_nan(f, s) ||
3798             (float32_is_infinity(f) && sign) ||
3799             (float32_is_normal(f) && sign) ||
3800             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3801         s->float_exception_flags |= float_flag_invalid;
3802         return float32_default_nan(s);
3803     }
3804 
3805     /* frsqrt7(qNaN) = canonical NaN */
3806     if (float32_is_quiet_nan(f, s)) {
3807         return float32_default_nan(s);
3808     }
3809 
3810     /* frsqrt7(+-0) = +-inf */
3811     if (float32_is_zero(f)) {
3812         s->float_exception_flags |= float_flag_divbyzero;
3813         return float32_set_sign(float32_infinity, sign);
3814     }
3815 
3816     /* frsqrt7(+inf) = +0 */
3817     if (float32_is_infinity(f) && !sign) {
3818         return float32_set_sign(float32_zero, sign);
3819     }
3820 
3821     /* +normal, +subnormal */
3822     uint64_t val = frsqrt7(f, exp_size, frac_size);
3823     return make_float32(val);
3824 }
3825 
3826 static float64 frsqrt7_d(float64 f, float_status *s)
3827 {
3828     int exp_size = 11, frac_size = 52;
3829     bool sign = float64_is_neg(f);
3830 
3831     /*
3832      * frsqrt7(sNaN) = canonical NaN
3833      * frsqrt7(-inf) = canonical NaN
3834      * frsqrt7(-normal) = canonical NaN
3835      * frsqrt7(-subnormal) = canonical NaN
3836      */
3837     if (float64_is_signaling_nan(f, s) ||
3838             (float64_is_infinity(f) && sign) ||
3839             (float64_is_normal(f) && sign) ||
3840             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3841         s->float_exception_flags |= float_flag_invalid;
3842         return float64_default_nan(s);
3843     }
3844 
3845     /* frsqrt7(qNaN) = canonical NaN */
3846     if (float64_is_quiet_nan(f, s)) {
3847         return float64_default_nan(s);
3848     }
3849 
3850     /* frsqrt7(+-0) = +-inf */
3851     if (float64_is_zero(f)) {
3852         s->float_exception_flags |= float_flag_divbyzero;
3853         return float64_set_sign(float64_infinity, sign);
3854     }
3855 
3856     /* frsqrt7(+inf) = +0 */
3857     if (float64_is_infinity(f) && !sign) {
3858         return float64_set_sign(float64_zero, sign);
3859     }
3860 
3861     /* +normal, +subnormal */
3862     uint64_t val = frsqrt7(f, exp_size, frac_size);
3863     return make_float64(val);
3864 }
3865 
3866 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3867 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3868 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3869 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3870 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3871 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3872 
3873 /*
3874  * Vector Floating-Point Reciprocal Estimate Instruction
3875  *
3876  * Adapted from riscv-v-spec recip.c:
3877  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3878  */
3879 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3880                       float_status *s)
3881 {
3882     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3883     uint64_t exp = extract64(f, frac_size, exp_size);
3884     uint64_t frac = extract64(f, 0, frac_size);
3885 
3886     const uint8_t lookup_table[] = {
3887         127, 125, 123, 121, 119, 117, 116, 114,
3888         112, 110, 109, 107, 105, 104, 102, 100,
3889         99, 97, 96, 94, 93, 91, 90, 88,
3890         87, 85, 84, 83, 81, 80, 79, 77,
3891         76, 75, 74, 72, 71, 70, 69, 68,
3892         66, 65, 64, 63, 62, 61, 60, 59,
3893         58, 57, 56, 55, 54, 53, 52, 51,
3894         50, 49, 48, 47, 46, 45, 44, 43,
3895         42, 41, 40, 40, 39, 38, 37, 36,
3896         35, 35, 34, 33, 32, 31, 31, 30,
3897         29, 28, 28, 27, 26, 25, 25, 24,
3898         23, 23, 22, 21, 21, 20, 19, 19,
3899         18, 17, 17, 16, 15, 15, 14, 14,
3900         13, 12, 12, 11, 11, 10, 9, 9,
3901         8, 8, 7, 7, 6, 5, 5, 4,
3902         4, 3, 3, 2, 2, 1, 1, 0
3903     };
3904     const int precision = 7;
3905 
3906     if (exp == 0 && frac != 0) { /* subnormal */
3907         /* Normalize the subnormal. */
3908         while (extract64(frac, frac_size - 1, 1) == 0) {
3909             exp--;
3910             frac <<= 1;
3911         }
3912 
3913         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3914 
3915         if (exp != 0 && exp != UINT64_MAX) {
3916             /*
3917              * Overflow to inf or max value of same sign,
3918              * depending on sign and rounding mode.
3919              */
3920             s->float_exception_flags |= (float_flag_inexact |
3921                                          float_flag_overflow);
3922 
3923             if ((s->float_rounding_mode == float_round_to_zero) ||
3924                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3925                 ((s->float_rounding_mode == float_round_up) && sign)) {
3926                 /* Return the largest finite value of the same sign. */
3927                 return (sign << (exp_size + frac_size)) |
3928                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3929             } else {
3930                 /* Return +-inf. */
3931                 return (sign << (exp_size + frac_size)) |
3932                     MAKE_64BIT_MASK(frac_size, exp_size);
3933             }
3934         }
3935     }
3936 
3937     int idx = frac >> (frac_size - precision);
3938     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3939                             (frac_size - precision);
3940     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3941 
3942     if (out_exp == 0 || out_exp == UINT64_MAX) {
3943         /*
3944          * The result is subnormal, but don't raise the underflow exception,
3945          * because there's no additional loss of precision.
3946          */
3947         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3948         if (out_exp == UINT64_MAX) {
3949             out_frac >>= 1;
3950             out_exp = 0;
3951         }
3952     }
3953 
3954     uint64_t val = 0;
3955     val = deposit64(val, 0, frac_size, out_frac);
3956     val = deposit64(val, frac_size, exp_size, out_exp);
3957     val = deposit64(val, frac_size + exp_size, 1, sign);
3958     return val;
3959 }
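/*
 * Editorial note: frec7 indexes its 128-entry table with the top seven
 * fraction bits and uses an output exponent of 2 * bias - 1 - exp; the two
 * special branches above cover inputs so small that the reciprocal
 * overflows, and outputs that fall into the subnormal range.
 */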
3960 
3961 static float16 frec7_h(float16 f, float_status *s)
3962 {
3963     int exp_size = 5, frac_size = 10;
3964     bool sign = float16_is_neg(f);
3965 
3966     /* frec7(+-inf) = +-0 */
3967     if (float16_is_infinity(f)) {
3968         return float16_set_sign(float16_zero, sign);
3969     }
3970 
3971     /* frec7(+-0) = +-inf */
3972     if (float16_is_zero(f)) {
3973         s->float_exception_flags |= float_flag_divbyzero;
3974         return float16_set_sign(float16_infinity, sign);
3975     }
3976 
3977     /* frec7(sNaN) = canonical NaN */
3978     if (float16_is_signaling_nan(f, s)) {
3979         s->float_exception_flags |= float_flag_invalid;
3980         return float16_default_nan(s);
3981     }
3982 
3983     /* frec7(qNaN) = canonical NaN */
3984     if (float16_is_quiet_nan(f, s)) {
3985         return float16_default_nan(s);
3986     }
3987 
3988     /* +-normal, +-subnormal */
3989     uint64_t val = frec7(f, exp_size, frac_size, s);
3990     return make_float16(val);
3991 }
3992 
3993 static float32 frec7_s(float32 f, float_status *s)
3994 {
3995     int exp_size = 8, frac_size = 23;
3996     bool sign = float32_is_neg(f);
3997 
3998     /* frec7(+-inf) = +-0 */
3999     if (float32_is_infinity(f)) {
4000         return float32_set_sign(float32_zero, sign);
4001     }
4002 
4003     /* frec7(+-0) = +-inf */
4004     if (float32_is_zero(f)) {
4005         s->float_exception_flags |= float_flag_divbyzero;
4006         return float32_set_sign(float32_infinity, sign);
4007     }
4008 
4009     /* frec7(sNaN) = canonical NaN */
4010     if (float32_is_signaling_nan(f, s)) {
4011         s->float_exception_flags |= float_flag_invalid;
4012         return float32_default_nan(s);
4013     }
4014 
4015     /* frec7(qNaN) = canonical NaN */
4016     if (float32_is_quiet_nan(f, s)) {
4017         return float32_default_nan(s);
4018     }
4019 
4020     /* +-normal, +-subnormal */
4021     uint64_t val = frec7(f, exp_size, frac_size, s);
4022     return make_float32(val);
4023 }
4024 
4025 static float64 frec7_d(float64 f, float_status *s)
4026 {
4027     int exp_size = 11, frac_size = 52;
4028     bool sign = float64_is_neg(f);
4029 
4030     /* frec7(+-inf) = +-0 */
4031     if (float64_is_infinity(f)) {
4032         return float64_set_sign(float64_zero, sign);
4033     }
4034 
4035     /* frec7(+-0) = +-inf */
4036     if (float64_is_zero(f)) {
4037         s->float_exception_flags |= float_flag_divbyzero;
4038         return float64_set_sign(float64_infinity, sign);
4039     }
4040 
4041     /* frec7(sNaN) = canonical NaN */
4042     if (float64_is_signaling_nan(f, s)) {
4043         s->float_exception_flags |= float_flag_invalid;
4044         return float64_default_nan(s);
4045     }
4046 
4047     /* frec7(qNaN) = canonical NaN */
4048     if (float64_is_quiet_nan(f, s)) {
4049         return float64_default_nan(s);
4050     }
4051 
4052     /* +-normal, +-subnormal */
4053     uint64_t val = frec7(f, exp_size, frac_size, s);
4054     return make_float64(val);
4055 }
4056 
4057 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4058 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4059 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4060 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4061 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4062 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4063 
4064 /* Vector Floating-Point MIN/MAX Instructions */
4065 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4066 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4067 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4068 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4069 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4070 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4071 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4072 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4073 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4074 GEN_VEXT_VF(vfmin_vf_h, 2)
4075 GEN_VEXT_VF(vfmin_vf_w, 4)
4076 GEN_VEXT_VF(vfmin_vf_d, 8)
4077 
4078 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4079 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4080 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4081 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4082 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4083 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4084 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4085 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4086 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4087 GEN_VEXT_VF(vfmax_vf_h, 2)
4088 GEN_VEXT_VF(vfmax_vf_w, 4)
4089 GEN_VEXT_VF(vfmax_vf_d, 8)
4090 
4091 /* Vector Floating-Point Sign-Injection Instructions */
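     /*
      * In the helpers below, 'a' supplies the exponent and fraction of the
      * result and 'b' supplies the sign: fsgnj* copies sign(b), fsgnjn*
      * copies !sign(b) and fsgnjx* uses sign(a) ^ sign(b).  Only the sign
      * bit of the first deposit64() argument is kept once the result is
      * truncated back to the element type.
      */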
4092 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4093 {
4094     return deposit64(b, 0, 15, a);
4095 }
4096 
4097 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4098 {
4099     return deposit64(b, 0, 31, a);
4100 }
4101 
4102 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4103 {
4104     return deposit64(b, 0, 63, a);
4105 }
4106 
4107 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4108 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4109 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4110 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4111 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4112 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4113 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4114 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4115 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4116 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4117 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4118 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4119 
4120 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122     return deposit64(~b, 0, 15, a);
4123 }
4124 
4125 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4126 {
4127     return deposit64(~b, 0, 31, a);
4128 }
4129 
4130 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132     return deposit64(~b, 0, 63, a);
4133 }
4134 
4135 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4136 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4137 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4138 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4139 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4140 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4141 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4142 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4143 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4144 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4145 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4146 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4147 
4148 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4149 {
4150     return deposit64(b ^ a, 0, 15, a);
4151 }
4152 
4153 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4154 {
4155     return deposit64(b ^ a, 0, 31, a);
4156 }
4157 
4158 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4159 {
4160     return deposit64(b ^ a, 0, 63, a);
4161 }
4162 
4163 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4164 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4165 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4166 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4167 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4168 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4169 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4170 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4171 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4172 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4173 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4174 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4175 
4176 /* Vector Floating-Point Compare Instructions */
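     /*
      * The compare helpers produce a mask: one bit per element is written
      * with DO_OP(vs2[i], vs1[i]) (or with the scalar operand for the _VF
      * forms).  Inactive elements keep their previous mask bit, and with
      * the tail-agnostic all-ones policy the tail bits are set to 1.
      */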
4177 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4178 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4179                   CPURISCVState *env, uint32_t desc)          \
4180 {                                                             \
4181     uint32_t vm = vext_vm(desc);                              \
4182     uint32_t vl = env->vl;                                    \
4183     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
4184     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4185     uint32_t i;                                               \
4186                                                               \
4187     for (i = env->vstart; i < vl; i++) {                      \
4188         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4189         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4190         if (!vm && !vext_elem_mask(v0, i)) {                  \
4191             continue;                                         \
4192         }                                                     \
4193         vext_set_elem_mask(vd, i,                             \
4194                            DO_OP(s2, s1, &env->fp_status));   \
4195     }                                                         \
4196     env->vstart = 0;                                          \
4197     /* mask destination registers are always tail-agnostic */ \
4198     /* set tail elements to 1s */                             \
4199     if (vta_all_1s) {                                         \
4200         for (; i < total_elems; i++) {                        \
4201             vext_set_elem_mask(vd, i, 1);                     \
4202         }                                                     \
4203     }                                                         \
4204 }
4205 
4206 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4207 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4208 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4209 
4210 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4211 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4212                   CPURISCVState *env, uint32_t desc)                \
4213 {                                                                   \
4214     uint32_t vm = vext_vm(desc);                                    \
4215     uint32_t vl = env->vl;                                          \
4216     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
4217     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4218     uint32_t i;                                                     \
4219                                                                     \
4220     for (i = env->vstart; i < vl; i++) {                            \
4221         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4222         if (!vm && !vext_elem_mask(v0, i)) {                        \
4223             continue;                                               \
4224         }                                                           \
4225         vext_set_elem_mask(vd, i,                                   \
4226                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4227     }                                                               \
4228     env->vstart = 0;                                                \
4229     /* mask destination registers are always tail-agnostic */       \
4230     /* set tail elements to 1s */                                   \
4231     if (vta_all_1s) {                                               \
4232         for (; i < total_elems; i++) {                              \
4233             vext_set_elem_mask(vd, i, 1);                           \
4234         }                                                           \
4235     }                                                               \
4236 }
4237 
4238 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4239 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4240 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4241 
4242 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4243 {
4244     FloatRelation compare = float16_compare_quiet(a, b, s);
4245     return compare != float_relation_equal;
4246 }
4247 
4248 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4249 {
4250     FloatRelation compare = float32_compare_quiet(a, b, s);
4251     return compare != float_relation_equal;
4252 }
4253 
4254 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4255 {
4256     FloatRelation compare = float64_compare_quiet(a, b, s);
4257     return compare != float_relation_equal;
4258 }
4259 
4260 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4261 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4262 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4263 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4264 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4265 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4266 
4267 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4268 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4269 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4270 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4271 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4272 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4273 
4274 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4275 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4276 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4277 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4278 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4279 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4280 
4281 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4282 {
4283     FloatRelation compare = float16_compare(a, b, s);
4284     return compare == float_relation_greater;
4285 }
4286 
4287 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4288 {
4289     FloatRelation compare = float32_compare(a, b, s);
4290     return compare == float_relation_greater;
4291 }
4292 
4293 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4294 {
4295     FloatRelation compare = float64_compare(a, b, s);
4296     return compare == float_relation_greater;
4297 }
4298 
4299 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4300 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4301 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4302 
4303 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4304 {
4305     FloatRelation compare = float16_compare(a, b, s);
4306     return compare == float_relation_greater ||
4307            compare == float_relation_equal;
4308 }
4309 
4310 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4311 {
4312     FloatRelation compare = float32_compare(a, b, s);
4313     return compare == float_relation_greater ||
4314            compare == float_relation_equal;
4315 }
4316 
4317 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4318 {
4319     FloatRelation compare = float64_compare(a, b, s);
4320     return compare == float_relation_greater ||
4321            compare == float_relation_equal;
4322 }
4323 
4324 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4325 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4326 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4327 
4328 /* Vector Floating-Point Classify Instruction */
4329 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4330 static void do_##NAME(void *vd, void *vs2, int i)      \
4331 {                                                      \
4332     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4333     *((TD *)vd + HD(i)) = OP(s2);                      \
4334 }
4335 
4336 #define GEN_VEXT_V(NAME, ESZ)                          \
4337 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4338                   CPURISCVState *env, uint32_t desc)   \
4339 {                                                      \
4340     uint32_t vm = vext_vm(desc);                       \
4341     uint32_t vl = env->vl;                             \
4342     uint32_t total_elems =                             \
4343         vext_get_total_elems(env, desc, ESZ);          \
4344     uint32_t vta = vext_vta(desc);                     \
4345     uint32_t i;                                        \
4346                                                        \
4347     for (i = env->vstart; i < vl; i++) {               \
4348         if (!vm && !vext_elem_mask(v0, i)) {           \
4349             continue;                                  \
4350         }                                              \
4351         do_##NAME(vd, vs2, i);                         \
4352     }                                                  \
4353     env->vstart = 0;                                   \
4354     /* set tail elements to 1s */                      \
4355     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4356                       total_elems * ESZ);              \
4357 }
4358 
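     /*
      * fclass result encoding (one bit is set per input, as defined by the
      * RISC-V F/D/Zfh extensions):
      *   bit 0: -inf          bit 5: +subnormal
      *   bit 1: -normal       bit 6: +normal
      *   bit 2: -subnormal    bit 7: +inf
      *   bit 3: -0            bit 8: signaling NaN
      *   bit 4: +0            bit 9: quiet NaN
      */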
4359 target_ulong fclass_h(uint64_t frs1)
4360 {
4361     float16 f = frs1;
4362     bool sign = float16_is_neg(f);
4363 
4364     if (float16_is_infinity(f)) {
4365         return sign ? 1 << 0 : 1 << 7;
4366     } else if (float16_is_zero(f)) {
4367         return sign ? 1 << 3 : 1 << 4;
4368     } else if (float16_is_zero_or_denormal(f)) {
4369         return sign ? 1 << 2 : 1 << 5;
4370     } else if (float16_is_any_nan(f)) {
4371         float_status s = { }; /* for snan_bit_is_one */
4372         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4373     } else {
4374         return sign ? 1 << 1 : 1 << 6;
4375     }
4376 }
4377 
4378 target_ulong fclass_s(uint64_t frs1)
4379 {
4380     float32 f = frs1;
4381     bool sign = float32_is_neg(f);
4382 
4383     if (float32_is_infinity(f)) {
4384         return sign ? 1 << 0 : 1 << 7;
4385     } else if (float32_is_zero(f)) {
4386         return sign ? 1 << 3 : 1 << 4;
4387     } else if (float32_is_zero_or_denormal(f)) {
4388         return sign ? 1 << 2 : 1 << 5;
4389     } else if (float32_is_any_nan(f)) {
4390         float_status s = { }; /* for snan_bit_is_one */
4391         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4392     } else {
4393         return sign ? 1 << 1 : 1 << 6;
4394     }
4395 }
4396 
4397 target_ulong fclass_d(uint64_t frs1)
4398 {
4399     float64 f = frs1;
4400     bool sign = float64_is_neg(f);
4401 
4402     if (float64_is_infinity(f)) {
4403         return sign ? 1 << 0 : 1 << 7;
4404     } else if (float64_is_zero(f)) {
4405         return sign ? 1 << 3 : 1 << 4;
4406     } else if (float64_is_zero_or_denormal(f)) {
4407         return sign ? 1 << 2 : 1 << 5;
4408     } else if (float64_is_any_nan(f)) {
4409         float_status s = { }; /* for snan_bit_is_one */
4410         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4411     } else {
4412         return sign ? 1 << 1 : 1 << 6;
4413     }
4414 }
4415 
4416 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4417 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4418 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4419 GEN_VEXT_V(vfclass_v_h, 2)
4420 GEN_VEXT_V(vfclass_v_w, 4)
4421 GEN_VEXT_V(vfclass_v_d, 8)
4422 
4423 /* Vector Floating-Point Merge Instruction */
4424 
4425 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4426 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4427                   CPURISCVState *env, uint32_t desc)          \
4428 {                                                             \
4429     uint32_t vm = vext_vm(desc);                              \
4430     uint32_t vl = env->vl;                                    \
4431     uint32_t esz = sizeof(ETYPE);                             \
4432     uint32_t total_elems =                                    \
4433         vext_get_total_elems(env, desc, esz);                 \
4434     uint32_t vta = vext_vta(desc);                            \
4435     uint32_t i;                                               \
4436                                                               \
4437     for (i = env->vstart; i < vl; i++) {                      \
4438         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4439         *((ETYPE *)vd + H(i))                                 \
4440           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4441     }                                                         \
4442     env->vstart = 0;                                          \
4443     /* set tail elements to 1s */                             \
4444     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4445 }
4446 
4447 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4448 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4449 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4450 
4451 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4452 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4453 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4454 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4455 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4456 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4457 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4458 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4459 
4460 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4461 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4462 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4463 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4464 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4465 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4466 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4467 
4468 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4469 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4470 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4471 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4472 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4473 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4474 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4475 
4476 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4477 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4478 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4479 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4480 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4481 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4482 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4483 
4484 /* Widening Floating-Point/Integer Type-Convert Instructions */
4485 /* (TD, T2, TX2) */
4486 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4487 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4488 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4489 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4490 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4491 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4492 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4494 
4495 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4496 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4497 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4498 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4499 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4500 
4501 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4502 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4503 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4504 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4505 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4506 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4507 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4508 
4509 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4510 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4511 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4512 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4513 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4514 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4516 
4517 /*
4518  * vfwcvt.f.f.v vd, vs2, vm
4519  * Convert single-width float to double-width float.
4520  */
4521 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4522 {
4523     return float16_to_float32(a, true, s);
4524 }
4525 
4526 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4527 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4528 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4529 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4530 
4531 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4532 /* (TD, T2, TX2) */
4533 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4534 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4535 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4536 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4537 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4538 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4539 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4540 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4541 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4542 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4543 
4544 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4545 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4546 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4547 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4548 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4549 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4550 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4551 
4552 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4553 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4554 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4555 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4556 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4557 
4558 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4559 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4560 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4561 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4562 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4563 
4564 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4565 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4566 {
4567     return float32_to_float16(a, true, s);
4568 }
4569 
4570 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4571 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4572 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4573 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4574 
4575 /*
4576  *** Vector Reduction Operations
4577  */
4578 /* Vector Single-Width Integer Reduction Instructions */
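     /*
      * The reduction helpers fold the active elements of vs2 into the
      * scalar taken from vs1[0] and write the result to vd[0].  For
      * example, an unmasked vredsum.vs with vs1[0] = 10, vs2 = {1, 2, 3, 4}
      * and vl = 4 produces vd[0] = 20.
      */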
4579 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4580 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4581         void *vs2, CPURISCVState *env, uint32_t desc)     \
4582 {                                                         \
4583     uint32_t vm = vext_vm(desc);                          \
4584     uint32_t vl = env->vl;                                \
4585     uint32_t esz = sizeof(TD);                            \
4586     uint32_t vlenb = simd_maxsz(desc);                    \
4587     uint32_t vta = vext_vta(desc);                        \
4588     uint32_t i;                                           \
4589     TD s1 =  *((TD *)vs1 + HD(0));                        \
4590                                                           \
4591     for (i = env->vstart; i < vl; i++) {                  \
4592         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4593         if (!vm && !vext_elem_mask(v0, i)) {              \
4594             continue;                                     \
4595         }                                                 \
4596         s1 = OP(s1, (TD)s2);                              \
4597     }                                                     \
4598     *((TD *)vd + HD(0)) = s1;                             \
4599     env->vstart = 0;                                      \
4600     /* set tail elements to 1s */                         \
4601     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4602 }
4603 
4604 /* vd[0] = sum(vs1[0], vs2[*]) */
4605 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4606 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4607 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4608 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4609 
4610 /* vd[0] = maxu(vs1[0], vs2[*]) */
4611 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4612 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4613 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4614 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4615 
4616 /* vd[0] = max(vs1[0], vs2[*]) */
4617 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4618 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4619 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4620 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4621 
4622 /* vd[0] = minu(vs1[0], vs2[*]) */
4623 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4624 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4625 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4626 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4627 
4628 /* vd[0] = min(vs1[0], vs2[*]) */
4629 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4630 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4631 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4632 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4633 
4634 /* vd[0] = and(vs1[0], vs2[*]) */
4635 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4636 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4637 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4638 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4639 
4640 /* vd[0] = or(vs1[0], vs2[*]) */
4641 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4642 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4643 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4644 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4645 
4646 /* vd[0] = xor(vs1[0], vs2[*]) */
4647 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4648 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4649 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4650 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4651 
4652 /* Vector Widening Integer Reduction Instructions */
4653 /* Signed sum reduction into double-width accumulator */
4654 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4655 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4656 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4657 
4658 /* Unsigned sum reduction into double-width accumulator */
4659 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4660 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4661 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4662 
4663 /* Vector Single-Width Floating-Point Reduction Instructions */
4664 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4665 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4666                   void *vs2, CPURISCVState *env,           \
4667                   uint32_t desc)                           \
4668 {                                                          \
4669     uint32_t vm = vext_vm(desc);                           \
4670     uint32_t vl = env->vl;                                 \
4671     uint32_t esz = sizeof(TD);                             \
4672     uint32_t vlenb = simd_maxsz(desc);                     \
4673     uint32_t vta = vext_vta(desc);                         \
4674     uint32_t i;                                            \
4675     TD s1 =  *((TD *)vs1 + HD(0));                         \
4676                                                            \
4677     for (i = env->vstart; i < vl; i++) {                   \
4678         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4679         if (!vm && !vext_elem_mask(v0, i)) {               \
4680             continue;                                      \
4681         }                                                  \
4682         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4683     }                                                      \
4684     *((TD *)vd + HD(0)) = s1;                              \
4685     env->vstart = 0;                                       \
4686     /* set tail elements to 1s */                          \
4687     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4688 }
4689 
4690 /* Unordered sum */
4691 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4692 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4693 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4694 
4695 /* Maximum value */
4696 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4697 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4698 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4699 
4700 /* Minimum value */
4701 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4702 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4703 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4704 
4705 /* Vector Widening Floating-Point Reduction Instructions */
4706 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
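     /*
      * Each SEW-wide element of vs2 is widened to 2*SEW (e.g. float16 to
      * float32) before being accumulated into the 2*SEW scalar taken from
      * vs1[0]; the result is written to vd[0].
      */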
4707 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4708                             void *vs2, CPURISCVState *env, uint32_t desc)
4709 {
4710     uint32_t vm = vext_vm(desc);
4711     uint32_t vl = env->vl;
4712     uint32_t esz = sizeof(uint32_t);
4713     uint32_t vlenb = simd_maxsz(desc);
4714     uint32_t vta = vext_vta(desc);
4715     uint32_t i;
4716     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4717 
4718     for (i = env->vstart; i < vl; i++) {
4719         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4720         if (!vm && !vext_elem_mask(v0, i)) {
4721             continue;
4722         }
4723         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4724                          &env->fp_status);
4725     }
4726     *((uint32_t *)vd + H4(0)) = s1;
4727     env->vstart = 0;
4728     /* set tail elements to 1s */
4729     vext_set_elems_1s(vd, vta, esz, vlenb);
4730 }
4731 
4732 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4733                             void *vs2, CPURISCVState *env, uint32_t desc)
4734 {
4735     uint32_t vm = vext_vm(desc);
4736     uint32_t vl = env->vl;
4737     uint32_t esz = sizeof(uint64_t);
4738     uint32_t vlenb = simd_maxsz(desc);
4739     uint32_t vta = vext_vta(desc);
4740     uint32_t i;
4741     uint64_t s1 =  *((uint64_t *)vs1);
4742 
4743     for (i = env->vstart; i < vl; i++) {
4744         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4745         if (!vm && !vext_elem_mask(v0, i)) {
4746             continue;
4747         }
4748         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4749                          &env->fp_status);
4750     }
4751     *((uint64_t *)vd) = s1;
4752     env->vstart = 0;
4753     /* set tail elements to 1s */
4754     vext_set_elems_1s(vd, vta, esz, vlenb);
4755 }
4756 
4757 /*
4758  *** Vector Mask Operations
4759  */
4760 /* Vector Mask-Register Logical Instructions */
4761 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4762 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4763                   void *vs2, CPURISCVState *env,          \
4764                   uint32_t desc)                          \
4765 {                                                         \
4766     uint32_t vl = env->vl;                                \
4767     uint32_t total_elems = env_archcpu(env)->cfg.vlen;    \
4768     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4769     uint32_t i;                                           \
4770     int a, b;                                             \
4771                                                           \
4772     for (i = env->vstart; i < vl; i++) {                  \
4773         a = vext_elem_mask(vs1, i);                       \
4774         b = vext_elem_mask(vs2, i);                       \
4775         vext_set_elem_mask(vd, i, OP(b, a));              \
4776     }                                                     \
4777     env->vstart = 0;                                      \
4778     /* mask destination registers are always tail-        \
4779      * agnostic                                           \
4780      */                                                   \
4781     /* set tail elements to 1s */                         \
4782     if (vta_all_1s) {                                     \
4783         for (; i < total_elems; i++) {                    \
4784             vext_set_elem_mask(vd, i, 1);                 \
4785         }                                                 \
4786     }                                                     \
4787 }
4788 
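     /*
      * The operands of the macros below are single mask bits (0 or 1)
      * produced by vext_elem_mask() in GEN_VEXT_MASK_VV, so logical
      * negation (!M) is equivalent to complementing the mask bit.
      */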
4789 #define DO_NAND(N, M)  (!(N & M))
4790 #define DO_ANDNOT(N, M)  (N & !M)
4791 #define DO_NOR(N, M)  (!(N | M))
4792 #define DO_ORNOT(N, M)  (N | !M)
4793 #define DO_XNOR(N, M)  (!(N ^ M))
4794 
4795 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4796 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4797 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4798 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4799 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4800 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4801 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4802 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4803 
4804 /* Vector count population in mask vcpop */
4805 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4806                              uint32_t desc)
4807 {
4808     target_ulong cnt = 0;
4809     uint32_t vm = vext_vm(desc);
4810     uint32_t vl = env->vl;
4811     int i;
4812 
4813     for (i = env->vstart; i < vl; i++) {
4814         if (vm || vext_elem_mask(v0, i)) {
4815             if (vext_elem_mask(vs2, i)) {
4816                 cnt++;
4817             }
4818         }
4819     }
4820     env->vstart = 0;
4821     return cnt;
4822 }
4823 
4824 /* vfirst find-first-set mask bit */
4825 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4826                               uint32_t desc)
4827 {
4828     uint32_t vm = vext_vm(desc);
4829     uint32_t vl = env->vl;
4830     int i;
4831 
4832     for (i = env->vstart; i < vl; i++) {
4833         if (vm || vext_elem_mask(v0, i)) {
4834             if (vext_elem_mask(vs2, i)) {
4835                 return i;
4836             }
4837         }
4838     }
4839     env->vstart = 0;
4840     return -1LL;
4841 }
4842 
4843 enum set_mask_type {
4844     ONLY_FIRST = 1,
4845     INCLUDE_FIRST,
4846     BEFORE_FIRST,
4847 };
4848 
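     /*
      * vmsetm() implements vmsbf.m, vmsif.m and vmsof.m.  For an unmasked
      * operation with vs2 = {0, 0, 0, 0, 1, 0, 1, 0} (first set bit at
      * index 4):
      *   BEFORE_FIRST  (vmsbf): vd = {1, 1, 1, 1, 0, 0, 0, 0}
      *   INCLUDE_FIRST (vmsif): vd = {1, 1, 1, 1, 1, 0, 0, 0}
      *   ONLY_FIRST    (vmsof): vd = {0, 0, 0, 0, 1, 0, 0, 0}
      */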
4849 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4850                    uint32_t desc, enum set_mask_type type)
4851 {
4852     uint32_t vm = vext_vm(desc);
4853     uint32_t vl = env->vl;
4854     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4855     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4856     int i;
4857     bool first_mask_bit = false;
4858 
4859     for (i = env->vstart; i < vl; i++) {
4860         if (!vm && !vext_elem_mask(v0, i)) {
4861             continue;
4862         }
4863         /* write a zero to all following active elements */
4864         if (first_mask_bit) {
4865             vext_set_elem_mask(vd, i, 0);
4866             continue;
4867         }
4868         if (vext_elem_mask(vs2, i)) {
4869             first_mask_bit = true;
4870             if (type == BEFORE_FIRST) {
4871                 vext_set_elem_mask(vd, i, 0);
4872             } else {
4873                 vext_set_elem_mask(vd, i, 1);
4874             }
4875         } else {
4876             if (type == ONLY_FIRST) {
4877                 vext_set_elem_mask(vd, i, 0);
4878             } else {
4879                 vext_set_elem_mask(vd, i, 1);
4880             }
4881         }
4882     }
4883     env->vstart = 0;
4884     /* mask destination registers are always tail-agnostic */
4885     /* set tail elements to 1s */
4886     if (vta_all_1s) {
4887         for (; i < total_elems; i++) {
4888             vext_set_elem_mask(vd, i, 1);
4889         }
4890     }
4891 }
4892 
4893 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4894                      uint32_t desc)
4895 {
4896     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4897 }
4898 
4899 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4900                      uint32_t desc)
4901 {
4902     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4903 }
4904 
4905 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4906                      uint32_t desc)
4907 {
4908     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4909 }
4910 
4911 /* Vector Iota Instruction */
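     /*
      * viota.m writes, for each active element, the number of set mask
      * bits of vs2 below it; e.g. an unmasked viota.m with
      * vs2 = {1, 0, 1, 0} and vl = 4 produces vd = {0, 1, 1, 2}.
      */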
4912 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4913 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4914                   uint32_t desc)                                          \
4915 {                                                                         \
4916     uint32_t vm = vext_vm(desc);                                          \
4917     uint32_t vl = env->vl;                                                \
4918     uint32_t esz = sizeof(ETYPE);                                         \
4919     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4920     uint32_t vta = vext_vta(desc);                                        \
4921     uint32_t sum = 0;                                                     \
4922     int i;                                                                \
4923                                                                           \
4924     for (i = env->vstart; i < vl; i++) {                                  \
4925         if (!vm && !vext_elem_mask(v0, i)) {                              \
4926             continue;                                                     \
4927         }                                                                 \
4928         *((ETYPE *)vd + H(i)) = sum;                                      \
4929         if (vext_elem_mask(vs2, i)) {                                     \
4930             sum++;                                                        \
4931         }                                                                 \
4932     }                                                                     \
4933     env->vstart = 0;                                                      \
4934     /* set tail elements to 1s */                                         \
4935     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4936 }
4937 
4938 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4939 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4940 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4941 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4942 
4943 /* Vector Element Index Instruction */
4944 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4945 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4946 {                                                                         \
4947     uint32_t vm = vext_vm(desc);                                          \
4948     uint32_t vl = env->vl;                                                \
4949     uint32_t esz = sizeof(ETYPE);                                         \
4950     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4951     uint32_t vta = vext_vta(desc);                                        \
4952     int i;                                                                \
4953                                                                           \
4954     for (i = env->vstart; i < vl; i++) {                                  \
4955         if (!vm && !vext_elem_mask(v0, i)) {                              \
4956             continue;                                                     \
4957         }                                                                 \
4958         *((ETYPE *)vd + H(i)) = i;                                        \
4959     }                                                                     \
4960     env->vstart = 0;                                                      \
4961     /* set tail elements to 1s */                                         \
4962     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4963 }
4964 
4965 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4966 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4967 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4968 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4969 
4970 /*
4971  *** Vector Permutation Instructions
4972  */
4973 
4974 /* Vector Slide Instructions */
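     /*
      * A plain-array sketch of the slide-up semantics implemented by the
      * helpers below, assuming an unmasked operation with vstart = 0; the
      * name slideup_ref_example is hypothetical.  The real macros also
      * handle masking, vstart and the tail policy.
      */
     static inline void slideup_ref_example(uint8_t *vd, const uint8_t *vs2,
                                            uint32_t vl, uint32_t offset)
     {
         uint32_t i;

         /* elements 0 .. offset-1 of vd keep their previous value */
         for (i = offset; i < vl; i++) {
             vd[i] = vs2[i - offset];
         }
     }
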
4975 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4976 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4977                   CPURISCVState *env, uint32_t desc)                      \
4978 {                                                                         \
4979     uint32_t vm = vext_vm(desc);                                          \
4980     uint32_t vl = env->vl;                                                \
4981     uint32_t esz = sizeof(ETYPE);                                         \
4982     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4983     uint32_t vta = vext_vta(desc);                                        \
4984     target_ulong offset = s1, i_min, i;                                   \
4985                                                                           \
4986     i_min = MAX(env->vstart, offset);                                     \
4987     for (i = i_min; i < vl; i++) {                                        \
4988         if (!vm && !vext_elem_mask(v0, i)) {                              \
4989             continue;                                                     \
4990         }                                                                 \
4991         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4992     }                                                                     \
         env->vstart = 0;                                                      \
4993     /* set tail elements to 1s */                                         \
4994     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4995 }
4996 
4997 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4998 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4999 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5000 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5001 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5002 
5003 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5004 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5005                   CPURISCVState *env, uint32_t desc)                      \
5006 {                                                                         \
5007     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5008     uint32_t vm = vext_vm(desc);                                          \
5009     uint32_t vl = env->vl;                                                \
5010     uint32_t esz = sizeof(ETYPE);                                         \
5011     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5012     uint32_t vta = vext_vta(desc);                                        \
5013     target_ulong i_max, i;                                                \
5014                                                                           \
5015     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5016     for (i = env->vstart; i < i_max; ++i) {                               \
5017         if (vm || vext_elem_mask(v0, i)) {                                \
5018             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
5019         }                                                                 \
5020     }                                                                     \
5021                                                                           \
5022     for (i = i_max; i < vl; ++i) {                                        \
5023         if (vm || vext_elem_mask(v0, i)) {                                \
5024             *((ETYPE *)vd + H(i)) = 0;                                    \
5025         }                                                                 \
5026     }                                                                     \
5027                                                                           \
5028     env->vstart = 0;                                                      \
5029     /* set tail elements to 1s */                                         \
5030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5031 }
5032 
5033 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5034 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5035 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5036 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5037 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5038 
5039 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5040 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5041                      void *vs2, CPURISCVState *env, uint32_t desc)          \
5042 {                                                                           \
5043     typedef uint##BITWIDTH##_t ETYPE;                                       \
5044     uint32_t vm = vext_vm(desc);                                            \
5045     uint32_t vl = env->vl;                                                  \
5046     uint32_t esz = sizeof(ETYPE);                                           \
5047     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5048     uint32_t vta = vext_vta(desc);                                          \
5049     uint32_t i;                                                             \
5050                                                                             \
5051     for (i = env->vstart; i < vl; i++) {                                    \
5052         if (!vm && !vext_elem_mask(v0, i)) {                                \
5053             continue;                                                       \
5054         }                                                                   \
5055         if (i == 0) {                                                       \
5056             *((ETYPE *)vd + H(i)) = s1;                                     \
5057         } else {                                                            \
5058             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5059         }                                                                   \
5060     }                                                                       \
5061     env->vstart = 0;                                                        \
5062     /* set tail elements to 1s */                                           \
5063     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5064 }
5065 
5066 GEN_VEXT_VSLIDE1UP(8,  H1)
5067 GEN_VEXT_VSLIDE1UP(16, H2)
5068 GEN_VEXT_VSLIDE1UP(32, H4)
5069 GEN_VEXT_VSLIDE1UP(64, H8)
5070 
5071 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5072 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5073                   CPURISCVState *env, uint32_t desc)              \
5074 {                                                                 \
5075     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5076 }
5077 
5078 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5079 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5080 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5081 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5082 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5083 
5084 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5085 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
5086                        void *vs2, CPURISCVState *env, uint32_t desc)          \
5087 {                                                                             \
5088     typedef uint##BITWIDTH##_t ETYPE;                                         \
5089     uint32_t vm = vext_vm(desc);                                              \
5090     uint32_t vl = env->vl;                                                    \
5091     uint32_t esz = sizeof(ETYPE);                                             \
5092     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5093     uint32_t vta = vext_vta(desc);                                            \
5094     uint32_t i;                                                               \
5095                                                                               \
5096     for (i = env->vstart; i < vl; i++) {                                      \
5097         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5098             continue;                                                         \
5099         }                                                                     \
5100         if (i == vl - 1) {                                                    \
5101             *((ETYPE *)vd + H(i)) = s1;                                       \
5102         } else {                                                              \
5103             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5104         }                                                                     \
5105     }                                                                         \
5106     env->vstart = 0;                                                          \
5107     /* set tail elements to 1s */                                             \
5108     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5109 }
5110 
5111 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5112 GEN_VEXT_VSLIDE1DOWN(16, H2)
5113 GEN_VEXT_VSLIDE1DOWN(32, H4)
5114 GEN_VEXT_VSLIDE1DOWN(64, H8)
5115 
5116 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5117 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5118                   CPURISCVState *env, uint32_t desc)              \
5119 {                                                                 \
5120     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5121 }
5122 
5123 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5124 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5125 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5126 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5127 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5128 
5129 /* Vector Floating-Point Slide Instructions */
5130 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5131 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5132                   CPURISCVState *env, uint32_t desc)          \
5133 {                                                             \
5134     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5135 }
5136 
5137 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5138 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5139 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5140 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5141 
5142 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5143 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5144                   CPURISCVState *env, uint32_t desc)          \
5145 {                                                             \
5146     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5147 }
5148 
5149 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5150 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5151 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5152 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5153 
5154 /* Vector Register Gather Instruction */
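     /*
      * A plain-array sketch of the gather semantics implemented by the
      * helpers below, assuming an unmasked operation; the name
      * rgather_ref_example is hypothetical.  Indices at or beyond VLMAX
      * read as zero, matching the out-of-range rule in the macros.
      */
     static inline void rgather_ref_example(uint32_t *vd, const uint32_t *index,
                                            const uint32_t *vs2, uint32_t vl,
                                            uint32_t vlmax)
     {
         uint32_t i;

         for (i = 0; i < vl; i++) {
             vd[i] = (index[i] >= vlmax) ? 0 : vs2[index[i]];
         }
     }
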
5155 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5156 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5157                   CPURISCVState *env, uint32_t desc)                      \
5158 {                                                                         \
5159     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5160     uint32_t vm = vext_vm(desc);                                          \
5161     uint32_t vl = env->vl;                                                \
5162     uint32_t esz = sizeof(TS2);                                           \
5163     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5164     uint32_t vta = vext_vta(desc);                                        \
5165     uint64_t index;                                                       \
5166     uint32_t i;                                                           \
5167                                                                           \
5168     for (i = env->vstart; i < vl; i++) {                                  \
5169         if (!vm && !vext_elem_mask(v0, i)) {                              \
5170             continue;                                                     \
5171         }                                                                 \
5172         index = *((TS1 *)vs1 + HS1(i));                                   \
5173         if (index >= vlmax) {                                             \
5174             *((TS2 *)vd + HS2(i)) = 0;                                    \
5175         } else {                                                          \
5176             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5177         }                                                                 \
5178     }                                                                     \
5179     env->vstart = 0;                                                      \
5180     /* set tail elements to 1s */                                         \
5181     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5182 }
5183 
5184 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5185 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5186 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5187 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5188 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5189 
5190 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5191 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5192 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5193 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
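/*
 * Worked example for the gather helpers above (illustrative, not part of
 * the original source).  Assume SEW = 8, vlmax = 4, vl = 4, unmasked:
 *
 *   vs2 = {10, 20, 30, 40}, vs1 = {2, 0, 9, 1}
 *   vd  = {30, 10,  0, 20}      -- vs1[2] = 9 >= vlmax, so vd[2] is zeroed
 *
 * vrgatherei16 differs only in that the index vector is always read as
 * 16-bit elements (TS1 = uint16_t) regardless of the data element width,
 * which is why its instantiations pair H2 indices with H1/H2/H4/H8 data.
 */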
5194 
5195 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5196 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5197                   CPURISCVState *env, uint32_t desc)                      \
5198 {                                                                         \
5199     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5200     uint32_t vm = vext_vm(desc);                                          \
5201     uint32_t vl = env->vl;                                                \
5202     uint32_t esz = sizeof(ETYPE);                                         \
5203     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5204     uint32_t vta = vext_vta(desc);                                        \
5205     uint64_t index = s1;                                                  \
5206     uint32_t i;                                                           \
5207                                                                           \
5208     for (i = env->vstart; i < vl; i++) {                                  \
5209         if (!vm && !vext_elem_mask(v0, i)) {                              \
5210             continue;                                                     \
5211         }                                                                 \
5212         if (index >= vlmax) {                                             \
5213             *((ETYPE *)vd + H(i)) = 0;                                    \
5214         } else {                                                          \
5215             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5216         }                                                                 \
5217     }                                                                     \
5218     env->vstart = 0;                                                      \
5219     /* set tail elements to 1s */                                         \
5220     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5221 }
5222 
5223 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5224 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5225 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5226 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5227 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
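/*
 * Illustrative sketch: since the index is a single scalar, the vx form
 * either splats one source element or zeroes the whole destination.  With
 * vl = 4, vlmax = 4, unmasked:
 *
 *   x[rs1] = 2, vs2 = {10, 20, 30, 40}  ->  vd = {30, 30, 30, 30}
 *   x[rs1] = 7 (>= vlmax)               ->  vd = { 0,  0,  0,  0}
 */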
5228 
5229 /* Vector Compress Instruction */
5230 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5231 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5232                   CPURISCVState *env, uint32_t desc)                      \
5233 {                                                                         \
5234     uint32_t vl = env->vl;                                                \
5235     uint32_t esz = sizeof(ETYPE);                                         \
5236     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5237     uint32_t vta = vext_vta(desc);                                        \
5238     uint32_t num = 0, i;                                                  \
5239                                                                           \
5240     for (i = env->vstart; i < vl; i++) {                                  \
5241         if (!vext_elem_mask(vs1, i)) {                                    \
5242             continue;                                                     \
5243         }                                                                 \
5244         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5245         num++;                                                            \
5246     }                                                                     \
5247     env->vstart = 0;                                                      \
5248     /* set tail elements to 1s */                                         \
5249     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5250 }
5251 
5252 /* Compress into vd the elements of vs2 whose mask bit in vs1 is set */
5253 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5254 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5255 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5256 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
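/*
 * Worked example (illustrative, not part of the original source): with
 * vl = 6, mask vs1 = 0b101001 and vs2 = {a, b, c, d, e, f}, the loop packs
 * the active elements into vd[0..2] = {a, d, f} and leaves vd[3..5]
 * unchanged; only elements at index vl and above are handled as tail by
 * vext_set_elems_1s().
 */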
5257 
5258 /* Vector Whole Register Move */
5259 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5260 {
5261     /* EEW = SEW */
5262     uint32_t maxsz = simd_maxsz(desc);
5263     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5264     uint32_t startb = env->vstart * sewb;
5265     uint32_t i = startb;
5266 
5267     memcpy((uint8_t *)vd + H1(i),
5268            (uint8_t *)vs2 + H1(i),
5269            maxsz - startb);
5270 
5271     env->vstart = 0;
5272 }
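/*
 * Illustrative arithmetic for the helper above, assuming SEW = 32
 * (vtype.vsew = 2) so sewb = 4 bytes per element: if a trap left
 * vstart = 3, the copy resumes at byte offset startb = 3 * 4 = 12 (modulo
 * the host-endian H1 fixup) and moves the remaining maxsz - 12 bytes of
 * the register group in a single memcpy before clearing vstart.
 */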
5273 
5274 /* Vector Integer Extension */
5275 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5276 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5277                   CPURISCVState *env, uint32_t desc)             \
5278 {                                                                \
5279     uint32_t vl = env->vl;                                       \
5280     uint32_t vm = vext_vm(desc);                                 \
5281     uint32_t esz = sizeof(ETYPE);                                \
5282     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5283     uint32_t vta = vext_vta(desc);                               \
5284     uint32_t i;                                                  \
5285                                                                  \
5286     for (i = env->vstart; i < vl; i++) {                         \
5287         if (!vm && !vext_elem_mask(v0, i)) {                     \
5288             continue;                                            \
5289         }                                                        \
5290         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5291     }                                                            \
5292     env->vstart = 0;                                             \
5293     /* set tail elements to 1s */                                \
5294     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5295 }
5296 
5297 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5298 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5299 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5300 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5301 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5302 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5303 
5304 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5305 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5306 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5307 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5308 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5309 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
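/*
 * Illustrative expansion, not part of the original source:
 * GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2) produces a helper
 * whose per-element body is
 *
 *     *((int32_t *)vd + H4(i)) = *((int16_t *)vs2 + H2(i));
 *
 * so the C integer conversion performs the sign-extension, while the
 * vzext_* variants use unsigned types and therefore zero-extend.  For a
 * 16-bit source element 0xffff, vsext.vf2 writes 0xffffffff and vzext.vf2
 * writes 0x0000ffff.
 */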
5310