xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 89a32de2)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen)
54         || vill
55         || (ediv != 0)
56         || (reserved != 0)) {
57         /* only set vill bit. */
58         env->vill = 1;
59         env->vtype = 0;
60         env->vl = 0;
61         env->vstart = 0;
62         return 0;
63     }
64 
65     vlmax = vext_get_vlmax(cpu, s2);
66     if (s1 <= vlmax) {
67         vl = s1;
68     } else {
69         vl = vlmax;
70     }
71     env->vl = vl;
72     env->vtype = s2;
73     env->vstart = 0;
74     env->vill = 0;
75     return vl;
76 }
77 
78 /*
79  * Note that vector data is stored in host-endian 64-bit chunks,
80  * so addressing units smaller than that needs a host-endian fixup.
81  */
82 #if HOST_BIG_ENDIAN
83 #define H1(x)   ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x)   ((x) ^ 3)
87 #define H4(x)   ((x) ^ 1)
88 #define H8(x)   ((x))
89 #else
90 #define H1(x)   (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x)   (x)
94 #define H4(x)   (x)
95 #define H8(x)   (x)
96 #endif
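
/*
 * Illustrative sketch, not part of the original file: reading one 16-bit
 * element through the H2() fixup above.  On a little-endian host H2(i) == i;
 * on a big-endian host H2(i) == i ^ 3, which selects the matching halfword
 * inside each host-endian 64-bit chunk.
 */
static inline uint16_t example_read_elem_h(const void *vreg, int idx)
{
    return ((const uint16_t *)vreg)[H2(idx)];
}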
97 
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100     return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102 
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105     return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107 
108 /*
109  * Encode LMUL to lmul as follows:
110  *     LMUL    vlmul    lmul
111  *      1       000       0
112  *      2       001       1
113  *      4       010       2
114  *      8       011       3
115  *      -       100       -
116  *     1/8      101      -3
117  *     1/4      110      -2
118  *     1/2      111      -1
119  */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124 
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129 
130 static inline uint32_t vext_vta_all_1s(uint32_t desc)
131 {
132     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
133 }
134 
135 /*
136  * Get the maximum number of elements that can be operated on.
137  *
138  * log2_esz: log2 of element size in bytes.
139  */
140 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
141 {
142     /*
143      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
144      * so vlen in bytes (vlenb) is encoded as maxsz.
145      */
146     uint32_t vlenb = simd_maxsz(desc);
147 
148     /* Return VLMAX */
149     int scale = vext_lmul(desc) - log2_esz;
150     return scale < 0 ? vlenb >> -scale : vlenb << scale;
151 }
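
/*
 * Worked example, illustration only (example_vlmax is not in the original
 * file): with VLEN = 128 bits (vlenb = 16), SEW = 32 (log2_esz = 2) and
 * LMUL = 2 (vext_lmul() == 1), scale = 1 - 2 = -1 and VLMAX = 16 >> 1 = 8,
 * matching VLMAX = LMUL * VLEN / SEW = 2 * 128 / 32.
 */
static inline uint32_t example_vlmax(uint32_t vlenb, int lmul, int log2_esz)
{
    int scale = lmul - log2_esz;
    return scale < 0 ? vlenb >> -scale : vlenb << scale;
}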
152 
153 /*
154  * Get the total number of elements, including prestart, body and tail elements.
155  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
156  * are held in the same vector register.
157  */
158 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
159                                             uint32_t esz)
160 {
161     uint32_t vlenb = simd_maxsz(desc);
162     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
163     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
164                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
165     return (vlenb << emul) / esz;
166 }
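
/*
 * Worked example (illustration only): with vlenb = 16, SEW = 32 (sew = 4),
 * LMUL = 2 (vext_lmul() == 1) and esz = 4, emul = ctzl(4) - ctzl(4) + 1 = 1,
 * so the total is (16 << 1) / 4 = 8 elements, i.e. the two vector registers
 * occupied by an LMUL = 2 operand.
 */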
167 
168 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
169 {
170     return (addr & env->cur_pmmask) | env->cur_pmbase;
171 }
172 
173 /*
174  * This function checks watchpoints before the real load operation.
175  *
176  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
177  * In user mode, there is no watchpoint support now.
178  *
179  * It will trigger an exception if there is no mapping in the TLB
180  * and the page table walk can't fill the TLB entry. Then the guest
181  * software can return here after processing the exception, or never return.
182  */
183 static void probe_pages(CPURISCVState *env, target_ulong addr,
184                         target_ulong len, uintptr_t ra,
185                         MMUAccessType access_type)
186 {
187     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
188     target_ulong curlen = MIN(pagelen, len);
189 
190     probe_access(env, adjust_addr(env, addr), curlen, access_type,
191                  cpu_mmu_index(env, false), ra);
192     if (len > curlen) {
193         addr += curlen;
194         curlen = len - curlen;
195         probe_access(env, adjust_addr(env, addr), curlen, access_type,
196                      cpu_mmu_index(env, false), ra);
197     }
198 }
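
/*
 * Worked example (illustration only): with 4 KiB pages, a probe of
 * len = 0x20 bytes starting at an address whose low bits are 0xff0 gives
 * pagelen = -(addr | TARGET_PAGE_MASK) = 0x10, so the first probe_access()
 * covers the 0x10 bytes left on the current page and the second covers the
 * remaining 0x10 bytes at the start of the next page.
 */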
199 
200 /* set agnostic elements to 1s */
201 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
202                               uint32_t tot)
203 {
204     if (is_agnostic == 0) {
205         /* policy undisturbed */
206         return;
207     }
208     if (tot - cnt == 0) {
209         return;
210     }
211     memset(base + cnt, -1, tot - cnt);
212 }
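
/*
 * Usage sketch (illustration only): after a helper has produced vl body
 * elements of esz bytes each, the tail of a total_elems-element destination
 * is handled with
 *
 *     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
 *
 * which leaves the tail undisturbed when vta == 0 and fills it with all-1s
 * bytes when the tail is agnostic.  This is the pattern used by the helpers
 * below.
 */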
213 
214 static inline void vext_set_elem_mask(void *v0, int index,
215                                       uint8_t value)
216 {
217     int idx = index / 64;
218     int pos = index % 64;
219     uint64_t old = ((uint64_t *)v0)[idx];
220     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
221 }
222 
223 /*
224  * Earlier designs (pre-0.9) had a varying number of bits
225  * per mask value (MLEN). In the 0.9 design, MLEN=1.
226  * (Section 4.5)
227  */
228 static inline int vext_elem_mask(void *v0, int index)
229 {
230     int idx = index / 64;
231     int pos = index % 64;
232     return (((uint64_t *)v0)[idx] >> pos) & 1;
233 }
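
/*
 * Worked example (illustration only): with MLEN = 1 the mask bit for
 * element 70 lives in bit 70 % 64 = 6 of word 70 / 64 = 1, so
 * vext_elem_mask(v0, 70) == (((uint64_t *)v0)[1] >> 6) & 1.
 */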
234 
235 /* elements operations for load and store */
236 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
237                                uint32_t idx, void *vd, uintptr_t retaddr);
238 
239 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
240 static void NAME(CPURISCVState *env, abi_ptr addr,         \
241                  uint32_t idx, void *vd, uintptr_t retaddr)\
242 {                                                          \
243     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
244     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
245 }
246 
247 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
248 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
249 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
250 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
251 
252 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
253 static void NAME(CPURISCVState *env, abi_ptr addr,         \
254                  uint32_t idx, void *vd, uintptr_t retaddr)\
255 {                                                          \
256     ETYPE data = *((ETYPE *)vd + H(idx));                  \
257     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
258 }
259 
260 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
261 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
262 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
263 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
264 
265 /*
266  *** stride: access vector elements from strided memory
267  */
268 static void
269 vext_ldst_stride(void *vd, void *v0, target_ulong base,
270                  target_ulong stride, CPURISCVState *env,
271                  uint32_t desc, uint32_t vm,
272                  vext_ldst_elem_fn *ldst_elem,
273                  uint32_t log2_esz, uintptr_t ra)
274 {
275     uint32_t i, k;
276     uint32_t nf = vext_nf(desc);
277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
278     uint32_t esz = 1 << log2_esz;
279     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
280     uint32_t vta = vext_vta(desc);
281 
282     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
283         if (!vm && !vext_elem_mask(v0, i)) {
284             continue;
285         }
286 
287         k = 0;
288         while (k < nf) {
289             target_ulong addr = base + stride * i + (k << log2_esz);
290             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
291             k++;
292         }
293     }
294     env->vstart = 0;
295     /* set tail elements to 1s */
296     for (k = 0; k < nf; ++k) {
297         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
298                           (k * max_elems + max_elems) * esz);
299     }
300     if (nf * max_elems % total_elems != 0) {
301         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
302         uint32_t registers_used =
303             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
304         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
305                           registers_used * vlenb);
306     }
307 }
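
/*
 * Worked example (illustration only): for a two-field segment load (nf = 2)
 * of 32-bit elements (log2_esz = 2), field k of element i is read from
 * base + stride * i + (k << 2) and written to vd at element index
 * i + k * max_elems, i.e. each field lands in its own register group of the
 * destination.
 */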
308 
309 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
310 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
311                   target_ulong stride, CPURISCVState *env,              \
312                   uint32_t desc)                                        \
313 {                                                                       \
314     uint32_t vm = vext_vm(desc);                                        \
315     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
316                      ctzl(sizeof(ETYPE)), GETPC());                     \
317 }
318 
319 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
320 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
321 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
322 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
323 
324 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
326                   target_ulong stride, CPURISCVState *env,              \
327                   uint32_t desc)                                        \
328 {                                                                       \
329     uint32_t vm = vext_vm(desc);                                        \
330     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
331                      ctzl(sizeof(ETYPE)), GETPC());                     \
332 }
333 
334 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
335 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
336 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
337 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
338 
339 /*
340  *** unit-stride: access elements stored contiguously in memory
341  */
342 
343 /* unmasked unit-stride load and store operation */
344 static void
345 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
346              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
347              uintptr_t ra)
348 {
349     uint32_t i, k;
350     uint32_t nf = vext_nf(desc);
351     uint32_t max_elems = vext_max_elems(desc, log2_esz);
352     uint32_t esz = 1 << log2_esz;
353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
354     uint32_t vta = vext_vta(desc);
355 
356     /* load/store bytes from/to guest memory */
357     for (i = env->vstart; i < evl; i++, env->vstart++) {
358         k = 0;
359         while (k < nf) {
360             target_ulong addr = base + ((i * nf + k) << log2_esz);
361             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
362             k++;
363         }
364     }
365     env->vstart = 0;
366     /* set tail elements to 1s */
367     for (k = 0; k < nf; ++k) {
368         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
369                           (k * max_elems + max_elems) * esz);
370     }
371     if (nf * max_elems % total_elems != 0) {
372         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
373         uint32_t registers_used =
374             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
375         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
376                           registers_used * vlenb);
377     }
378 }
379 
380 /*
381  * A masked unit-stride load or store operation is a special case of the
382  * strided operation, with stride = NF * sizeof(ETYPE).
383  */
384 
385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
387                          CPURISCVState *env, uint32_t desc)             \
388 {                                                                       \
389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
391                      ctzl(sizeof(ETYPE)), GETPC());                     \
392 }                                                                       \
393                                                                         \
394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
395                   CPURISCVState *env, uint32_t desc)                    \
396 {                                                                       \
397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
399 }
400 
401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
405 
406 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
407 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
408                          CPURISCVState *env, uint32_t desc)              \
409 {                                                                        \
410     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
411     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
412                      ctzl(sizeof(ETYPE)), GETPC());                      \
413 }                                                                        \
414                                                                          \
415 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
416                   CPURISCVState *env, uint32_t desc)                     \
417 {                                                                        \
418     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
419                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
420 }
421 
422 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
423 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
424 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
425 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
426 
427 /*
428  *** unit stride mask load and store, EEW = 1
429  */
430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
431                     CPURISCVState *env, uint32_t desc)
432 {
433     /* evl = ceil(vl/8) */
434     uint8_t evl = (env->vl + 7) >> 3;
435     vext_ldst_us(vd, base, env, desc, lde_b,
436                  0, evl, GETPC());
437 }
438 
439 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, ste_b,
445                  0, evl, GETPC());
446 }
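
/*
 * Worked example (illustration only): vlm.v/vsm.v transfer whole mask bytes,
 * so for vl = 17 the effective length is evl = (17 + 7) >> 3 = 3 bytes,
 * enough to cover the 17 mask bits.
 */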
447 
448 /*
449  *** index: access vector elements from indexed memory
450  */
451 typedef target_ulong vext_get_index_addr(target_ulong base,
452         uint32_t idx, void *vs2);
453 
454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
455 static target_ulong NAME(target_ulong base,            \
456                          uint32_t idx, void *vs2)      \
457 {                                                      \
458     return (base + *((ETYPE *)vs2 + H(idx)));          \
459 }
460 
461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
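
/*
 * Note (illustration only): the index ETYPEs above are unsigned, so e.g.
 * idx_h(base, i, vs2) returns base + (target_ulong)*((uint16_t *)vs2 + H2(i)),
 * i.e. the 16-bit index is zero-extended before being added to the base
 * address.
 */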
465 
466 static inline void
467 vext_ldst_index(void *vd, void *v0, target_ulong base,
468                 void *vs2, CPURISCVState *env, uint32_t desc,
469                 vext_get_index_addr get_index_addr,
470                 vext_ldst_elem_fn *ldst_elem,
471                 uint32_t log2_esz, uintptr_t ra)
472 {
473     uint32_t i, k;
474     uint32_t nf = vext_nf(desc);
475     uint32_t vm = vext_vm(desc);
476     uint32_t max_elems = vext_max_elems(desc, log2_esz);
477     uint32_t esz = 1 << log2_esz;
478     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
479     uint32_t vta = vext_vta(desc);
480 
481     /* load/store bytes from/to guest memory */
482     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
483         if (!vm && !vext_elem_mask(v0, i)) {
484             continue;
485         }
486 
487         k = 0;
488         while (k < nf) {
489             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
490             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
491             k++;
492         }
493     }
494     env->vstart = 0;
495     /* set tail elements to 1s */
496     for (k = 0; k < nf; ++k) {
497         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
498                           (k * max_elems + max_elems) * esz);
499     }
500     if (nf * max_elems % total_elems != 0) {
501         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
502         uint32_t registers_used =
503             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
504         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
505                           registers_used * vlenb);
506     }
507 }
508 
509 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
510 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
511                   void *vs2, CPURISCVState *env, uint32_t desc)            \
512 {                                                                          \
513     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
514                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
515 }
516 
517 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
518 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
519 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
520 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
521 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
522 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
523 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
524 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
525 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
526 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
527 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
528 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
529 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
530 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
531 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
532 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
533 
534 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
535 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
536                   void *vs2, CPURISCVState *env, uint32_t desc)  \
537 {                                                                \
538     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
539                     STORE_FN, ctzl(sizeof(ETYPE)),               \
540                     GETPC());                                    \
541 }
542 
543 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
544 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
545 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
546 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
547 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
548 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
549 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
550 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
551 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
552 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
553 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
554 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
555 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
556 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
557 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
558 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
559 
560 /*
561  *** unit-stride fault-only-first load instructions
562  */
563 static inline void
564 vext_ldff(void *vd, void *v0, target_ulong base,
565           CPURISCVState *env, uint32_t desc,
566           vext_ldst_elem_fn *ldst_elem,
567           uint32_t log2_esz, uintptr_t ra)
568 {
569     void *host;
570     uint32_t i, k, vl = 0;
571     uint32_t nf = vext_nf(desc);
572     uint32_t vm = vext_vm(desc);
573     uint32_t max_elems = vext_max_elems(desc, log2_esz);
574     uint32_t esz = 1 << log2_esz;
575     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
576     uint32_t vta = vext_vta(desc);
577     target_ulong addr, offset, remain;
578 
579     /* probe every access */
580     for (i = env->vstart; i < env->vl; i++) {
581         if (!vm && !vext_elem_mask(v0, i)) {
582             continue;
583         }
584         addr = adjust_addr(env, base + i * (nf << log2_esz));
585         if (i == 0) {
586             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
587         } else {
588             /* if it triggers an exception, no need to check watchpoint */
589             remain = nf << log2_esz;
590             while (remain > 0) {
591                 offset = -(addr | TARGET_PAGE_MASK);
592                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
593                                          cpu_mmu_index(env, false));
594                 if (host) {
595 #ifdef CONFIG_USER_ONLY
596                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
597                         vl = i;
598                         goto ProbeSuccess;
599                     }
600 #else
601                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
602 #endif
603                 } else {
604                     vl = i;
605                     goto ProbeSuccess;
606                 }
607                 if (remain <= offset) {
608                     break;
609                 }
610                 remain -= offset;
611                 addr = adjust_addr(env, addr + offset);
612             }
613         }
614     }
615 ProbeSuccess:
616     /* load bytes from guest memory */
617     if (vl != 0) {
618         env->vl = vl;
619     }
620     for (i = env->vstart; i < env->vl; i++) {
621         k = 0;
622         if (!vm && !vext_elem_mask(v0, i)) {
623             continue;
624         }
625         while (k < nf) {
626             target_ulong addr = base + ((i * nf + k) << log2_esz);
627             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
628             k++;
629         }
630     }
631     env->vstart = 0;
632     /* set tail elements to 1s */
633     for (k = 0; k < nf; ++k) {
634         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
635                           (k * max_elems + max_elems) * esz);
636     }
637     if (nf * max_elems % total_elems != 0) {
638         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
639         uint32_t registers_used =
640             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
641         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
642                           registers_used * vlenb);
643     }
644 }
645 
646 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
647 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
648                   CPURISCVState *env, uint32_t desc)      \
649 {                                                         \
650     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
651               ctzl(sizeof(ETYPE)), GETPC());              \
652 }
653 
654 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
655 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
656 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
657 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
658 
659 #define DO_SWAP(N, M) (M)
660 #define DO_AND(N, M)  (N & M)
661 #define DO_XOR(N, M)  (N ^ M)
662 #define DO_OR(N, M)   (N | M)
663 #define DO_ADD(N, M)  (N + M)
664 
665 /* Signed min/max */
666 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
667 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
668 
669 /* Unsigned min/max */
670 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
671 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
672 
673 /*
674  *** load and store whole register instructions
675  */
676 static void
677 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
678                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
679 {
680     uint32_t i, k, off, pos;
681     uint32_t nf = vext_nf(desc);
682     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
683     uint32_t max_elems = vlenb >> log2_esz;
684 
685     k = env->vstart / max_elems;
686     off = env->vstart % max_elems;
687 
688     if (off) {
689         /* load/store the rest of the current segment pointed to by vstart */
690         for (pos = off; pos < max_elems; pos++, env->vstart++) {
691             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
692             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
693         }
694         k++;
695     }
696 
697     /* load/store elements for the rest of the segments */
698     for (; k < nf; k++) {
699         for (i = 0; i < max_elems; i++, env->vstart++) {
700             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
701             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
702         }
703     }
704 
705     env->vstart = 0;
706 }
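
/*
 * Worked example (illustration only): for vl8re32.v with vlenb = 16,
 * max_elems = 16 >> 2 = 4.  If the access was interrupted at vstart = 6, it
 * resumes in segment k = 6 / 4 = 1 at offset off = 6 % 4 = 2, finishes that
 * register, and then continues with segments 2..7.
 */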
707 
708 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
709 void HELPER(NAME)(void *vd, target_ulong base,       \
710                   CPURISCVState *env, uint32_t desc) \
711 {                                                    \
712     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
713                     ctzl(sizeof(ETYPE)), GETPC());   \
714 }
715 
716 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
717 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
721 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
724 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
725 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
726 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
727 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
728 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
729 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
730 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
731 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
732 
733 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
734 void HELPER(NAME)(void *vd, target_ulong base,       \
735                   CPURISCVState *env, uint32_t desc) \
736 {                                                    \
737     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
738                     ctzl(sizeof(ETYPE)), GETPC());   \
739 }
740 
741 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
742 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
743 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
744 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
745 
746 /*
747  *** Vector Integer Arithmetic Instructions
748  */
749 
750 /* expand macro args before macro */
751 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
752 
753 /* (TD, T1, T2, TX1, TX2) */
754 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
755 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
756 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
757 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
758 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
759 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
760 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
761 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
762 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
763 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
764 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
765 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
766 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
767 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
768 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
769 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
770 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
771 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
772 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
773 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
774 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
775 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
776 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
777 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
778 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
779 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
780 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
781 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
782 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
783 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
784 
785 /* operation of two vector elements */
786 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
787 
788 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
789 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
790 {                                                               \
791     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
792     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
793     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
794 }
795 #define DO_SUB(N, M) (N - M)
796 #define DO_RSUB(N, M) (M - N)
797 
798 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
799 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
800 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
801 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
802 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
803 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
804 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
805 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
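
/*
 * Expansion sketch (illustration only): RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B,
 * H1, H1, H1, DO_ADD) above generates roughly
 *
 *     static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *     {
 *         int8_t s1 = *((int8_t *)vs1 + H1(i));
 *         int8_t s2 = *((int8_t *)vs2 + H1(i));
 *         *((int8_t *)vd + H1(i)) = s2 + s1;
 *     }
 *
 * with all of TD, T1, T2, TX1 and TX2 expanding to int8_t.
 */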
806 
807 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
808                        CPURISCVState *env, uint32_t desc,
809                        opivv2_fn *fn, uint32_t esz)
810 {
811     uint32_t vm = vext_vm(desc);
812     uint32_t vl = env->vl;
813     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
814     uint32_t vta = vext_vta(desc);
815     uint32_t i;
816 
817     for (i = env->vstart; i < vl; i++) {
818         if (!vm && !vext_elem_mask(v0, i)) {
819             continue;
820         }
821         fn(vd, vs1, vs2, i);
822     }
823     env->vstart = 0;
824     /* set tail elements to 1s */
825     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
826 }
827 
828 /* generate the helpers for OPIVV */
829 #define GEN_VEXT_VV(NAME, ESZ)                            \
830 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
831                   void *vs2, CPURISCVState *env,          \
832                   uint32_t desc)                          \
833 {                                                         \
834     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
835                do_##NAME, ESZ);                           \
836 }
837 
838 GEN_VEXT_VV(vadd_vv_b, 1)
839 GEN_VEXT_VV(vadd_vv_h, 2)
840 GEN_VEXT_VV(vadd_vv_w, 4)
841 GEN_VEXT_VV(vadd_vv_d, 8)
842 GEN_VEXT_VV(vsub_vv_b, 1)
843 GEN_VEXT_VV(vsub_vv_h, 2)
844 GEN_VEXT_VV(vsub_vv_w, 4)
845 GEN_VEXT_VV(vsub_vv_d, 8)
846 
847 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
848 
849 /*
850  * (T1)s1 gives the real operand type.
851  * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
852  */
853 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
854 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
855 {                                                                   \
856     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
858 }
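
/*
 * Worked example (illustration only): vwadd_vx_b further below is
 * instantiated with WOP_SSS_B, so T1 = int8_t and TX1 = int16_t.
 * (TX1)(T1)s1 therefore first truncates the scalar to a signed byte and then
 * sign-extends it to the 16-bit operand width of the widening add.
 */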
859 
860 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
861 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
862 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
863 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
864 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
865 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
866 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
867 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
868 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
869 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
870 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
871 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
872 
873 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
874                        CPURISCVState *env, uint32_t desc,
875                        opivx2_fn fn, uint32_t esz)
876 {
877     uint32_t vm = vext_vm(desc);
878     uint32_t vl = env->vl;
879     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
880     uint32_t vta = vext_vta(desc);
881     uint32_t i;
882 
883     for (i = env->vstart; i < vl; i++) {
884         if (!vm && !vext_elem_mask(v0, i)) {
885             continue;
886         }
887         fn(vd, s1, vs2, i);
888     }
889     env->vstart = 0;
890     /* set tail elements to 1s */
891     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
892 }
893 
894 /* generate the helpers for OPIVX */
895 #define GEN_VEXT_VX(NAME, ESZ)                            \
896 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
897                   void *vs2, CPURISCVState *env,          \
898                   uint32_t desc)                          \
899 {                                                         \
900     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
901                do_##NAME, ESZ);                           \
902 }
903 
904 GEN_VEXT_VX(vadd_vx_b, 1)
905 GEN_VEXT_VX(vadd_vx_h, 2)
906 GEN_VEXT_VX(vadd_vx_w, 4)
907 GEN_VEXT_VX(vadd_vx_d, 8)
908 GEN_VEXT_VX(vsub_vx_b, 1)
909 GEN_VEXT_VX(vsub_vx_h, 2)
910 GEN_VEXT_VX(vsub_vx_w, 4)
911 GEN_VEXT_VX(vsub_vx_d, 8)
912 GEN_VEXT_VX(vrsub_vx_b, 1)
913 GEN_VEXT_VX(vrsub_vx_h, 2)
914 GEN_VEXT_VX(vrsub_vx_w, 4)
915 GEN_VEXT_VX(vrsub_vx_d, 8)
916 
917 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
918 {
919     intptr_t oprsz = simd_oprsz(desc);
920     intptr_t i;
921 
922     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
923         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
924     }
925 }
926 
927 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
928 {
929     intptr_t oprsz = simd_oprsz(desc);
930     intptr_t i;
931 
932     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
933         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
934     }
935 }
936 
937 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
938 {
939     intptr_t oprsz = simd_oprsz(desc);
940     intptr_t i;
941 
942     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
943         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
944     }
945 }
946 
947 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
948 {
949     intptr_t oprsz = simd_oprsz(desc);
950     intptr_t i;
951 
952     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
953         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
954     }
955 }
956 
957 /* Vector Widening Integer Add/Subtract */
958 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
959 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
960 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
961 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
962 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
963 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
964 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
965 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
966 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
967 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
968 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
969 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
970 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
971 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
972 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
973 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
974 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
975 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
976 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
977 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
978 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
979 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
980 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
981 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
982 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
983 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
985 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
986 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
988 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
989 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
991 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
992 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
994 GEN_VEXT_VV(vwaddu_vv_b, 2)
995 GEN_VEXT_VV(vwaddu_vv_h, 4)
996 GEN_VEXT_VV(vwaddu_vv_w, 8)
997 GEN_VEXT_VV(vwsubu_vv_b, 2)
998 GEN_VEXT_VV(vwsubu_vv_h, 4)
999 GEN_VEXT_VV(vwsubu_vv_w, 8)
1000 GEN_VEXT_VV(vwadd_vv_b, 2)
1001 GEN_VEXT_VV(vwadd_vv_h, 4)
1002 GEN_VEXT_VV(vwadd_vv_w, 8)
1003 GEN_VEXT_VV(vwsub_vv_b, 2)
1004 GEN_VEXT_VV(vwsub_vv_h, 4)
1005 GEN_VEXT_VV(vwsub_vv_w, 8)
1006 GEN_VEXT_VV(vwaddu_wv_b, 2)
1007 GEN_VEXT_VV(vwaddu_wv_h, 4)
1008 GEN_VEXT_VV(vwaddu_wv_w, 8)
1009 GEN_VEXT_VV(vwsubu_wv_b, 2)
1010 GEN_VEXT_VV(vwsubu_wv_h, 4)
1011 GEN_VEXT_VV(vwsubu_wv_w, 8)
1012 GEN_VEXT_VV(vwadd_wv_b, 2)
1013 GEN_VEXT_VV(vwadd_wv_h, 4)
1014 GEN_VEXT_VV(vwadd_wv_w, 8)
1015 GEN_VEXT_VV(vwsub_wv_b, 2)
1016 GEN_VEXT_VV(vwsub_wv_h, 4)
1017 GEN_VEXT_VV(vwsub_wv_w, 8)
1018 
1019 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1020 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1021 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1022 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1023 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1024 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1025 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1026 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1027 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1028 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1029 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1030 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1043 GEN_VEXT_VX(vwaddu_vx_b, 2)
1044 GEN_VEXT_VX(vwaddu_vx_h, 4)
1045 GEN_VEXT_VX(vwaddu_vx_w, 8)
1046 GEN_VEXT_VX(vwsubu_vx_b, 2)
1047 GEN_VEXT_VX(vwsubu_vx_h, 4)
1048 GEN_VEXT_VX(vwsubu_vx_w, 8)
1049 GEN_VEXT_VX(vwadd_vx_b, 2)
1050 GEN_VEXT_VX(vwadd_vx_h, 4)
1051 GEN_VEXT_VX(vwadd_vx_w, 8)
1052 GEN_VEXT_VX(vwsub_vx_b, 2)
1053 GEN_VEXT_VX(vwsub_vx_h, 4)
1054 GEN_VEXT_VX(vwsub_vx_w, 8)
1055 GEN_VEXT_VX(vwaddu_wx_b, 2)
1056 GEN_VEXT_VX(vwaddu_wx_h, 4)
1057 GEN_VEXT_VX(vwaddu_wx_w, 8)
1058 GEN_VEXT_VX(vwsubu_wx_b, 2)
1059 GEN_VEXT_VX(vwsubu_wx_h, 4)
1060 GEN_VEXT_VX(vwsubu_wx_w, 8)
1061 GEN_VEXT_VX(vwadd_wx_b, 2)
1062 GEN_VEXT_VX(vwadd_wx_h, 4)
1063 GEN_VEXT_VX(vwadd_wx_w, 8)
1064 GEN_VEXT_VX(vwsub_wx_b, 2)
1065 GEN_VEXT_VX(vwsub_wx_h, 4)
1066 GEN_VEXT_VX(vwsub_wx_w, 8)
1067 
1068 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1069 #define DO_VADC(N, M, C) (N + M + C)
1070 #define DO_VSBC(N, M, C) (N - M - C)
1071 
1072 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1073 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1074                   CPURISCVState *env, uint32_t desc)          \
1075 {                                                             \
1076     uint32_t vl = env->vl;                                    \
1077     uint32_t esz = sizeof(ETYPE);                             \
1078     uint32_t total_elems =                                    \
1079         vext_get_total_elems(env, desc, esz);                 \
1080     uint32_t vta = vext_vta(desc);                            \
1081     uint32_t i;                                               \
1082                                                               \
1083     for (i = env->vstart; i < vl; i++) {                      \
1084         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1085         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1086         ETYPE carry = vext_elem_mask(v0, i);                  \
1087                                                               \
1088         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1089     }                                                         \
1090     env->vstart = 0;                                          \
1091     /* set tail elements to 1s */                             \
1092     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1093 }
1094 
1095 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1097 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1098 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1099 
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1102 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1103 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104 
1105 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1106 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1107                   CPURISCVState *env, uint32_t desc)                     \
1108 {                                                                        \
1109     uint32_t vl = env->vl;                                               \
1110     uint32_t esz = sizeof(ETYPE);                                        \
1111     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1112     uint32_t vta = vext_vta(desc);                                       \
1113     uint32_t i;                                                          \
1114                                                                          \
1115     for (i = env->vstart; i < vl; i++) {                                 \
1116         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1117         ETYPE carry = vext_elem_mask(v0, i);                             \
1118                                                                          \
1119         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1120     }                                                                    \
1121     env->vstart = 0;                                                     \
1122     /* set tail elements to 1s */                                        \
1123     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1124 }
1125 
1126 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1128 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1129 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1130 
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1133 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1134 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135 
1136 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1137                           (__typeof(N))(N + M) < N)
1138 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
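
/*
 * Worked example (illustration only): for uint8_t operands N = 200 and
 * M = 100 with carry-in C = 0, N + M wraps to 44 < 200, so DO_MADC reports a
 * carry-out of 1; with C = 1 the sum wraps to 45 <= 200 and the result is
 * again 1.
 */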
1139 
1140 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1141 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1142                   CPURISCVState *env, uint32_t desc)          \
1143 {                                                             \
1144     uint32_t vl = env->vl;                                    \
1145     uint32_t vm = vext_vm(desc);                              \
1146     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1147     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1148     uint32_t i;                                               \
1149                                                               \
1150     for (i = env->vstart; i < vl; i++) {                      \
1151         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1152         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1153         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1154         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1155     }                                                         \
1156     env->vstart = 0;                                          \
1157     /* mask destination register is always tail-agnostic */   \
1158     /* set tail elements to 1s */                             \
1159     if (vta_all_1s) {                                         \
1160         for (; i < total_elems; i++) {                        \
1161             vext_set_elem_mask(vd, i, 1);                     \
1162         }                                                     \
1163     }                                                         \
1164 }
1165 
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1168 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1169 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1170 
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1173 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1174 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175 
1176 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1177 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1178                   void *vs2, CPURISCVState *env, uint32_t desc) \
1179 {                                                               \
1180     uint32_t vl = env->vl;                                      \
1181     uint32_t vm = vext_vm(desc);                                \
1182     uint32_t total_elems = env_archcpu(env)->cfg.vlen;          \
1183     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1184     uint32_t i;                                                 \
1185                                                                 \
1186     for (i = env->vstart; i < vl; i++) {                        \
1187         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1188         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1189         vext_set_elem_mask(vd, i,                               \
1190                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1191     }                                                           \
1192     env->vstart = 0;                                            \
1193     /* mask destination register is always tail-agnostic */     \
1194     /* set tail elements to 1s */                               \
1195     if (vta_all_1s) {                                           \
1196         for (; i < total_elems; i++) {                          \
1197             vext_set_elem_mask(vd, i, 1);                       \
1198         }                                                       \
1199     }                                                           \
1200 }
1201 
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1204 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1205 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1206 
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1209 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1210 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211 
1212 /* Vector Bitwise Logical Instructions */
1213 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1215 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1216 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1217 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1219 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1220 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1221 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1223 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1224 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1225 GEN_VEXT_VV(vand_vv_b, 1)
1226 GEN_VEXT_VV(vand_vv_h, 2)
1227 GEN_VEXT_VV(vand_vv_w, 4)
1228 GEN_VEXT_VV(vand_vv_d, 8)
1229 GEN_VEXT_VV(vor_vv_b, 1)
1230 GEN_VEXT_VV(vor_vv_h, 2)
1231 GEN_VEXT_VV(vor_vv_w, 4)
1232 GEN_VEXT_VV(vor_vv_d, 8)
1233 GEN_VEXT_VV(vxor_vv_b, 1)
1234 GEN_VEXT_VV(vxor_vv_h, 2)
1235 GEN_VEXT_VV(vxor_vv_w, 4)
1236 GEN_VEXT_VV(vxor_vv_d, 8)
1237 
1238 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1240 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1241 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1242 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1244 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1245 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1246 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1248 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1249 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1250 GEN_VEXT_VX(vand_vx_b, 1)
1251 GEN_VEXT_VX(vand_vx_h, 2)
1252 GEN_VEXT_VX(vand_vx_w, 4)
1253 GEN_VEXT_VX(vand_vx_d, 8)
1254 GEN_VEXT_VX(vor_vx_b, 1)
1255 GEN_VEXT_VX(vor_vx_h, 2)
1256 GEN_VEXT_VX(vor_vx_w, 4)
1257 GEN_VEXT_VX(vor_vx_d, 8)
1258 GEN_VEXT_VX(vxor_vx_b, 1)
1259 GEN_VEXT_VX(vxor_vx_h, 2)
1260 GEN_VEXT_VX(vxor_vx_w, 4)
1261 GEN_VEXT_VX(vxor_vx_d, 8)
1262 
1263 /* Vector Single-Width Bit Shift Instructions */
1264 #define DO_SLL(N, M)  (N << (M))
1265 #define DO_SRL(N, M)  (N >> (M))
1266 
1267 /* generate the helpers for shift instructions with two vector operands */
1268 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1269 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1270                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1271 {                                                                         \
1272     uint32_t vm = vext_vm(desc);                                          \
1273     uint32_t vl = env->vl;                                                \
1274     uint32_t esz = sizeof(TS1);                                           \
1275     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1276     uint32_t vta = vext_vta(desc);                                        \
1277     uint32_t i;                                                           \
1278                                                                           \
1279     for (i = env->vstart; i < vl; i++) {                                  \
1280         if (!vm && !vext_elem_mask(v0, i)) {                              \
1281             continue;                                                     \
1282         }                                                                 \
1283         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1284         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1285         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1286     }                                                                     \
1287     env->vstart = 0;                                                      \
1288     /* set tail elements to 1s */                                         \
1289     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1290 }
1291 
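/*
 * Illustrative note: MASK is SEW - 1, so only the low log2(SEW) bits of the
 * vs1 element select the shift amount, as the RVV spec requires.  For
 * example, for vsll_vv_w (SEW = 32, MASK = 0x1f) a shift operand of 35
 * (0b100011) yields 35 & 0x1f = 3, i.e. a left shift by 3.
 */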
1292 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1293 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1294 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1296 
1297 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1301 
1302 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1303 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1304 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1306 
1307 /* generate the helpers for shift instructions with one vector and one scalar */
1308 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1309 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1310         void *vs2, CPURISCVState *env, uint32_t desc)       \
1311 {                                                           \
1312     uint32_t vm = vext_vm(desc);                            \
1313     uint32_t vl = env->vl;                                  \
1314     uint32_t esz = sizeof(TD);                              \
1315     uint32_t total_elems =                                  \
1316         vext_get_total_elems(env, desc, esz);               \
1317     uint32_t vta = vext_vta(desc);                          \
1318     uint32_t i;                                             \
1319                                                             \
1320     for (i = env->vstart; i < vl; i++) {                    \
1321         if (!vm && !vext_elem_mask(v0, i)) {                \
1322             continue;                                       \
1323         }                                                   \
1324         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1325         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1326     }                                                       \
1327     env->vstart = 0;                                        \
1328     /* set tail elements to 1s */                           \
1329     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1330 }
1331 
1332 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1333 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1336 
1337 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341 
1342 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1346 
1347 /* Vector Narrowing Integer Right Shift Instructions */
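/*
 * Descriptive note: the narrowing forms reuse the shift templates with a
 * source element type twice as wide as the destination and a MASK of
 * 2 * SEW - 1.  E.g. vnsrl_wv_b below reads uint16_t elements from vs2,
 * masks the shift amount with 0xf, and stores uint8_t results.
 */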
1348 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1351 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1352 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1353 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1354 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1355 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1356 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1357 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1358 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1359 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1360 
1361 /* Vector Integer Comparison Instructions */
1362 #define DO_MSEQ(N, M) (N == M)
1363 #define DO_MSNE(N, M) (N != M)
1364 #define DO_MSLT(N, M) (N < M)
1365 #define DO_MSLE(N, M) (N <= M)
1366 #define DO_MSGT(N, M) (N > M)
1367 
1368 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1369 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1370                   CPURISCVState *env, uint32_t desc)          \
1371 {                                                             \
1372     uint32_t vm = vext_vm(desc);                              \
1373     uint32_t vl = env->vl;                                    \
1374     uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
1375     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1376     uint32_t i;                                               \
1377                                                               \
1378     for (i = env->vstart; i < vl; i++) {                      \
1379         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1380         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1381         if (!vm && !vext_elem_mask(v0, i)) {                  \
1382             continue;                                         \
1383         }                                                     \
1384         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1385     }                                                         \
1386     env->vstart = 0;                                          \
1387     /* mask destination register is always tail-agnostic */   \
1388     /* set tail elements to 1s */                             \
1389     if (vta_all_1s) {                                         \
1390         for (; i < total_elems; i++) {                        \
1391             vext_set_elem_mask(vd, i, 1);                     \
1392         }                                                     \
1393     }                                                         \
1394 }
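/*
 * Descriptive note: unlike the arithmetic helpers above, the comparison
 * helpers produce a mask register, i.e. one result bit per element written
 * via vext_set_elem_mask().  The tail loop therefore runs out to cfg.vlen
 * bits, and when vta_all_1s is set those tail bits are written as 1s.
 */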
1395 
1396 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1397 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1398 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1399 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1400 
1401 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1402 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1403 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1404 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1405 
1406 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1407 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1408 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1409 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1410 
1411 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1412 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1413 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1414 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1415 
1416 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1417 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1418 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1419 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1420 
1421 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1422 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1423 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1424 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1425 
1426 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1427 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1428                   CPURISCVState *env, uint32_t desc)                \
1429 {                                                                   \
1430     uint32_t vm = vext_vm(desc);                                    \
1431     uint32_t vl = env->vl;                                          \
1432     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1433     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1434     uint32_t i;                                                     \
1435                                                                     \
1436     for (i = env->vstart; i < vl; i++) {                            \
1437         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1438         if (!vm && !vext_elem_mask(v0, i)) {                        \
1439             continue;                                               \
1440         }                                                           \
1441         vext_set_elem_mask(vd, i,                                   \
1442                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1443     }                                                               \
1444     env->vstart = 0;                                                \
1445     /* mask destination register is always tail-agnostic */         \
1446     /* set tail elements to 1s */                                   \
1447     if (vta_all_1s) {                                               \
1448         for (; i < total_elems; i++) {                              \
1449             vext_set_elem_mask(vd, i, 1);                           \
1450         }                                                           \
1451     }                                                               \
1452 }
1453 
1454 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1455 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1456 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1457 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1458 
1459 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1460 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1461 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1462 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1463 
1464 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1465 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1466 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1467 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1468 
1469 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1470 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1471 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1472 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1473 
1474 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1475 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1476 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1477 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1478 
1479 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1480 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1481 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1482 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1483 
1484 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1485 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1486 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1487 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1488 
1489 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1490 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1491 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1492 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1493 
1494 /* Vector Integer Min/Max Instructions */
1495 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1496 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1497 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1498 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1499 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1500 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1501 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1502 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1503 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1504 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1505 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1506 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1507 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1508 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1509 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1510 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1511 GEN_VEXT_VV(vminu_vv_b, 1)
1512 GEN_VEXT_VV(vminu_vv_h, 2)
1513 GEN_VEXT_VV(vminu_vv_w, 4)
1514 GEN_VEXT_VV(vminu_vv_d, 8)
1515 GEN_VEXT_VV(vmin_vv_b, 1)
1516 GEN_VEXT_VV(vmin_vv_h, 2)
1517 GEN_VEXT_VV(vmin_vv_w, 4)
1518 GEN_VEXT_VV(vmin_vv_d, 8)
1519 GEN_VEXT_VV(vmaxu_vv_b, 1)
1520 GEN_VEXT_VV(vmaxu_vv_h, 2)
1521 GEN_VEXT_VV(vmaxu_vv_w, 4)
1522 GEN_VEXT_VV(vmaxu_vv_d, 8)
1523 GEN_VEXT_VV(vmax_vv_b, 1)
1524 GEN_VEXT_VV(vmax_vv_h, 2)
1525 GEN_VEXT_VV(vmax_vv_w, 4)
1526 GEN_VEXT_VV(vmax_vv_d, 8)
1527 
1528 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1529 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1530 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1531 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1532 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1533 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1534 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1535 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1536 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1537 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1538 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1539 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1540 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1541 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1542 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1543 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1544 GEN_VEXT_VX(vminu_vx_b, 1)
1545 GEN_VEXT_VX(vminu_vx_h, 2)
1546 GEN_VEXT_VX(vminu_vx_w, 4)
1547 GEN_VEXT_VX(vminu_vx_d, 8)
1548 GEN_VEXT_VX(vmin_vx_b, 1)
1549 GEN_VEXT_VX(vmin_vx_h, 2)
1550 GEN_VEXT_VX(vmin_vx_w, 4)
1551 GEN_VEXT_VX(vmin_vx_d, 8)
1552 GEN_VEXT_VX(vmaxu_vx_b, 1)
1553 GEN_VEXT_VX(vmaxu_vx_h, 2)
1554 GEN_VEXT_VX(vmaxu_vx_w, 4)
1555 GEN_VEXT_VX(vmaxu_vx_d, 8)
1556 GEN_VEXT_VX(vmax_vx_b, 1)
1557 GEN_VEXT_VX(vmax_vx_h, 2)
1558 GEN_VEXT_VX(vmax_vx_w, 4)
1559 GEN_VEXT_VX(vmax_vx_d, 8)
1560 
1561 /* Vector Single-Width Integer Multiply Instructions */
1562 #define DO_MUL(N, M) (N * M)
1563 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1564 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1565 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1566 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1567 GEN_VEXT_VV(vmul_vv_b, 1)
1568 GEN_VEXT_VV(vmul_vv_h, 2)
1569 GEN_VEXT_VV(vmul_vv_w, 4)
1570 GEN_VEXT_VV(vmul_vv_d, 8)
1571 
1572 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1573 {
1574     return (int16_t)s2 * (int16_t)s1 >> 8;
1575 }
1576 
1577 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1578 {
1579     return (int32_t)s2 * (int32_t)s1 >> 16;
1580 }
1581 
1582 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1583 {
1584     return (int64_t)s2 * (int64_t)s1 >> 32;
1585 }
1586 
1587 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1588 {
1589     uint64_t hi_64, lo_64;
1590 
1591     muls64(&lo_64, &hi_64, s1, s2);
1592     return hi_64;
1593 }
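/*
 * Worked example (illustrative): do_mulh_b(-128, 3) forms the 16-bit
 * product -384 (0xfe80) and arithmetic-shifts it right by 8, giving -2
 * (0xfe), which is exactly the high byte of the product.
 */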
1594 
1595 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1596 {
1597     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1598 }
1599 
1600 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1601 {
1602     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1603 }
1604 
1605 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1606 {
1607     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1608 }
1609 
1610 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1611 {
1612     uint64_t hi_64, lo_64;
1613 
1614     mulu64(&lo_64, &hi_64, s2, s1);
1615     return hi_64;
1616 }
1617 
1618 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1619 {
1620     return (int16_t)s2 * (uint16_t)s1 >> 8;
1621 }
1622 
1623 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1624 {
1625     return (int32_t)s2 * (uint32_t)s1 >> 16;
1626 }
1627 
1628 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1629 {
1630     return (int64_t)s2 * (uint64_t)s1 >> 32;
1631 }
1632 
1633 /*
1634  * vmulhsu: high half of a signed * unsigned multiply.
1635  *
1636  * Let  A = the operand treated as signed (its raw 64-bit pattern),
1637  *      B = the unsigned operand,
1638  *      P = mulu64(A, B), the unsigned 128-bit product,
1639  *      SP = the desired signed * unsigned product.
1640  *
1641  * If A's sign bit is set, its signed value is A - 2 ** 64, so
1642  *      SP = (A - 2 ** 64) * B
1643  *         = A * B - 2 ** 64 * B
1644  *         = P - 2 ** 64 * B
1645  * otherwise
1646  *      SP = P
1647  *
1648  * Therefore only the high half needs a fixup:
1649  *      HI_P -= (A < 0 ? B : 0)
1650  */
1651 
1652 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1653 {
1654     uint64_t hi_64, lo_64;
1655 
1656     mulu64(&lo_64, &hi_64, s2, s1);
1657 
1658     hi_64 -= s2 < 0 ? s1 : 0;
1659     return hi_64;
1660 }
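/*
 * Quick sanity check of the fixup above (illustrative): s2 = -1 (pattern
 * 2^64 - 1), s1 = 2.  mulu64 gives P = 2^65 - 2, so hi_64 = 1.  Since
 * s2 < 0 we subtract s1, giving hi_64 = -1, which is indeed the high half
 * of the true signed product -2 (0xff..ff_ff..fe as 128 bits).
 */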
1661 
1662 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1663 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1664 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1665 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1666 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1667 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1668 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1669 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1670 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1671 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1672 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1673 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1674 GEN_VEXT_VV(vmulh_vv_b, 1)
1675 GEN_VEXT_VV(vmulh_vv_h, 2)
1676 GEN_VEXT_VV(vmulh_vv_w, 4)
1677 GEN_VEXT_VV(vmulh_vv_d, 8)
1678 GEN_VEXT_VV(vmulhu_vv_b, 1)
1679 GEN_VEXT_VV(vmulhu_vv_h, 2)
1680 GEN_VEXT_VV(vmulhu_vv_w, 4)
1681 GEN_VEXT_VV(vmulhu_vv_d, 8)
1682 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1683 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1684 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1685 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1686 
1687 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1688 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1689 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1690 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1691 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1692 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1693 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1694 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1695 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1696 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1697 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1698 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1699 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1700 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1701 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1702 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1703 GEN_VEXT_VX(vmul_vx_b, 1)
1704 GEN_VEXT_VX(vmul_vx_h, 2)
1705 GEN_VEXT_VX(vmul_vx_w, 4)
1706 GEN_VEXT_VX(vmul_vx_d, 8)
1707 GEN_VEXT_VX(vmulh_vx_b, 1)
1708 GEN_VEXT_VX(vmulh_vx_h, 2)
1709 GEN_VEXT_VX(vmulh_vx_w, 4)
1710 GEN_VEXT_VX(vmulh_vx_d, 8)
1711 GEN_VEXT_VX(vmulhu_vx_b, 1)
1712 GEN_VEXT_VX(vmulhu_vx_h, 2)
1713 GEN_VEXT_VX(vmulhu_vx_w, 4)
1714 GEN_VEXT_VX(vmulhu_vx_d, 8)
1715 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1716 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1717 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1718 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1719 
1720 /* Vector Integer Divide Instructions */
1721 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1722 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1723 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) :\
1724         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1725 #define DO_REM(N, M)  (unlikely(M == 0) ? N :\
1726         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
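/*
 * These macros encode the architected RISC-V results for the corner cases:
 * division by zero returns all ones (-1) for the quotient and the dividend
 * for the remainder, and signed overflow (e.g. -128 / -1 at SEW=8, caught
 * by the N == -N check) returns the dividend (the most negative value) for
 * the quotient and 0 for the remainder; no trap is ever taken.
 */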
1727 
1728 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1729 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1730 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1731 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1732 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1733 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1734 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1735 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1736 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1737 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1738 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1739 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1740 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1741 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1742 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1743 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1744 GEN_VEXT_VV(vdivu_vv_b, 1)
1745 GEN_VEXT_VV(vdivu_vv_h, 2)
1746 GEN_VEXT_VV(vdivu_vv_w, 4)
1747 GEN_VEXT_VV(vdivu_vv_d, 8)
1748 GEN_VEXT_VV(vdiv_vv_b, 1)
1749 GEN_VEXT_VV(vdiv_vv_h, 2)
1750 GEN_VEXT_VV(vdiv_vv_w, 4)
1751 GEN_VEXT_VV(vdiv_vv_d, 8)
1752 GEN_VEXT_VV(vremu_vv_b, 1)
1753 GEN_VEXT_VV(vremu_vv_h, 2)
1754 GEN_VEXT_VV(vremu_vv_w, 4)
1755 GEN_VEXT_VV(vremu_vv_d, 8)
1756 GEN_VEXT_VV(vrem_vv_b, 1)
1757 GEN_VEXT_VV(vrem_vv_h, 2)
1758 GEN_VEXT_VV(vrem_vv_w, 4)
1759 GEN_VEXT_VV(vrem_vv_d, 8)
1760 
1761 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1762 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1763 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1764 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1765 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1766 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1767 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1768 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1769 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1770 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1771 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1772 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1773 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1774 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1775 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1776 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1777 GEN_VEXT_VX(vdivu_vx_b, 1)
1778 GEN_VEXT_VX(vdivu_vx_h, 2)
1779 GEN_VEXT_VX(vdivu_vx_w, 4)
1780 GEN_VEXT_VX(vdivu_vx_d, 8)
1781 GEN_VEXT_VX(vdiv_vx_b, 1)
1782 GEN_VEXT_VX(vdiv_vx_h, 2)
1783 GEN_VEXT_VX(vdiv_vx_w, 4)
1784 GEN_VEXT_VX(vdiv_vx_d, 8)
1785 GEN_VEXT_VX(vremu_vx_b, 1)
1786 GEN_VEXT_VX(vremu_vx_h, 2)
1787 GEN_VEXT_VX(vremu_vx_w, 4)
1788 GEN_VEXT_VX(vremu_vx_d, 8)
1789 GEN_VEXT_VX(vrem_vx_b, 1)
1790 GEN_VEXT_VX(vrem_vx_h, 2)
1791 GEN_VEXT_VX(vrem_vx_w, 4)
1792 GEN_VEXT_VX(vrem_vx_d, 8)
1793 
1794 /* Vector Widening Integer Multiply Instructions */
1795 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1796 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1797 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1798 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1799 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1800 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1801 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1802 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1803 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1804 GEN_VEXT_VV(vwmul_vv_b, 2)
1805 GEN_VEXT_VV(vwmul_vv_h, 4)
1806 GEN_VEXT_VV(vwmul_vv_w, 8)
1807 GEN_VEXT_VV(vwmulu_vv_b, 2)
1808 GEN_VEXT_VV(vwmulu_vv_h, 4)
1809 GEN_VEXT_VV(vwmulu_vv_w, 8)
1810 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1811 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1812 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1813 
1814 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1815 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1816 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1817 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1818 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1819 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1820 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1821 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1822 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1823 GEN_VEXT_VX(vwmul_vx_b, 2)
1824 GEN_VEXT_VX(vwmul_vx_h, 4)
1825 GEN_VEXT_VX(vwmul_vx_w, 8)
1826 GEN_VEXT_VX(vwmulu_vx_b, 2)
1827 GEN_VEXT_VX(vwmulu_vx_h, 4)
1828 GEN_VEXT_VX(vwmulu_vx_w, 8)
1829 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1830 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1831 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1832 
1833 /* Vector Single-Width Integer Multiply-Add Instructions */
1834 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
1835 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1836 {                                                                  \
1837     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1838     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1839     TD d = *((TD *)vd + HD(i));                                    \
1840     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1841 }
1842 
1843 #define DO_MACC(N, M, D) (M * N + D)
1844 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1845 #define DO_MADD(N, M, D) (M * D + N)
1846 #define DO_NMSUB(N, M, D) (-(M * D) + N)
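/*
 * Descriptive note on the operand roles: OP is invoked as OP(s2, s1, d),
 * so with N = vs2, M = vs1 and D = vd the four macros implement
 *     vmacc:  vd = vs1 * vs2 + vd        vnmsac: vd = -(vs1 * vs2) + vd
 *     vmadd:  vd = vs1 * vd + vs2        vnmsub: vd = -(vs1 * vd) + vs2
 * matching the overwrite-addend vs. overwrite-multiplicand split in the
 * RVV spec.
 */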
1847 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1848 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1849 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1850 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1851 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1852 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1853 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1854 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1855 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1856 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1857 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1858 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1859 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1860 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1861 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1862 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1863 GEN_VEXT_VV(vmacc_vv_b, 1)
1864 GEN_VEXT_VV(vmacc_vv_h, 2)
1865 GEN_VEXT_VV(vmacc_vv_w, 4)
1866 GEN_VEXT_VV(vmacc_vv_d, 8)
1867 GEN_VEXT_VV(vnmsac_vv_b, 1)
1868 GEN_VEXT_VV(vnmsac_vv_h, 2)
1869 GEN_VEXT_VV(vnmsac_vv_w, 4)
1870 GEN_VEXT_VV(vnmsac_vv_d, 8)
1871 GEN_VEXT_VV(vmadd_vv_b, 1)
1872 GEN_VEXT_VV(vmadd_vv_h, 2)
1873 GEN_VEXT_VV(vmadd_vv_w, 4)
1874 GEN_VEXT_VV(vmadd_vv_d, 8)
1875 GEN_VEXT_VV(vnmsub_vv_b, 1)
1876 GEN_VEXT_VV(vnmsub_vv_h, 2)
1877 GEN_VEXT_VV(vnmsub_vv_w, 4)
1878 GEN_VEXT_VV(vnmsub_vv_d, 8)
1879 
1880 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1881 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1882 {                                                                   \
1883     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1884     TD d = *((TD *)vd + HD(i));                                     \
1885     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1886 }
1887 
1888 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1889 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1890 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1891 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1892 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1897 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1898 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1899 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1900 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VX(vmacc_vx_b, 1)
1905 GEN_VEXT_VX(vmacc_vx_h, 2)
1906 GEN_VEXT_VX(vmacc_vx_w, 4)
1907 GEN_VEXT_VX(vmacc_vx_d, 8)
1908 GEN_VEXT_VX(vnmsac_vx_b, 1)
1909 GEN_VEXT_VX(vnmsac_vx_h, 2)
1910 GEN_VEXT_VX(vnmsac_vx_w, 4)
1911 GEN_VEXT_VX(vnmsac_vx_d, 8)
1912 GEN_VEXT_VX(vmadd_vx_b, 1)
1913 GEN_VEXT_VX(vmadd_vx_h, 2)
1914 GEN_VEXT_VX(vmadd_vx_w, 4)
1915 GEN_VEXT_VX(vmadd_vx_d, 8)
1916 GEN_VEXT_VX(vnmsub_vx_b, 1)
1917 GEN_VEXT_VX(vnmsub_vx_h, 2)
1918 GEN_VEXT_VX(vnmsub_vx_w, 4)
1919 GEN_VEXT_VX(vnmsub_vx_d, 8)
1920 
1921 /* Vector Widening Integer Multiply-Add Instructions */
1922 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1923 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1924 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1925 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1926 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1927 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1928 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1929 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1930 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1931 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1932 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1933 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1934 GEN_VEXT_VV(vwmacc_vv_b, 2)
1935 GEN_VEXT_VV(vwmacc_vv_h, 4)
1936 GEN_VEXT_VV(vwmacc_vv_w, 8)
1937 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1938 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1939 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1940 
1941 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1942 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1943 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1944 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1945 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1946 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1947 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1948 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1949 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1950 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1951 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1952 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1953 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1954 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1955 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1956 GEN_VEXT_VX(vwmacc_vx_b, 2)
1957 GEN_VEXT_VX(vwmacc_vx_h, 4)
1958 GEN_VEXT_VX(vwmacc_vx_w, 8)
1959 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1960 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1961 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1962 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1963 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1964 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1965 
1966 /* Vector Integer Merge and Move Instructions */
1967 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1968 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1969                   uint32_t desc)                                     \
1970 {                                                                    \
1971     uint32_t vl = env->vl;                                           \
1972     uint32_t esz = sizeof(ETYPE);                                    \
1973     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1974     uint32_t vta = vext_vta(desc);                                   \
1975     uint32_t i;                                                      \
1976                                                                      \
1977     for (i = env->vstart; i < vl; i++) {                             \
1978         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1979         *((ETYPE *)vd + H(i)) = s1;                                  \
1980     }                                                                \
1981     env->vstart = 0;                                                 \
1982     /* set tail elements to 1s */                                    \
1983     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1984 }
1985 
1986 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1987 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1988 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1989 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1990 
1991 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1992 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1993                   uint32_t desc)                                     \
1994 {                                                                    \
1995     uint32_t vl = env->vl;                                           \
1996     uint32_t esz = sizeof(ETYPE);                                    \
1997     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1998     uint32_t vta = vext_vta(desc);                                   \
1999     uint32_t i;                                                      \
2000                                                                      \
2001     for (i = env->vstart; i < vl; i++) {                             \
2002         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2003     }                                                                \
2004     env->vstart = 0;                                                 \
2005     /* set tail elements to 1s */                                    \
2006     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2007 }
2008 
2009 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2010 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2011 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2012 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2013 
2014 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2015 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2016                   CPURISCVState *env, uint32_t desc)                 \
2017 {                                                                    \
2018     uint32_t vl = env->vl;                                           \
2019     uint32_t esz = sizeof(ETYPE);                                    \
2020     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2021     uint32_t vta = vext_vta(desc);                                   \
2022     uint32_t i;                                                      \
2023                                                                      \
2024     for (i = env->vstart; i < vl; i++) {                             \
2025         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2026         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2027     }                                                                \
2028     env->vstart = 0;                                                 \
2029     /* set tail elements to 1s */                                    \
2030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2031 }
2032 
2033 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2034 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2035 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2036 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2037 
2038 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2039 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2040                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2041 {                                                                    \
2042     uint32_t vl = env->vl;                                           \
2043     uint32_t esz = sizeof(ETYPE);                                    \
2044     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2045     uint32_t vta = vext_vta(desc);                                   \
2046     uint32_t i;                                                      \
2047                                                                      \
2048     for (i = env->vstart; i < vl; i++) {                             \
2049         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2050         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2051                    (ETYPE)(target_long)s1);                          \
2052         *((ETYPE *)vd + H(i)) = d;                                   \
2053     }                                                                \
2054     env->vstart = 0;                                                 \
2055     /* set tail elements to 1s */                                    \
2056     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2057 }
2058 
2059 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2060 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2061 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2062 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2063 
2064 /*
2065  *** Vector Fixed-Point Arithmetic Instructions
2066  */
2067 
2068 /* Vector Single-Width Saturating Add and Subtract */
2069 
2070 /*
2071  * Fixed-point instructions involve a rounding mode and/or saturation,
2072  * so define the common fixed-point macros and helpers here.
2073  */
2074 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2075                           CPURISCVState *env, int vxrm);
2076 
2077 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2078 static inline void                                                  \
2079 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2080           CPURISCVState *env, int vxrm)                             \
2081 {                                                                   \
2082     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2083     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2084     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2085 }
2086 
2087 static inline void
2088 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2089              CPURISCVState *env,
2090              uint32_t vl, uint32_t vm, int vxrm,
2091              opivv2_rm_fn *fn)
2092 {
2093     for (uint32_t i = env->vstart; i < vl; i++) {
2094         if (!vm && !vext_elem_mask(v0, i)) {
2095             continue;
2096         }
2097         fn(vd, vs1, vs2, i, env, vxrm);
2098     }
2099     env->vstart = 0;
2100 }
2101 
2102 static inline void
2103 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2104              CPURISCVState *env,
2105              uint32_t desc,
2106              opivv2_rm_fn *fn)
2107 {
2108     uint32_t vm = vext_vm(desc);
2109     uint32_t vl = env->vl;
2110 
2111     switch (env->vxrm) {
2112     case 0: /* rnu */
2113         vext_vv_rm_1(vd, v0, vs1, vs2,
2114                      env, vl, vm, 0, fn);
2115         break;
2116     case 1: /* rne */
2117         vext_vv_rm_1(vd, v0, vs1, vs2,
2118                      env, vl, vm, 1, fn);
2119         break;
2120     case 2: /* rdn */
2121         vext_vv_rm_1(vd, v0, vs1, vs2,
2122                      env, vl, vm, 2, fn);
2123         break;
2124     default: /* rod */
2125         vext_vv_rm_1(vd, v0, vs1, vs2,
2126                      env, vl, vm, 3, fn);
2127         break;
2128     }
2129 }
2130 
2131 /* generate helpers for fixed point instructions with OPIVV format */
2132 #define GEN_VEXT_VV_RM(NAME)                                    \
2133 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2134                   CPURISCVState *env, uint32_t desc)            \
2135 {                                                               \
2136     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2137                  do_##NAME);                                    \
2138 }
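/*
 * A sketch of what one instantiation expands to (illustrative, using the
 * vsaddu_vv_b helpers defined below, and assuming OP_UUU_B, defined earlier
 * in this file, maps every element type to uint8_t): OPIVV2_RM produces the
 * per-element function and GEN_VEXT_VV_RM wraps it:
 *
 *     static inline void
 *     do_vsaddu_vv_b(void *vd, void *vs1, void *vs2, int i,
 *                    CPURISCVState *env, int vxrm)
 *     {
 *         uint8_t s1 = *((uint8_t *)vs1 + H1(i));
 *         uint8_t s2 = *((uint8_t *)vs2 + H1(i));
 *         *((uint8_t *)vd + H1(i)) = saddu8(env, vxrm, s2, s1);
 *     }
 *
 *     void HELPER(vsaddu_vv_b)(void *vd, void *v0, void *vs1, void *vs2,
 *                              CPURISCVState *env, uint32_t desc)
 *     {
 *         vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b);
 *     }
 *
 * vext_vv_rm_2() then dispatches on env->vxrm so that each call passes the
 * rounding mode as a constant to the per-element function.
 */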
2139 
2140 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2141 {
2142     uint8_t res = a + b;
2143     if (res < a) {
2144         res = UINT8_MAX;
2145         env->vxsat = 0x1;
2146     }
2147     return res;
2148 }
2149 
2150 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2151                                uint16_t b)
2152 {
2153     uint16_t res = a + b;
2154     if (res < a) {
2155         res = UINT16_MAX;
2156         env->vxsat = 0x1;
2157     }
2158     return res;
2159 }
2160 
2161 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2162                                uint32_t b)
2163 {
2164     uint32_t res = a + b;
2165     if (res < a) {
2166         res = UINT32_MAX;
2167         env->vxsat = 0x1;
2168     }
2169     return res;
2170 }
2171 
2172 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2173                                uint64_t b)
2174 {
2175     uint64_t res = a + b;
2176     if (res < a) {
2177         res = UINT64_MAX;
2178         env->vxsat = 0x1;
2179     }
2180     return res;
2181 }
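/*
 * Worked example (illustrative): saddu8(env, vxrm, 200, 100) computes
 * 200 + 100 = 300, which wraps to 44; since 44 < 200 the wrap-around is
 * detected, the result saturates to UINT8_MAX (255) and vxsat is set.
 * The same test works unchanged at every element width.
 */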
2182 
2183 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2184 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2185 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2186 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2187 GEN_VEXT_VV_RM(vsaddu_vv_b)
2188 GEN_VEXT_VV_RM(vsaddu_vv_h)
2189 GEN_VEXT_VV_RM(vsaddu_vv_w)
2190 GEN_VEXT_VV_RM(vsaddu_vv_d)
2191 
2192 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2193                           CPURISCVState *env, int vxrm);
2194 
2195 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2196 static inline void                                                  \
2197 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2198           CPURISCVState *env, int vxrm)                             \
2199 {                                                                   \
2200     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2201     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2202 }
2203 
2204 static inline void
2205 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2206              CPURISCVState *env,
2207              uint32_t vl, uint32_t vm, int vxrm,
2208              opivx2_rm_fn *fn)
2209 {
2210     for (uint32_t i = env->vstart; i < vl; i++) {
2211         if (!vm && !vext_elem_mask(v0, i)) {
2212             continue;
2213         }
2214         fn(vd, s1, vs2, i, env, vxrm);
2215     }
2216     env->vstart = 0;
2217 }
2218 
2219 static inline void
2220 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2221              CPURISCVState *env,
2222              uint32_t desc,
2223              opivx2_rm_fn *fn)
2224 {
2225     uint32_t vm = vext_vm(desc);
2226     uint32_t vl = env->vl;
2227 
2228     switch (env->vxrm) {
2229     case 0: /* rnu */
2230         vext_vx_rm_1(vd, v0, s1, vs2,
2231                      env, vl, vm, 0, fn);
2232         break;
2233     case 1: /* rne */
2234         vext_vx_rm_1(vd, v0, s1, vs2,
2235                      env, vl, vm, 1, fn);
2236         break;
2237     case 2: /* rdn */
2238         vext_vx_rm_1(vd, v0, s1, vs2,
2239                      env, vl, vm, 2, fn);
2240         break;
2241     default: /* rod */
2242         vext_vx_rm_1(vd, v0, s1, vs2,
2243                      env, vl, vm, 3, fn);
2244         break;
2245     }
2246 }
2247 
2248 /* generate helpers for fixed point instructions with OPIVX format */
2249 #define GEN_VEXT_VX_RM(NAME)                              \
2250 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2251         void *vs2, CPURISCVState *env, uint32_t desc)     \
2252 {                                                         \
2253     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2254                  do_##NAME);                              \
2255 }
2256 
2257 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2258 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2259 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2260 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2261 GEN_VEXT_VX_RM(vsaddu_vx_b)
2262 GEN_VEXT_VX_RM(vsaddu_vx_h)
2263 GEN_VEXT_VX_RM(vsaddu_vx_w)
2264 GEN_VEXT_VX_RM(vsaddu_vx_d)
2265 
2266 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2267 {
2268     int8_t res = a + b;
2269     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2270         res = a > 0 ? INT8_MAX : INT8_MIN;
2271         env->vxsat = 0x1;
2272     }
2273     return res;
2274 }
2275 
2276 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2277 {
2278     int16_t res = a + b;
2279     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2280         res = a > 0 ? INT16_MAX : INT16_MIN;
2281         env->vxsat = 0x1;
2282     }
2283     return res;
2284 }
2285 
2286 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2287 {
2288     int32_t res = a + b;
2289     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2290         res = a > 0 ? INT32_MAX : INT32_MIN;
2291         env->vxsat = 0x1;
2292     }
2293     return res;
2294 }
2295 
2296 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2297 {
2298     int64_t res = a + b;
2299     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2300         res = a > 0 ? INT64_MAX : INT64_MIN;
2301         env->vxsat = 0x1;
2302     }
2303     return res;
2304 }
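/*
 * Worked example (illustrative): sadd8(env, vxrm, 100, 50) wraps to -106.
 * Both (res ^ a) and (res ^ b) have the sign bit set (the result's sign
 * differs from both operands'), so overflow is flagged; a > 0 selects
 * INT8_MAX, giving 127 with vxsat set.
 */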
2305 
2306 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2307 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2308 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2309 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2310 GEN_VEXT_VV_RM(vsadd_vv_b)
2311 GEN_VEXT_VV_RM(vsadd_vv_h)
2312 GEN_VEXT_VV_RM(vsadd_vv_w)
2313 GEN_VEXT_VV_RM(vsadd_vv_d)
2314 
2315 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2316 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2317 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2318 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2319 GEN_VEXT_VX_RM(vsadd_vx_b)
2320 GEN_VEXT_VX_RM(vsadd_vx_h)
2321 GEN_VEXT_VX_RM(vsadd_vx_w)
2322 GEN_VEXT_VX_RM(vsadd_vx_d)
2323 
2324 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2325 {
2326     uint8_t res = a - b;
2327     if (res > a) {
2328         res = 0;
2329         env->vxsat = 0x1;
2330     }
2331     return res;
2332 }
2333 
2334 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2335                                uint16_t b)
2336 {
2337     uint16_t res = a - b;
2338     if (res > a) {
2339         res = 0;
2340         env->vxsat = 0x1;
2341     }
2342     return res;
2343 }
2344 
2345 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2346                                uint32_t b)
2347 {
2348     uint32_t res = a - b;
2349     if (res > a) {
2350         res = 0;
2351         env->vxsat = 0x1;
2352     }
2353     return res;
2354 }
2355 
2356 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2357                                uint64_t b)
2358 {
2359     uint64_t res = a - b;
2360     if (res > a) {
2361         res = 0;
2362         env->vxsat = 0x1;
2363     }
2364     return res;
2365 }
2366 
2367 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2368 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2369 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2370 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2371 GEN_VEXT_VV_RM(vssubu_vv_b)
2372 GEN_VEXT_VV_RM(vssubu_vv_h)
2373 GEN_VEXT_VV_RM(vssubu_vv_w)
2374 GEN_VEXT_VV_RM(vssubu_vv_d)
2375 
2376 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2377 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2378 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2379 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2380 GEN_VEXT_VX_RM(vssubu_vx_b)
2381 GEN_VEXT_VX_RM(vssubu_vx_h)
2382 GEN_VEXT_VX_RM(vssubu_vx_w)
2383 GEN_VEXT_VX_RM(vssubu_vx_d)
2384 
2385 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2386 {
2387     int8_t res = a - b;
2388     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2389         res = a >= 0 ? INT8_MAX : INT8_MIN;
2390         env->vxsat = 0x1;
2391     }
2392     return res;
2393 }
2394 
2395 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2396 {
2397     int16_t res = a - b;
2398     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2399         res = a >= 0 ? INT16_MAX : INT16_MIN;
2400         env->vxsat = 0x1;
2401     }
2402     return res;
2403 }
2404 
2405 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2406 {
2407     int32_t res = a - b;
2408     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2409         res = a >= 0 ? INT32_MAX : INT32_MIN;
2410         env->vxsat = 0x1;
2411     }
2412     return res;
2413 }
2414 
2415 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2416 {
2417     int64_t res = a - b;
2418     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2419         res = a >= 0 ? INT64_MAX : INT64_MIN;
2420         env->vxsat = 0x1;
2421     }
2422     return res;
2423 }
2424 
2425 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2426 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2427 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2428 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2429 GEN_VEXT_VV_RM(vssub_vv_b)
2430 GEN_VEXT_VV_RM(vssub_vv_h)
2431 GEN_VEXT_VV_RM(vssub_vv_w)
2432 GEN_VEXT_VV_RM(vssub_vv_d)
2433 
2434 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2435 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2436 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2437 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2438 GEN_VEXT_VX_RM(vssub_vx_b)
2439 GEN_VEXT_VX_RM(vssub_vx_h)
2440 GEN_VEXT_VX_RM(vssub_vx_w)
2441 GEN_VEXT_VX_RM(vssub_vx_d)
2442 
2443 /* Vector Single-Width Averaging Add and Subtract */
2444 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2445 {
2446     uint8_t d = extract64(v, shift, 1);
2447     uint8_t d1;
2448     uint64_t D1, D2;
2449 
2450     if (shift == 0 || shift > 64) {
2451         return 0;
2452     }
2453 
2454     d1 = extract64(v, shift - 1, 1);
2455     D1 = extract64(v, 0, shift);
2456     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2457         return d1;
2458     } else if (vxrm == 1) { /* round-to-nearest-even */
2459         if (shift > 1) {
2460             D2 = extract64(v, 0, shift - 1);
2461             return d1 & ((D2 != 0) | d);
2462         } else {
2463             return d1 & d;
2464         }
2465     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2466         return !d & (D1 != 0);
2467     }
2468     return 0; /* round-down (truncate) */
2469 }
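/*
 * Worked example (illustrative): averaging 5 and 2 gives res = 7 (0b111)
 * with shift = 1.  The rounding increment is then
 *     rnu: d1 = bit 0 = 1              -> (7 >> 1) + 1 = 4
 *     rne: d1 & d = bit 0 & bit 1 = 1  -> 4
 *     rdn: 0                           -> 3
 *     rod: result already odd, so 0    -> 3
 */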
2470 
2471 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2472 {
2473     int64_t res = (int64_t)a + b;
2474     uint8_t round = get_round(vxrm, res, 1);
2475 
2476     return (res >> 1) + round;
2477 }
2478 
2479 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2480 {
2481     int64_t res = a + b;
2482     uint8_t round = get_round(vxrm, res, 1);
2483     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2484 
2485     /* With signed overflow, bit 64 is inverse of bit 63. */
2486     return ((res >> 1) ^ over) + round;
2487 }
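/*
 * Illustrative check of the overflow handling above: for a = b = INT64_MAX
 * the 64-bit sum wraps to -2, so res >> 1 is -1; but 'over' has bit 63 set
 * and the XOR restores the true average INT64_MAX.  Bit 0 of res is clear,
 * so get_round() adds nothing in any rounding mode.
 */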
2488 
2489 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2490 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2491 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2492 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2493 GEN_VEXT_VV_RM(vaadd_vv_b)
2494 GEN_VEXT_VV_RM(vaadd_vv_h)
2495 GEN_VEXT_VV_RM(vaadd_vv_w)
2496 GEN_VEXT_VV_RM(vaadd_vv_d)
2497 
2498 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2499 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2500 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2501 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2502 GEN_VEXT_VX_RM(vaadd_vx_b)
2503 GEN_VEXT_VX_RM(vaadd_vx_h)
2504 GEN_VEXT_VX_RM(vaadd_vx_w)
2505 GEN_VEXT_VX_RM(vaadd_vx_d)
2506 
2507 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2508                                uint32_t a, uint32_t b)
2509 {
2510     uint64_t res = (uint64_t)a + b;
2511     uint8_t round = get_round(vxrm, res, 1);
2512 
2513     return (res >> 1) + round;
2514 }
2515 
2516 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2517                                uint64_t a, uint64_t b)
2518 {
2519     uint64_t res = a + b;
2520     uint8_t round = get_round(vxrm, res, 1);
2521     uint64_t over = (uint64_t)(res < a) << 63;
2522 
2523     return ((res >> 1) | over) + round;
2524 }
2525 
2526 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2527 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2528 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2529 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2530 GEN_VEXT_VV_RM(vaaddu_vv_b)
2531 GEN_VEXT_VV_RM(vaaddu_vv_h)
2532 GEN_VEXT_VV_RM(vaaddu_vv_w)
2533 GEN_VEXT_VV_RM(vaaddu_vv_d)
2534 
2535 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2536 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2537 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2538 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2539 GEN_VEXT_VX_RM(vaaddu_vx_b)
2540 GEN_VEXT_VX_RM(vaaddu_vx_h)
2541 GEN_VEXT_VX_RM(vaaddu_vx_w)
2542 GEN_VEXT_VX_RM(vaaddu_vx_d)
2543 
2544 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2545 {
2546     int64_t res = (int64_t)a - b;
2547     uint8_t round = get_round(vxrm, res, 1);
2548 
2549     return (res >> 1) + round;
2550 }
2551 
2552 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2553 {
2554     int64_t res = (int64_t)a - b;
2555     uint8_t round = get_round(vxrm, res, 1);
2556     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2557 
2558     /* With signed overflow, bit 64 is inverse of bit 63. */
2559     return ((res >> 1) ^ over) + round;
2560 }
2561 
2562 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2563 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2564 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2565 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2566 GEN_VEXT_VV_RM(vasub_vv_b)
2567 GEN_VEXT_VV_RM(vasub_vv_h)
2568 GEN_VEXT_VV_RM(vasub_vv_w)
2569 GEN_VEXT_VV_RM(vasub_vv_d)
2570 
2571 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2572 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2573 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2574 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2575 GEN_VEXT_VX_RM(vasub_vx_b)
2576 GEN_VEXT_VX_RM(vasub_vx_h)
2577 GEN_VEXT_VX_RM(vasub_vx_w)
2578 GEN_VEXT_VX_RM(vasub_vx_d)
2579 
2580 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2581                                uint32_t a, uint32_t b)
2582 {
2583     int64_t res = (int64_t)a - b;
2584     uint8_t round = get_round(vxrm, res, 1);
2585 
2586     return (res >> 1) + round;
2587 }
2588 
2589 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2590                                uint64_t a, uint64_t b)
2591 {
2592     uint64_t res = (uint64_t)a - b;
2593     uint8_t round = get_round(vxrm, res, 1);
2594     uint64_t over = (uint64_t)(res > a) << 63;
2595 
2596     return ((res >> 1) | over) + round;
2597 }
2598 
2599 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2600 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2601 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2602 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2603 GEN_VEXT_VV_RM(vasubu_vv_b)
2604 GEN_VEXT_VV_RM(vasubu_vv_h)
2605 GEN_VEXT_VV_RM(vasubu_vv_w)
2606 GEN_VEXT_VV_RM(vasubu_vv_d)
2607 
2608 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2609 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2610 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2611 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2612 GEN_VEXT_VX_RM(vasubu_vx_b)
2613 GEN_VEXT_VX_RM(vasubu_vx_h)
2614 GEN_VEXT_VX_RM(vasubu_vx_w)
2615 GEN_VEXT_VX_RM(vasubu_vx_d)
2616 
2617 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
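/*
 * These helpers implement a signed fixed-point multiply: the operands are
 * treated as having SEW-1 fraction bits, so the 2*SEW-bit product is
 * shifted right by SEW-1, rounded per vxrm and saturated back to SEW bits.
 * Illustrative example with vxrm == 0 (round-to-nearest-up):
 *
 *     vsmul8(env, 0, 0x40, 0x40)          (0x40 is +0.5 in Q0.7)
 *         res    = 0x40 * 0x40 = 0x1000
 *         round  = bit 6 of res = 0
 *         result = 0x1000 >> 7 = 0x20     (+0.25, no saturation)
 *
 * The only product that cannot be represented is INT_MIN * INT_MIN,
 * i.e. (-1.0) * (-1.0) = +1.0, which saturates to INT_MAX; vsmul64()
 * below checks that case explicitly before using muls64() (from
 * "qemu/host-utils.h") to form the full 128-bit product in hi_64:lo_64.
 */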
2618 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2619 {
2620     uint8_t round;
2621     int16_t res;
2622 
2623     res = (int16_t)a * (int16_t)b;
2624     round = get_round(vxrm, res, 7);
2625     res   = (res >> 7) + round;
2626 
2627     if (res > INT8_MAX) {
2628         env->vxsat = 0x1;
2629         return INT8_MAX;
2630     } else if (res < INT8_MIN) {
2631         env->vxsat = 0x1;
2632         return INT8_MIN;
2633     } else {
2634         return res;
2635     }
2636 }
2637 
2638 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2639 {
2640     uint8_t round;
2641     int32_t res;
2642 
2643     res = (int32_t)a * (int32_t)b;
2644     round = get_round(vxrm, res, 15);
2645     res   = (res >> 15) + round;
2646 
2647     if (res > INT16_MAX) {
2648         env->vxsat = 0x1;
2649         return INT16_MAX;
2650     } else if (res < INT16_MIN) {
2651         env->vxsat = 0x1;
2652         return INT16_MIN;
2653     } else {
2654         return res;
2655     }
2656 }
2657 
2658 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2659 {
2660     uint8_t round;
2661     int64_t res;
2662 
2663     res = (int64_t)a * (int64_t)b;
2664     round = get_round(vxrm, res, 31);
2665     res   = (res >> 31) + round;
2666 
2667     if (res > INT32_MAX) {
2668         env->vxsat = 0x1;
2669         return INT32_MAX;
2670     } else if (res < INT32_MIN) {
2671         env->vxsat = 0x1;
2672         return INT32_MIN;
2673     } else {
2674         return res;
2675     }
2676 }
2677 
2678 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2679 {
2680     uint8_t round;
2681     uint64_t hi_64, lo_64;
2682     int64_t res;
2683 
2684     if (a == INT64_MIN && b == INT64_MIN) {
2685         env->vxsat = 1;
2686         return INT64_MAX;
2687     }
2688 
2689     muls64(&lo_64, &hi_64, a, b);
2690     round = get_round(vxrm, lo_64, 63);
2691     /*
2692      * Cannot overflow, as there are always
2693      * 2 sign bits after multiply.
2694      */
2695     res = (hi_64 << 1) | (lo_64 >> 63);
2696     if (round) {
2697         if (res == INT64_MAX) {
2698             env->vxsat = 1;
2699         } else {
2700             res += 1;
2701         }
2702     }
2703     return res;
2704 }
2705 
2706 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2707 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2708 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2709 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2710 GEN_VEXT_VV_RM(vsmul_vv_b)
2711 GEN_VEXT_VV_RM(vsmul_vv_h)
2712 GEN_VEXT_VV_RM(vsmul_vv_w)
2713 GEN_VEXT_VV_RM(vsmul_vv_d)
2714 
2715 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2716 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2717 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2718 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2719 GEN_VEXT_VX_RM(vsmul_vx_b)
2720 GEN_VEXT_VX_RM(vsmul_vx_h)
2721 GEN_VEXT_VX_RM(vsmul_vx_w)
2722 GEN_VEXT_VX_RM(vsmul_vx_d)
2723 
2724 /* Vector Single-Width Scaling Shift Instructions */
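/*
 * The scaling shifts round the bits shifted out according to vxrm; no
 * saturation is needed because a right shift cannot widen the result.
 * Illustrative example with vxrm == 0 (round-to-nearest-up):
 *
 *     vssra8(env, 0, -7, 1)
 *         shift  = 1
 *         round  = bit 0 of -7 (0xf9) = 1
 *         result = (-7 >> 1) + 1 = -4 + 1 = -3   == round-to-nearest(-3.5)
 */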
2725 static inline uint8_t
2726 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2727 {
2728     uint8_t round, shift = b & 0x7;
2729     uint8_t res;
2730 
2731     round = get_round(vxrm, a, shift);
2732     res   = (a >> shift)  + round;
2733     return res;
2734 }
2735 static inline uint16_t
2736 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2737 {
2738     uint8_t round, shift = b & 0xf;
2739     uint16_t res;
2740 
2741     round = get_round(vxrm, a, shift);
2742     res   = (a >> shift)  + round;
2743     return res;
2744 }
2745 static inline uint32_t
2746 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2747 {
2748     uint8_t round, shift = b & 0x1f;
2749     uint32_t res;
2750 
2751     round = get_round(vxrm, a, shift);
2752     res   = (a >> shift)  + round;
2753     return res;
2754 }
2755 static inline uint64_t
2756 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2757 {
2758     uint8_t round, shift = b & 0x3f;
2759     uint64_t res;
2760 
2761     round = get_round(vxrm, a, shift);
2762     res   = (a >> shift)  + round;
2763     return res;
2764 }
2765 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2766 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2767 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2768 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2769 GEN_VEXT_VV_RM(vssrl_vv_b)
2770 GEN_VEXT_VV_RM(vssrl_vv_h)
2771 GEN_VEXT_VV_RM(vssrl_vv_w)
2772 GEN_VEXT_VV_RM(vssrl_vv_d)
2773 
2774 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2775 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2776 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2777 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2778 GEN_VEXT_VX_RM(vssrl_vx_b)
2779 GEN_VEXT_VX_RM(vssrl_vx_h)
2780 GEN_VEXT_VX_RM(vssrl_vx_w)
2781 GEN_VEXT_VX_RM(vssrl_vx_d)
2782 
2783 static inline int8_t
2784 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2785 {
2786     uint8_t round, shift = b & 0x7;
2787     int8_t res;
2788 
2789     round = get_round(vxrm, a, shift);
2790     res   = (a >> shift)  + round;
2791     return res;
2792 }
2793 static inline int16_t
2794 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2795 {
2796     uint8_t round, shift = b & 0xf;
2797     int16_t res;
2798 
2799     round = get_round(vxrm, a, shift);
2800     res   = (a >> shift)  + round;
2801     return res;
2802 }
2803 static inline int32_t
2804 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2805 {
2806     uint8_t round, shift = b & 0x1f;
2807     int32_t res;
2808 
2809     round = get_round(vxrm, a, shift);
2810     res   = (a >> shift)  + round;
2811     return res;
2812 }
2813 static inline int64_t
2814 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2815 {
2816     uint8_t round, shift = b & 0x3f;
2817     int64_t res;
2818 
2819     round = get_round(vxrm, a, shift);
2820     res   = (a >> shift)  + round;
2821     return res;
2822 }
2823 
2824 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2825 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2826 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2827 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2828 GEN_VEXT_VV_RM(vssra_vv_b)
2829 GEN_VEXT_VV_RM(vssra_vv_h)
2830 GEN_VEXT_VV_RM(vssra_vv_w)
2831 GEN_VEXT_VV_RM(vssra_vv_d)
2832 
2833 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2834 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2835 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2836 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2837 GEN_VEXT_VX_RM(vssra_vx_b)
2838 GEN_VEXT_VX_RM(vssra_vx_h)
2839 GEN_VEXT_VX_RM(vssra_vx_w)
2840 GEN_VEXT_VX_RM(vssra_vx_d)
2841 
2842 /* Vector Narrowing Fixed-Point Clip Instructions */
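/*
 * The clips narrow a 2*SEW source to SEW with a scaling right shift,
 * rounding per vxrm and saturating to the narrower type (setting vxsat
 * on overflow).  Illustrative example with vxrm == 0:
 *
 *     vnclip8(env, 0, 0x1234, 4)
 *         shift  = 4
 *         round  = bit 3 of 0x1234 = 0
 *         res    = 0x1234 >> 4 = 0x123 = 291 > INT8_MAX
 *         result = INT8_MAX (127), vxsat = 1
 */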
2843 static inline int8_t
2844 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2845 {
2846     uint8_t round, shift = b & 0xf;
2847     int16_t res;
2848 
2849     round = get_round(vxrm, a, shift);
2850     res   = (a >> shift)  + round;
2851     if (res > INT8_MAX) {
2852         env->vxsat = 0x1;
2853         return INT8_MAX;
2854     } else if (res < INT8_MIN) {
2855         env->vxsat = 0x1;
2856         return INT8_MIN;
2857     } else {
2858         return res;
2859     }
2860 }
2861 
2862 static inline int16_t
2863 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2864 {
2865     uint8_t round, shift = b & 0x1f;
2866     int32_t res;
2867 
2868     round = get_round(vxrm, a, shift);
2869     res   = (a >> shift)  + round;
2870     if (res > INT16_MAX) {
2871         env->vxsat = 0x1;
2872         return INT16_MAX;
2873     } else if (res < INT16_MIN) {
2874         env->vxsat = 0x1;
2875         return INT16_MIN;
2876     } else {
2877         return res;
2878     }
2879 }
2880 
2881 static inline int32_t
2882 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2883 {
2884     uint8_t round, shift = b & 0x3f;
2885     int64_t res;
2886 
2887     round = get_round(vxrm, a, shift);
2888     res   = (a >> shift)  + round;
2889     if (res > INT32_MAX) {
2890         env->vxsat = 0x1;
2891         return INT32_MAX;
2892     } else if (res < INT32_MIN) {
2893         env->vxsat = 0x1;
2894         return INT32_MIN;
2895     } else {
2896         return res;
2897     }
2898 }
2899 
2900 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2901 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2902 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2903 GEN_VEXT_VV_RM(vnclip_wv_b)
2904 GEN_VEXT_VV_RM(vnclip_wv_h)
2905 GEN_VEXT_VV_RM(vnclip_wv_w)
2906 
2907 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2908 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2909 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2910 GEN_VEXT_VX_RM(vnclip_wx_b)
2911 GEN_VEXT_VX_RM(vnclip_wx_h)
2912 GEN_VEXT_VX_RM(vnclip_wx_w)
2913 
2914 static inline uint8_t
2915 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2916 {
2917     uint8_t round, shift = b & 0xf;
2918     uint16_t res;
2919 
2920     round = get_round(vxrm, a, shift);
2921     res   = (a >> shift)  + round;
2922     if (res > UINT8_MAX) {
2923         env->vxsat = 0x1;
2924         return UINT8_MAX;
2925     } else {
2926         return res;
2927     }
2928 }
2929 
2930 static inline uint16_t
2931 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2932 {
2933     uint8_t round, shift = b & 0x1f;
2934     uint32_t res;
2935 
2936     round = get_round(vxrm, a, shift);
2937     res   = (a >> shift)  + round;
2938     if (res > UINT16_MAX) {
2939         env->vxsat = 0x1;
2940         return UINT16_MAX;
2941     } else {
2942         return res;
2943     }
2944 }
2945 
2946 static inline uint32_t
2947 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2948 {
2949     uint8_t round, shift = b & 0x3f;
2950     uint64_t res;
2951 
2952     round = get_round(vxrm, a, shift);
2953     res   = (a >> shift)  + round;
2954     if (res > UINT32_MAX) {
2955         env->vxsat = 0x1;
2956         return UINT32_MAX;
2957     } else {
2958         return res;
2959     }
2960 }
2961 
2962 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2963 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2964 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2965 GEN_VEXT_VV_RM(vnclipu_wv_b)
2966 GEN_VEXT_VV_RM(vnclipu_wv_h)
2967 GEN_VEXT_VV_RM(vnclipu_wv_w)
2968 
2969 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2970 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2971 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2972 GEN_VEXT_VX_RM(vnclipu_wx_b)
2973 GEN_VEXT_VX_RM(vnclipu_wx_h)
2974 GEN_VEXT_VX_RM(vnclipu_wx_w)
2975 
2976 /*
2977  *** Vector Floating-Point Arithmetic Instructions
2978  */
2979 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
2980 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
2981 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
2982                       CPURISCVState *env)                      \
2983 {                                                              \
2984     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
2985     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
2986     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
2987 }
2988 
2989 #define GEN_VEXT_VV_ENV(NAME)                             \
2990 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
2991                   void *vs2, CPURISCVState *env,          \
2992                   uint32_t desc)                          \
2993 {                                                         \
2994     uint32_t vm = vext_vm(desc);                          \
2995     uint32_t vl = env->vl;                                \
2996     uint32_t i;                                           \
2997                                                           \
2998     for (i = env->vstart; i < vl; i++) {                  \
2999         if (!vm && !vext_elem_mask(v0, i)) {              \
3000             continue;                                     \
3001         }                                                 \
3002         do_##NAME(vd, vs1, vs2, i, env);                  \
3003     }                                                     \
3004     env->vstart = 0;                                      \
3005 }
3006 
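/*
 * For reference, a sketch of what one of the generator lines below
 * expands to, assuming RVVCALL(macro, ...) simply invokes macro(...) and
 * OP_UUU_H carries the all-uint16_t (TD, T1, T2, TX1, TX2) tuple defined
 * earlier in this file:
 *
 *     RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
 *
 *     static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                               CPURISCVState *env)
 *     {
 *         uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *         *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *     }
 */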
3007 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3008 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3009 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3010 GEN_VEXT_VV_ENV(vfadd_vv_h)
3011 GEN_VEXT_VV_ENV(vfadd_vv_w)
3012 GEN_VEXT_VV_ENV(vfadd_vv_d)
3013 
3014 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3015 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3016                       CPURISCVState *env)                      \
3017 {                                                              \
3018     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3019     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3020 }
3021 
3022 #define GEN_VEXT_VF(NAME)                                 \
3023 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3024                   void *vs2, CPURISCVState *env,          \
3025                   uint32_t desc)                          \
3026 {                                                         \
3027     uint32_t vm = vext_vm(desc);                          \
3028     uint32_t vl = env->vl;                                \
3029     uint32_t i;                                           \
3030                                                           \
3031     for (i = env->vstart; i < vl; i++) {                  \
3032         if (!vm && !vext_elem_mask(v0, i)) {              \
3033             continue;                                     \
3034         }                                                 \
3035         do_##NAME(vd, s1, vs2, i, env);                   \
3036     }                                                     \
3037     env->vstart = 0;                                      \
3038 }
3039 
3040 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3041 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3042 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3043 GEN_VEXT_VF(vfadd_vf_h)
3044 GEN_VEXT_VF(vfadd_vf_w)
3045 GEN_VEXT_VF(vfadd_vf_d)
3046 
3047 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3048 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3049 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3050 GEN_VEXT_VV_ENV(vfsub_vv_h)
3051 GEN_VEXT_VV_ENV(vfsub_vv_w)
3052 GEN_VEXT_VV_ENV(vfsub_vv_d)
3053 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3054 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3055 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3056 GEN_VEXT_VF(vfsub_vf_h)
3057 GEN_VEXT_VF(vfsub_vf_w)
3058 GEN_VEXT_VF(vfsub_vf_d)
3059 
3060 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3061 {
3062     return float16_sub(b, a, s);
3063 }
3064 
3065 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3066 {
3067     return float32_sub(b, a, s);
3068 }
3069 
3070 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3071 {
3072     return float64_sub(b, a, s);
3073 }
3074 
3075 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3076 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3077 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3078 GEN_VEXT_VF(vfrsub_vf_h)
3079 GEN_VEXT_VF(vfrsub_vf_w)
3080 GEN_VEXT_VF(vfrsub_vf_d)
3081 
3082 /* Vector Widening Floating-Point Add/Subtract Instructions */
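/*
 * The widening helpers convert both SEW operands up to 2*SEW (the 'true'
 * passed to float16_to_float32() selects the IEEE half-precision format)
 * and perform the operation once at the wider precision, so only a single
 * rounding occurs.
 */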
3083 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3084 {
3085     return float32_add(float16_to_float32(a, true, s),
3086             float16_to_float32(b, true, s), s);
3087 }
3088 
3089 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3090 {
3091     return float64_add(float32_to_float64(a, s),
3092             float32_to_float64(b, s), s);
3093 
3094 }
3095 
3096 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3097 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3098 GEN_VEXT_VV_ENV(vfwadd_vv_h)
3099 GEN_VEXT_VV_ENV(vfwadd_vv_w)
3100 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3101 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3102 GEN_VEXT_VF(vfwadd_vf_h)
3103 GEN_VEXT_VF(vfwadd_vf_w)
3104 
3105 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3106 {
3107     return float32_sub(float16_to_float32(a, true, s),
3108             float16_to_float32(b, true, s), s);
3109 }
3110 
3111 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3112 {
3113     return float64_sub(float32_to_float64(a, s),
3114             float32_to_float64(b, s), s);
3115 
3116 }
3117 
3118 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3119 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3120 GEN_VEXT_VV_ENV(vfwsub_vv_h)
3121 GEN_VEXT_VV_ENV(vfwsub_vv_w)
3122 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3123 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3124 GEN_VEXT_VF(vfwsub_vf_h)
3125 GEN_VEXT_VF(vfwsub_vf_w)
3126 
3127 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3128 {
3129     return float32_add(a, float16_to_float32(b, true, s), s);
3130 }
3131 
3132 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3133 {
3134     return float64_add(a, float32_to_float64(b, s), s);
3135 }
3136 
3137 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3138 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3139 GEN_VEXT_VV_ENV(vfwadd_wv_h)
3140 GEN_VEXT_VV_ENV(vfwadd_wv_w)
3141 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3142 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3143 GEN_VEXT_VF(vfwadd_wf_h)
3144 GEN_VEXT_VF(vfwadd_wf_w)
3145 
3146 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3147 {
3148     return float32_sub(a, float16_to_float32(b, true, s), s);
3149 }
3150 
3151 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3152 {
3153     return float64_sub(a, float32_to_float64(b, s), s);
3154 }
3155 
3156 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3157 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3158 GEN_VEXT_VV_ENV(vfwsub_wv_h)
3159 GEN_VEXT_VV_ENV(vfwsub_wv_w)
3160 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3161 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3162 GEN_VEXT_VF(vfwsub_wf_h)
3163 GEN_VEXT_VF(vfwsub_wf_w)
3164 
3165 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3166 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3167 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3168 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3169 GEN_VEXT_VV_ENV(vfmul_vv_h)
3170 GEN_VEXT_VV_ENV(vfmul_vv_w)
3171 GEN_VEXT_VV_ENV(vfmul_vv_d)
3172 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3173 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3174 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3175 GEN_VEXT_VF(vfmul_vf_h)
3176 GEN_VEXT_VF(vfmul_vf_w)
3177 GEN_VEXT_VF(vfmul_vf_d)
3178 
3179 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3180 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3181 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3182 GEN_VEXT_VV_ENV(vfdiv_vv_h)
3183 GEN_VEXT_VV_ENV(vfdiv_vv_w)
3184 GEN_VEXT_VV_ENV(vfdiv_vv_d)
3185 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3186 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3187 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3188 GEN_VEXT_VF(vfdiv_vf_h)
3189 GEN_VEXT_VF(vfdiv_vf_w)
3190 GEN_VEXT_VF(vfdiv_vf_d)
3191 
3192 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3193 {
3194     return float16_div(b, a, s);
3195 }
3196 
3197 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3198 {
3199     return float32_div(b, a, s);
3200 }
3201 
3202 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3203 {
3204     return float64_div(b, a, s);
3205 }
3206 
3207 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3208 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3209 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3210 GEN_VEXT_VF(vfrdiv_vf_h)
3211 GEN_VEXT_VF(vfrdiv_vf_w)
3212 GEN_VEXT_VF(vfrdiv_vf_d)
3213 
3214 /* Vector Widening Floating-Point Multiply */
3215 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3216 {
3217     return float32_mul(float16_to_float32(a, true, s),
3218             float16_to_float32(b, true, s), s);
3219 }
3220 
3221 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3222 {
3223     return float64_mul(float32_to_float64(a, s),
3224             float32_to_float64(b, s), s);
3225 
3226 }
3227 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3228 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3229 GEN_VEXT_VV_ENV(vfwmul_vv_h)
3230 GEN_VEXT_VV_ENV(vfwmul_vv_w)
3231 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3232 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3233 GEN_VEXT_VF(vfwmul_vf_h)
3234 GEN_VEXT_VF(vfwmul_vf_w)
3235 
3236 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3237 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3238 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3239         CPURISCVState *env)                                        \
3240 {                                                                  \
3241     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3242     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3243     TD d = *((TD *)vd + HD(i));                                    \
3244     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3245 }
3246 
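/*
 * The multiply-add callbacks receive (s2, s1, d) from OPFVV3 above (and
 * from OPFVF3 below), so the RVV operand orderings map onto the
 * float*_muladd() calls as follows (summarizing the spec; illustrative):
 *
 *     vfmacc:  vd = +(vs1 * vs2) + vd   ->  muladd(a, b, d, 0)
 *     vfnmacc: vd = -(vs1 * vs2) - vd   ->  negate_product | negate_c
 *     vfmsac:  vd = +(vs1 * vs2) - vd   ->  negate_c
 *     vfnmsac: vd = -(vs1 * vs2) + vd   ->  negate_product
 *     vfmadd:  vd = +(vs1 * vd) + vs2   ->  muladd(d, b, a, 0)
 *     vfmsub:  vd = +(vs1 * vd) - vs2   ->  muladd(d, b, a, negate_c)
 */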
3247 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3248 {
3249     return float16_muladd(a, b, d, 0, s);
3250 }
3251 
3252 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3253 {
3254     return float32_muladd(a, b, d, 0, s);
3255 }
3256 
3257 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3258 {
3259     return float64_muladd(a, b, d, 0, s);
3260 }
3261 
3262 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3263 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3264 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3265 GEN_VEXT_VV_ENV(vfmacc_vv_h)
3266 GEN_VEXT_VV_ENV(vfmacc_vv_w)
3267 GEN_VEXT_VV_ENV(vfmacc_vv_d)
3268 
3269 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3270 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3271         CPURISCVState *env)                                       \
3272 {                                                                 \
3273     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3274     TD d = *((TD *)vd + HD(i));                                   \
3275     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3276 }
3277 
3278 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3279 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3280 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3281 GEN_VEXT_VF(vfmacc_vf_h)
3282 GEN_VEXT_VF(vfmacc_vf_w)
3283 GEN_VEXT_VF(vfmacc_vf_d)
3284 
3285 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3286 {
3287     return float16_muladd(a, b, d,
3288             float_muladd_negate_c | float_muladd_negate_product, s);
3289 }
3290 
3291 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3292 {
3293     return float32_muladd(a, b, d,
3294             float_muladd_negate_c | float_muladd_negate_product, s);
3295 }
3296 
3297 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3298 {
3299     return float64_muladd(a, b, d,
3300             float_muladd_negate_c | float_muladd_negate_product, s);
3301 }
3302 
3303 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3304 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3305 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3306 GEN_VEXT_VV_ENV(vfnmacc_vv_h)
3307 GEN_VEXT_VV_ENV(vfnmacc_vv_w)
3308 GEN_VEXT_VV_ENV(vfnmacc_vv_d)
3309 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3310 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3311 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3312 GEN_VEXT_VF(vfnmacc_vf_h)
3313 GEN_VEXT_VF(vfnmacc_vf_w)
3314 GEN_VEXT_VF(vfnmacc_vf_d)
3315 
3316 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3317 {
3318     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3319 }
3320 
3321 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3322 {
3323     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3324 }
3325 
3326 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3327 {
3328     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3329 }
3330 
3331 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3332 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3333 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3334 GEN_VEXT_VV_ENV(vfmsac_vv_h)
3335 GEN_VEXT_VV_ENV(vfmsac_vv_w)
3336 GEN_VEXT_VV_ENV(vfmsac_vv_d)
3337 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3338 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3339 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3340 GEN_VEXT_VF(vfmsac_vf_h)
3341 GEN_VEXT_VF(vfmsac_vf_w)
3342 GEN_VEXT_VF(vfmsac_vf_d)
3343 
3344 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3345 {
3346     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3347 }
3348 
3349 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3350 {
3351     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3352 }
3353 
3354 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3355 {
3356     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3357 }
3358 
3359 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3360 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3361 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3362 GEN_VEXT_VV_ENV(vfnmsac_vv_h)
3363 GEN_VEXT_VV_ENV(vfnmsac_vv_w)
3364 GEN_VEXT_VV_ENV(vfnmsac_vv_d)
3365 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3366 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3367 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3368 GEN_VEXT_VF(vfnmsac_vf_h)
3369 GEN_VEXT_VF(vfnmsac_vf_w)
3370 GEN_VEXT_VF(vfnmsac_vf_d)
3371 
3372 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3373 {
3374     return float16_muladd(d, b, a, 0, s);
3375 }
3376 
3377 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3378 {
3379     return float32_muladd(d, b, a, 0, s);
3380 }
3381 
3382 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3383 {
3384     return float64_muladd(d, b, a, 0, s);
3385 }
3386 
3387 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3388 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3389 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3390 GEN_VEXT_VV_ENV(vfmadd_vv_h)
3391 GEN_VEXT_VV_ENV(vfmadd_vv_w)
3392 GEN_VEXT_VV_ENV(vfmadd_vv_d)
3393 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3394 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3395 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3396 GEN_VEXT_VF(vfmadd_vf_h)
3397 GEN_VEXT_VF(vfmadd_vf_w)
3398 GEN_VEXT_VF(vfmadd_vf_d)
3399 
3400 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3401 {
3402     return float16_muladd(d, b, a,
3403             float_muladd_negate_c | float_muladd_negate_product, s);
3404 }
3405 
3406 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3407 {
3408     return float32_muladd(d, b, a,
3409             float_muladd_negate_c | float_muladd_negate_product, s);
3410 }
3411 
3412 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3413 {
3414     return float64_muladd(d, b, a,
3415             float_muladd_negate_c | float_muladd_negate_product, s);
3416 }
3417 
3418 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3419 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3420 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3421 GEN_VEXT_VV_ENV(vfnmadd_vv_h)
3422 GEN_VEXT_VV_ENV(vfnmadd_vv_w)
3423 GEN_VEXT_VV_ENV(vfnmadd_vv_d)
3424 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3425 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3426 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3427 GEN_VEXT_VF(vfnmadd_vf_h)
3428 GEN_VEXT_VF(vfnmadd_vf_w)
3429 GEN_VEXT_VF(vfnmadd_vf_d)
3430 
3431 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3432 {
3433     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3434 }
3435 
3436 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3437 {
3438     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3439 }
3440 
3441 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3442 {
3443     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3444 }
3445 
3446 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3447 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3448 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3449 GEN_VEXT_VV_ENV(vfmsub_vv_h)
3450 GEN_VEXT_VV_ENV(vfmsub_vv_w)
3451 GEN_VEXT_VV_ENV(vfmsub_vv_d)
3452 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3453 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3454 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3455 GEN_VEXT_VF(vfmsub_vf_h)
3456 GEN_VEXT_VF(vfmsub_vf_w)
3457 GEN_VEXT_VF(vfmsub_vf_d)
3458 
3459 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3460 {
3461     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3462 }
3463 
3464 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3465 {
3466     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3467 }
3468 
3469 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3470 {
3471     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3472 }
3473 
3474 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3475 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3476 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3477 GEN_VEXT_VV_ENV(vfnmsub_vv_h)
3478 GEN_VEXT_VV_ENV(vfnmsub_vv_w)
3479 GEN_VEXT_VV_ENV(vfnmsub_vv_d)
3480 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3481 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3482 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3483 GEN_VEXT_VF(vfnmsub_vf_h)
3484 GEN_VEXT_VF(vfnmsub_vf_w)
3485 GEN_VEXT_VF(vfnmsub_vf_d)
3486 
3487 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
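/*
 * As with the widening add/subtract helpers, the SEW operands are first
 * converted exactly to 2*SEW and then fed to a single fused multiply-add
 * at the wider precision, so the result is rounded only once.
 */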
3488 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3489 {
3490     return float32_muladd(float16_to_float32(a, true, s),
3491                         float16_to_float32(b, true, s), d, 0, s);
3492 }
3493 
3494 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3495 {
3496     return float64_muladd(float32_to_float64(a, s),
3497                         float32_to_float64(b, s), d, 0, s);
3498 }
3499 
3500 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3501 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3502 GEN_VEXT_VV_ENV(vfwmacc_vv_h)
3503 GEN_VEXT_VV_ENV(vfwmacc_vv_w)
3504 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3505 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3506 GEN_VEXT_VF(vfwmacc_vf_h)
3507 GEN_VEXT_VF(vfwmacc_vf_w)
3508 
3509 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3510 {
3511     return float32_muladd(float16_to_float32(a, true, s),
3512                         float16_to_float32(b, true, s), d,
3513                         float_muladd_negate_c | float_muladd_negate_product, s);
3514 }
3515 
3516 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3517 {
3518     return float64_muladd(float32_to_float64(a, s),
3519                         float32_to_float64(b, s), d,
3520                         float_muladd_negate_c | float_muladd_negate_product, s);
3521 }
3522 
3523 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3524 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3525 GEN_VEXT_VV_ENV(vfwnmacc_vv_h)
3526 GEN_VEXT_VV_ENV(vfwnmacc_vv_w)
3527 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3528 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3529 GEN_VEXT_VF(vfwnmacc_vf_h)
3530 GEN_VEXT_VF(vfwnmacc_vf_w)
3531 
3532 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3533 {
3534     return float32_muladd(float16_to_float32(a, true, s),
3535                         float16_to_float32(b, true, s), d,
3536                         float_muladd_negate_c, s);
3537 }
3538 
3539 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3540 {
3541     return float64_muladd(float32_to_float64(a, s),
3542                         float32_to_float64(b, s), d,
3543                         float_muladd_negate_c, s);
3544 }
3545 
3546 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3547 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3548 GEN_VEXT_VV_ENV(vfwmsac_vv_h)
3549 GEN_VEXT_VV_ENV(vfwmsac_vv_w)
3550 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3551 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3552 GEN_VEXT_VF(vfwmsac_vf_h)
3553 GEN_VEXT_VF(vfwmsac_vf_w)
3554 
3555 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3556 {
3557     return float32_muladd(float16_to_float32(a, true, s),
3558                         float16_to_float32(b, true, s), d,
3559                         float_muladd_negate_product, s);
3560 }
3561 
3562 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3563 {
3564     return float64_muladd(float32_to_float64(a, s),
3565                         float32_to_float64(b, s), d,
3566                         float_muladd_negate_product, s);
3567 }
3568 
3569 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3570 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3571 GEN_VEXT_VV_ENV(vfwnmsac_vv_h)
3572 GEN_VEXT_VV_ENV(vfwnmsac_vv_w)
3573 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3574 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3575 GEN_VEXT_VF(vfwnmsac_vf_h)
3576 GEN_VEXT_VF(vfwnmsac_vf_w)
3577 
3578 /* Vector Floating-Point Square-Root Instruction */
3579 /* (TD, T2, TX2) */
3580 #define OP_UU_H uint16_t, uint16_t, uint16_t
3581 #define OP_UU_W uint32_t, uint32_t, uint32_t
3582 #define OP_UU_D uint64_t, uint64_t, uint64_t
3583 
3584 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
3585 static void do_##NAME(void *vd, void *vs2, int i,      \
3586         CPURISCVState *env)                            \
3587 {                                                      \
3588     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3589     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3590 }
3591 
3592 #define GEN_VEXT_V_ENV(NAME)                           \
3593 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3594         CPURISCVState *env, uint32_t desc)             \
3595 {                                                      \
3596     uint32_t vm = vext_vm(desc);                       \
3597     uint32_t vl = env->vl;                             \
3598     uint32_t i;                                        \
3599                                                        \
3600     if (vl == 0) {                                     \
3601         return;                                        \
3602     }                                                  \
3603     for (i = env->vstart; i < vl; i++) {               \
3604         if (!vm && !vext_elem_mask(v0, i)) {           \
3605             continue;                                  \
3606         }                                              \
3607         do_##NAME(vd, vs2, i, env);                    \
3608     }                                                  \
3609     env->vstart = 0;                                   \
3610 }
3611 
3612 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3613 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3614 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3615 GEN_VEXT_V_ENV(vfsqrt_v_h)
3616 GEN_VEXT_V_ENV(vfsqrt_v_w)
3617 GEN_VEXT_V_ENV(vfsqrt_v_d)
3618 
3619 /*
3620  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3621  *
3622  * Adapted from riscv-v-spec recip.c:
3623  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3624  */
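/*
 * The estimate below indexes the table with the low bit of the (possibly
 * pre-normalized) exponent concatenated with the top 6 fraction bits, and
 * computes the output exponent as (3 * BIAS - 1 - exp) / 2, relying on
 * unsigned wrap-around of ~exp.  Illustrative example for float32:
 *
 *     f = 4.0f            (sign = 0, exp = 129, frac = 0)
 *     idx      = ((129 & 1) << 6) | 0 = 64
 *     out_frac = lookup_table[64] << 16 = 127 << 16
 *     out_exp  = (3 * 127 + ~129) / 2 = 125
 *     result   = 2^(125-127) * (1 + 127/128) = 0.498046875 ~= 1/sqrt(4)
 */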
3625 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3626 {
3627     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3628     uint64_t exp = extract64(f, frac_size, exp_size);
3629     uint64_t frac = extract64(f, 0, frac_size);
3630 
3631     const uint8_t lookup_table[] = {
3632         52, 51, 50, 48, 47, 46, 44, 43,
3633         42, 41, 40, 39, 38, 36, 35, 34,
3634         33, 32, 31, 30, 30, 29, 28, 27,
3635         26, 25, 24, 23, 23, 22, 21, 20,
3636         19, 19, 18, 17, 16, 16, 15, 14,
3637         14, 13, 12, 12, 11, 10, 10, 9,
3638         9, 8, 7, 7, 6, 6, 5, 4,
3639         4, 3, 3, 2, 2, 1, 1, 0,
3640         127, 125, 123, 121, 119, 118, 116, 114,
3641         113, 111, 109, 108, 106, 105, 103, 102,
3642         100, 99, 97, 96, 95, 93, 92, 91,
3643         90, 88, 87, 86, 85, 84, 83, 82,
3644         80, 79, 78, 77, 76, 75, 74, 73,
3645         72, 71, 70, 70, 69, 68, 67, 66,
3646         65, 64, 63, 63, 62, 61, 60, 59,
3647         59, 58, 57, 56, 56, 55, 54, 53
3648     };
3649     const int precision = 7;
3650 
3651     if (exp == 0 && frac != 0) { /* subnormal */
3652         /* Normalize the subnormal. */
3653         while (extract64(frac, frac_size - 1, 1) == 0) {
3654             exp--;
3655             frac <<= 1;
3656         }
3657 
3658         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3659     }
3660 
3661     int idx = ((exp & 1) << (precision - 1)) |
3662                 (frac >> (frac_size - precision + 1));
3663     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3664                             (frac_size - precision);
3665     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3666 
3667     uint64_t val = 0;
3668     val = deposit64(val, 0, frac_size, out_frac);
3669     val = deposit64(val, frac_size, exp_size, out_exp);
3670     val = deposit64(val, frac_size + exp_size, 1, sign);
3671     return val;
3672 }
3673 
3674 static float16 frsqrt7_h(float16 f, float_status *s)
3675 {
3676     int exp_size = 5, frac_size = 10;
3677     bool sign = float16_is_neg(f);
3678 
3679     /*
3680      * frsqrt7(sNaN) = canonical NaN
3681      * frsqrt7(-inf) = canonical NaN
3682      * frsqrt7(-normal) = canonical NaN
3683      * frsqrt7(-subnormal) = canonical NaN
3684      */
3685     if (float16_is_signaling_nan(f, s) ||
3686             (float16_is_infinity(f) && sign) ||
3687             (float16_is_normal(f) && sign) ||
3688             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3689         s->float_exception_flags |= float_flag_invalid;
3690         return float16_default_nan(s);
3691     }
3692 
3693     /* frsqrt7(qNaN) = canonical NaN */
3694     if (float16_is_quiet_nan(f, s)) {
3695         return float16_default_nan(s);
3696     }
3697 
3698     /* frsqrt7(+-0) = +-inf */
3699     if (float16_is_zero(f)) {
3700         s->float_exception_flags |= float_flag_divbyzero;
3701         return float16_set_sign(float16_infinity, sign);
3702     }
3703 
3704     /* frsqrt7(+inf) = +0 */
3705     if (float16_is_infinity(f) && !sign) {
3706         return float16_set_sign(float16_zero, sign);
3707     }
3708 
3709     /* +normal, +subnormal */
3710     uint64_t val = frsqrt7(f, exp_size, frac_size);
3711     return make_float16(val);
3712 }
3713 
3714 static float32 frsqrt7_s(float32 f, float_status *s)
3715 {
3716     int exp_size = 8, frac_size = 23;
3717     bool sign = float32_is_neg(f);
3718 
3719     /*
3720      * frsqrt7(sNaN) = canonical NaN
3721      * frsqrt7(-inf) = canonical NaN
3722      * frsqrt7(-normal) = canonical NaN
3723      * frsqrt7(-subnormal) = canonical NaN
3724      */
3725     if (float32_is_signaling_nan(f, s) ||
3726             (float32_is_infinity(f) && sign) ||
3727             (float32_is_normal(f) && sign) ||
3728             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3729         s->float_exception_flags |= float_flag_invalid;
3730         return float32_default_nan(s);
3731     }
3732 
3733     /* frsqrt7(qNaN) = canonical NaN */
3734     if (float32_is_quiet_nan(f, s)) {
3735         return float32_default_nan(s);
3736     }
3737 
3738     /* frsqrt7(+-0) = +-inf */
3739     if (float32_is_zero(f)) {
3740         s->float_exception_flags |= float_flag_divbyzero;
3741         return float32_set_sign(float32_infinity, sign);
3742     }
3743 
3744     /* frsqrt7(+inf) = +0 */
3745     if (float32_is_infinity(f) && !sign) {
3746         return float32_set_sign(float32_zero, sign);
3747     }
3748 
3749     /* +normal, +subnormal */
3750     uint64_t val = frsqrt7(f, exp_size, frac_size);
3751     return make_float32(val);
3752 }
3753 
3754 static float64 frsqrt7_d(float64 f, float_status *s)
3755 {
3756     int exp_size = 11, frac_size = 52;
3757     bool sign = float64_is_neg(f);
3758 
3759     /*
3760      * frsqrt7(sNaN) = canonical NaN
3761      * frsqrt7(-inf) = canonical NaN
3762      * frsqrt7(-normal) = canonical NaN
3763      * frsqrt7(-subnormal) = canonical NaN
3764      */
3765     if (float64_is_signaling_nan(f, s) ||
3766             (float64_is_infinity(f) && sign) ||
3767             (float64_is_normal(f) && sign) ||
3768             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3769         s->float_exception_flags |= float_flag_invalid;
3770         return float64_default_nan(s);
3771     }
3772 
3773     /* frsqrt7(qNaN) = canonical NaN */
3774     if (float64_is_quiet_nan(f, s)) {
3775         return float64_default_nan(s);
3776     }
3777 
3778     /* frsqrt7(+-0) = +-inf */
3779     if (float64_is_zero(f)) {
3780         s->float_exception_flags |= float_flag_divbyzero;
3781         return float64_set_sign(float64_infinity, sign);
3782     }
3783 
3784     /* frsqrt7(+inf) = +0 */
3785     if (float64_is_infinity(f) && !sign) {
3786         return float64_set_sign(float64_zero, sign);
3787     }
3788 
3789     /* +normal, +subnormal */
3790     uint64_t val = frsqrt7(f, exp_size, frac_size);
3791     return make_float64(val);
3792 }
3793 
3794 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3795 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3796 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3797 GEN_VEXT_V_ENV(vfrsqrt7_v_h)
3798 GEN_VEXT_V_ENV(vfrsqrt7_v_w)
3799 GEN_VEXT_V_ENV(vfrsqrt7_v_d)
3800 
3801 /*
3802  * Vector Floating-Point Reciprocal Estimate Instruction
3803  *
3804  * Adapted from riscv-v-spec recip.c:
3805  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3806  */
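/*
 * The estimate below indexes the table with the top 7 fraction bits and
 * computes the output exponent as 2 * BIAS - 1 - exp (again via unsigned
 * wrap-around of ~exp), with extra paths for inputs whose reciprocal
 * overflows or becomes subnormal.  Illustrative example for float32:
 *
 *     f = 4.0f            (sign = 0, exp = 129, frac = 0)
 *     idx      = 0
 *     out_frac = lookup_table[0] << 16 = 127 << 16
 *     out_exp  = 2 * 127 + ~129 = 124
 *     result   = 2^(124-127) * (1 + 127/128) = 0.2490234375 ~= 1/4
 */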
3807 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3808                       float_status *s)
3809 {
3810     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3811     uint64_t exp = extract64(f, frac_size, exp_size);
3812     uint64_t frac = extract64(f, 0, frac_size);
3813 
3814     const uint8_t lookup_table[] = {
3815         127, 125, 123, 121, 119, 117, 116, 114,
3816         112, 110, 109, 107, 105, 104, 102, 100,
3817         99, 97, 96, 94, 93, 91, 90, 88,
3818         87, 85, 84, 83, 81, 80, 79, 77,
3819         76, 75, 74, 72, 71, 70, 69, 68,
3820         66, 65, 64, 63, 62, 61, 60, 59,
3821         58, 57, 56, 55, 54, 53, 52, 51,
3822         50, 49, 48, 47, 46, 45, 44, 43,
3823         42, 41, 40, 40, 39, 38, 37, 36,
3824         35, 35, 34, 33, 32, 31, 31, 30,
3825         29, 28, 28, 27, 26, 25, 25, 24,
3826         23, 23, 22, 21, 21, 20, 19, 19,
3827         18, 17, 17, 16, 15, 15, 14, 14,
3828         13, 12, 12, 11, 11, 10, 9, 9,
3829         8, 8, 7, 7, 6, 5, 5, 4,
3830         4, 3, 3, 2, 2, 1, 1, 0
3831     };
3832     const int precision = 7;
3833 
3834     if (exp == 0 && frac != 0) { /* subnormal */
3835         /* Normalize the subnormal. */
3836         while (extract64(frac, frac_size - 1, 1) == 0) {
3837             exp--;
3838             frac <<= 1;
3839         }
3840 
3841         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3842 
3843         if (exp != 0 && exp != UINT64_MAX) {
3844             /*
3845              * Overflow to inf or max value of same sign,
3846              * depending on sign and rounding mode.
3847              */
3848             s->float_exception_flags |= (float_flag_inexact |
3849                                          float_flag_overflow);
3850 
3851             if ((s->float_rounding_mode == float_round_to_zero) ||
3852                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3853                 ((s->float_rounding_mode == float_round_up) && sign)) {
3854                 /* Return greatest/negative finite value. */
3855                 /* Return the greatest-magnitude finite value of the same sign. */
3856                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3857             } else {
3858                 /* Return +-inf. */
3859                 return (sign << (exp_size + frac_size)) |
3860                     MAKE_64BIT_MASK(frac_size, exp_size);
3861             }
3862         }
3863     }
3864 
3865     int idx = frac >> (frac_size - precision);
3866     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3867                             (frac_size - precision);
3868     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3869 
3870     if (out_exp == 0 || out_exp == UINT64_MAX) {
3871         /*
3872          * The result is subnormal, but don't raise the underflow exception,
3873          * because there's no additional loss of precision.
3874          */
3875         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3876         if (out_exp == UINT64_MAX) {
3877             out_frac >>= 1;
3878             out_exp = 0;
3879         }
3880     }
3881 
3882     uint64_t val = 0;
3883     val = deposit64(val, 0, frac_size, out_frac);
3884     val = deposit64(val, frac_size, exp_size, out_exp);
3885     val = deposit64(val, frac_size + exp_size, 1, sign);
3886     return val;
3887 }
3888 
3889 static float16 frec7_h(float16 f, float_status *s)
3890 {
3891     int exp_size = 5, frac_size = 10;
3892     bool sign = float16_is_neg(f);
3893 
3894     /* frec7(+-inf) = +-0 */
3895     if (float16_is_infinity(f)) {
3896         return float16_set_sign(float16_zero, sign);
3897     }
3898 
3899     /* frec7(+-0) = +-inf */
3900     if (float16_is_zero(f)) {
3901         s->float_exception_flags |= float_flag_divbyzero;
3902         return float16_set_sign(float16_infinity, sign);
3903     }
3904 
3905     /* frec7(sNaN) = canonical NaN */
3906     if (float16_is_signaling_nan(f, s)) {
3907         s->float_exception_flags |= float_flag_invalid;
3908         return float16_default_nan(s);
3909     }
3910 
3911     /* frec7(qNaN) = canonical NaN */
3912     if (float16_is_quiet_nan(f, s)) {
3913         return float16_default_nan(s);
3914     }
3915 
3916     /* +-normal, +-subnormal */
3917     uint64_t val = frec7(f, exp_size, frac_size, s);
3918     return make_float16(val);
3919 }
3920 
3921 static float32 frec7_s(float32 f, float_status *s)
3922 {
3923     int exp_size = 8, frac_size = 23;
3924     bool sign = float32_is_neg(f);
3925 
3926     /* frec7(+-inf) = +-0 */
3927     if (float32_is_infinity(f)) {
3928         return float32_set_sign(float32_zero, sign);
3929     }
3930 
3931     /* frec7(+-0) = +-inf */
3932     if (float32_is_zero(f)) {
3933         s->float_exception_flags |= float_flag_divbyzero;
3934         return float32_set_sign(float32_infinity, sign);
3935     }
3936 
3937     /* frec7(sNaN) = canonical NaN */
3938     if (float32_is_signaling_nan(f, s)) {
3939         s->float_exception_flags |= float_flag_invalid;
3940         return float32_default_nan(s);
3941     }
3942 
3943     /* frec7(qNaN) = canonical NaN */
3944     if (float32_is_quiet_nan(f, s)) {
3945         return float32_default_nan(s);
3946     }
3947 
3948     /* +-normal, +-subnormal */
3949     uint64_t val = frec7(f, exp_size, frac_size, s);
3950     return make_float32(val);
3951 }
3952 
3953 static float64 frec7_d(float64 f, float_status *s)
3954 {
3955     int exp_size = 11, frac_size = 52;
3956     bool sign = float64_is_neg(f);
3957 
3958     /* frec7(+-inf) = +-0 */
3959     if (float64_is_infinity(f)) {
3960         return float64_set_sign(float64_zero, sign);
3961     }
3962 
3963     /* frec7(+-0) = +-inf */
3964     if (float64_is_zero(f)) {
3965         s->float_exception_flags |= float_flag_divbyzero;
3966         return float64_set_sign(float64_infinity, sign);
3967     }
3968 
3969     /* frec7(sNaN) = canonical NaN */
3970     if (float64_is_signaling_nan(f, s)) {
3971         s->float_exception_flags |= float_flag_invalid;
3972         return float64_default_nan(s);
3973     }
3974 
3975     /* frec7(qNaN) = canonical NaN */
3976     if (float64_is_quiet_nan(f, s)) {
3977         return float64_default_nan(s);
3978     }
3979 
3980     /* +-normal, +-subnormal */
3981     uint64_t val = frec7(f, exp_size, frac_size, s);
3982     return make_float64(val);
3983 }
3984 
3985 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
3986 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
3987 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
3988 GEN_VEXT_V_ENV(vfrec7_v_h)
3989 GEN_VEXT_V_ENV(vfrec7_v_w)
3990 GEN_VEXT_V_ENV(vfrec7_v_d)
3991 
3992 /* Vector Floating-Point MIN/MAX Instructions */
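/*
 * float16/32/64_minimum_number() and _maximum_number() follow the IEEE
 * 754-2019 minimumNumber/maximumNumber operations: when exactly one
 * operand is a NaN, the numerical operand is returned, which is the
 * behaviour vfmin/vfmax require.
 */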
3993 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
3994 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
3995 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
3996 GEN_VEXT_VV_ENV(vfmin_vv_h)
3997 GEN_VEXT_VV_ENV(vfmin_vv_w)
3998 GEN_VEXT_VV_ENV(vfmin_vv_d)
3999 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4000 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4001 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4002 GEN_VEXT_VF(vfmin_vf_h)
4003 GEN_VEXT_VF(vfmin_vf_w)
4004 GEN_VEXT_VF(vfmin_vf_d)
4005 
4006 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4007 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4008 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4009 GEN_VEXT_VV_ENV(vfmax_vv_h)
4010 GEN_VEXT_VV_ENV(vfmax_vv_w)
4011 GEN_VEXT_VV_ENV(vfmax_vv_d)
4012 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4013 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4014 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4015 GEN_VEXT_VF(vfmax_vf_h)
4016 GEN_VEXT_VF(vfmax_vf_w)
4017 GEN_VEXT_VF(vfmax_vf_d)
4018 
4019 /* Vector Floating-Point Sign-Injection Instructions */
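/*
 * The sign-injection helpers build the result from the magnitude (low
 * SEW-1 bits) of operand 'a' and a sign derived from operand 'b': taken
 * as-is (vfsgnj), inverted (vfsgnjn) or XORed with a's sign (vfsgnjx).
 * Illustrative example:
 *
 *     fsgnj16(0x3c00, 0x8000) = 0xbc00
 *     (+1.0 given the sign of -0.0 yields -1.0)
 */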
4020 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4021 {
4022     return deposit64(b, 0, 15, a);
4023 }
4024 
4025 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4026 {
4027     return deposit64(b, 0, 31, a);
4028 }
4029 
4030 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4031 {
4032     return deposit64(b, 0, 63, a);
4033 }
4034 
4035 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4036 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4037 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4038 GEN_VEXT_VV_ENV(vfsgnj_vv_h)
4039 GEN_VEXT_VV_ENV(vfsgnj_vv_w)
4040 GEN_VEXT_VV_ENV(vfsgnj_vv_d)
4041 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4042 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4043 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4044 GEN_VEXT_VF(vfsgnj_vf_h)
4045 GEN_VEXT_VF(vfsgnj_vf_w)
4046 GEN_VEXT_VF(vfsgnj_vf_d)
4047 
4048 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4049 {
4050     return deposit64(~b, 0, 15, a);
4051 }
4052 
4053 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4054 {
4055     return deposit64(~b, 0, 31, a);
4056 }
4057 
4058 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4059 {
4060     return deposit64(~b, 0, 63, a);
4061 }
4062 
4063 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4064 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4065 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4066 GEN_VEXT_VV_ENV(vfsgnjn_vv_h)
4067 GEN_VEXT_VV_ENV(vfsgnjn_vv_w)
4068 GEN_VEXT_VV_ENV(vfsgnjn_vv_d)
4069 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4070 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4071 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4072 GEN_VEXT_VF(vfsgnjn_vf_h)
4073 GEN_VEXT_VF(vfsgnjn_vf_w)
4074 GEN_VEXT_VF(vfsgnjn_vf_d)
4075 
4076 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4077 {
4078     return deposit64(b ^ a, 0, 15, a);
4079 }
4080 
4081 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4082 {
4083     return deposit64(b ^ a, 0, 31, a);
4084 }
4085 
4086 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4087 {
4088     return deposit64(b ^ a, 0, 63, a);
4089 }
4090 
4091 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4092 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4093 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4094 GEN_VEXT_VV_ENV(vfsgnjx_vv_h)
4095 GEN_VEXT_VV_ENV(vfsgnjx_vv_w)
4096 GEN_VEXT_VV_ENV(vfsgnjx_vv_d)
4097 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4098 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4099 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4100 GEN_VEXT_VF(vfsgnjx_vf_h)
4101 GEN_VEXT_VF(vfsgnjx_vf_w)
4102 GEN_VEXT_VF(vfsgnjx_vf_d)
4103 
4104 /* Vector Floating-Point Compare Instructions */
4105 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4106 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4107                   CPURISCVState *env, uint32_t desc)          \
4108 {                                                             \
4109     uint32_t vm = vext_vm(desc);                              \
4110     uint32_t vl = env->vl;                                    \
4111     uint32_t i;                                               \
4112                                                               \
4113     for (i = env->vstart; i < vl; i++) {                      \
4114         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4115         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4116         if (!vm && !vext_elem_mask(v0, i)) {                  \
4117             continue;                                         \
4118         }                                                     \
4119         vext_set_elem_mask(vd, i,                             \
4120                            DO_OP(s2, s1, &env->fp_status));   \
4121     }                                                         \
4122     env->vstart = 0;                                          \
4123 }
4124 
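/*
 * Rough sketch of what one expansion does (illustrative, not generated code):
 * GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet) emits a
 * helper that, for every active element i < vl, writes a single mask bit
 *   vd.mask[i] = float16_eq_quiet(vs2[i], vs1[i], &env->fp_status)
 * while inactive elements simply keep their previous mask value.
 */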
4125 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4126 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4127 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4128 
4129 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4130 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4131                   CPURISCVState *env, uint32_t desc)                \
4132 {                                                                   \
4133     uint32_t vm = vext_vm(desc);                                    \
4134     uint32_t vl = env->vl;                                          \
4135     uint32_t i;                                                     \
4136                                                                     \
4137     for (i = env->vstart; i < vl; i++) {                            \
4138         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4139         if (!vm && !vext_elem_mask(v0, i)) {                        \
4140             continue;                                               \
4141         }                                                           \
4142         vext_set_elem_mask(vd, i,                                   \
4143                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4144     }                                                               \
4145     env->vstart = 0;                                                \
4146 }
4147 
4148 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4149 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4150 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4151 
4152 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4153 {
4154     FloatRelation compare = float16_compare_quiet(a, b, s);
4155     return compare != float_relation_equal;
4156 }
4157 
4158 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4159 {
4160     FloatRelation compare = float32_compare_quiet(a, b, s);
4161     return compare != float_relation_equal;
4162 }
4163 
4164 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4165 {
4166     FloatRelation compare = float64_compare_quiet(a, b, s);
4167     return compare != float_relation_equal;
4168 }
4169 
4170 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4171 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4172 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4173 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4174 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4175 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4176 
4177 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4178 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4179 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4180 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4181 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4182 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4183 
4184 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4185 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4186 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4187 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4188 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4189 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4190 
4191 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4192 {
4193     FloatRelation compare = float16_compare(a, b, s);
4194     return compare == float_relation_greater;
4195 }
4196 
4197 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4198 {
4199     FloatRelation compare = float32_compare(a, b, s);
4200     return compare == float_relation_greater;
4201 }
4202 
4203 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4204 {
4205     FloatRelation compare = float64_compare(a, b, s);
4206     return compare == float_relation_greater;
4207 }
4208 
4209 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4210 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4211 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4212 
4213 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4214 {
4215     FloatRelation compare = float16_compare(a, b, s);
4216     return compare == float_relation_greater ||
4217            compare == float_relation_equal;
4218 }
4219 
4220 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4221 {
4222     FloatRelation compare = float32_compare(a, b, s);
4223     return compare == float_relation_greater ||
4224            compare == float_relation_equal;
4225 }
4226 
4227 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4228 {
4229     FloatRelation compare = float64_compare(a, b, s);
4230     return compare == float_relation_greater ||
4231            compare == float_relation_equal;
4232 }
4233 
4234 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4235 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4236 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4237 
4238 /* Vector Floating-Point Classify Instruction */
4239 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4240 static void do_##NAME(void *vd, void *vs2, int i)      \
4241 {                                                      \
4242     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4243     *((TD *)vd + HD(i)) = OP(s2);                      \
4244 }
4245 
4246 #define GEN_VEXT_V(NAME)                               \
4247 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4248                   CPURISCVState *env, uint32_t desc)   \
4249 {                                                      \
4250     uint32_t vm = vext_vm(desc);                       \
4251     uint32_t vl = env->vl;                             \
4252     uint32_t i;                                        \
4253                                                        \
4254     for (i = env->vstart; i < vl; i++) {               \
4255         if (!vm && !vext_elem_mask(v0, i)) {           \
4256             continue;                                  \
4257         }                                              \
4258         do_##NAME(vd, vs2, i);                         \
4259     }                                                  \
4260     env->vstart = 0;                                   \
4261 }
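
/*
 * Illustrative note (assuming RVVCALL simply forwards its arguments to the
 * named OP macro, as the uses below suggest): RVVCALL(OPIVV1, vfclass_v_h,
 * OP_UU_H, H2, H2, fclass_h) defines a per-element do_vfclass_v_h(vd, vs2, i),
 * and GEN_VEXT_V(vfclass_v_h) wraps it in the masked vstart..vl loop above.
 */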
4262 
4263 target_ulong fclass_h(uint64_t frs1)
4264 {
4265     float16 f = frs1;
4266     bool sign = float16_is_neg(f);
4267 
4268     if (float16_is_infinity(f)) {
4269         return sign ? 1 << 0 : 1 << 7;
4270     } else if (float16_is_zero(f)) {
4271         return sign ? 1 << 3 : 1 << 4;
4272     } else if (float16_is_zero_or_denormal(f)) {
4273         return sign ? 1 << 2 : 1 << 5;
4274     } else if (float16_is_any_nan(f)) {
4275         float_status s = { }; /* for snan_bit_is_one */
4276         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4277     } else {
4278         return sign ? 1 << 1 : 1 << 6;
4279     }
4280 }
4281 
4282 target_ulong fclass_s(uint64_t frs1)
4283 {
4284     float32 f = frs1;
4285     bool sign = float32_is_neg(f);
4286 
4287     if (float32_is_infinity(f)) {
4288         return sign ? 1 << 0 : 1 << 7;
4289     } else if (float32_is_zero(f)) {
4290         return sign ? 1 << 3 : 1 << 4;
4291     } else if (float32_is_zero_or_denormal(f)) {
4292         return sign ? 1 << 2 : 1 << 5;
4293     } else if (float32_is_any_nan(f)) {
4294         float_status s = { }; /* for snan_bit_is_one */
4295         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4296     } else {
4297         return sign ? 1 << 1 : 1 << 6;
4298     }
4299 }
4300 
4301 target_ulong fclass_d(uint64_t frs1)
4302 {
4303     float64 f = frs1;
4304     bool sign = float64_is_neg(f);
4305 
4306     if (float64_is_infinity(f)) {
4307         return sign ? 1 << 0 : 1 << 7;
4308     } else if (float64_is_zero(f)) {
4309         return sign ? 1 << 3 : 1 << 4;
4310     } else if (float64_is_zero_or_denormal(f)) {
4311         return sign ? 1 << 2 : 1 << 5;
4312     } else if (float64_is_any_nan(f)) {
4313         float_status s = { }; /* for snan_bit_is_one */
4314         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4315     } else {
4316         return sign ? 1 << 1 : 1 << 6;
4317     }
4318 }
4319 
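/*
 * Result bit positions used by the fclass helpers above (this matches the
 * RISC-V FCLASS encoding):
 *   bit 0: -infinity      bit 5: +subnormal
 *   bit 1: -normal        bit 6: +normal
 *   bit 2: -subnormal     bit 7: +infinity
 *   bit 3: -0             bit 8: signaling NaN
 *   bit 4: +0             bit 9: quiet NaN
 */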
4320 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4321 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4322 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4323 GEN_VEXT_V(vfclass_v_h)
4324 GEN_VEXT_V(vfclass_v_w)
4325 GEN_VEXT_V(vfclass_v_d)
4326 
4327 /* Vector Floating-Point Merge Instruction */
4328 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4329 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4330                   CPURISCVState *env, uint32_t desc)          \
4331 {                                                             \
4332     uint32_t vm = vext_vm(desc);                              \
4333     uint32_t vl = env->vl;                                    \
4334     uint32_t i;                                               \
4335                                                               \
4336     for (i = env->vstart; i < vl; i++) {                      \
4337         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4338         *((ETYPE *)vd + H(i))                                 \
4339           = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4340     }                                                         \
4341     env->vstart = 0;                                          \
4342 }
4343 
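/*
 * Worked example (illustrative): with vl = 4 and mask bits in v0
 * (element 0 first) = 1 0 1 0, vfmerge.vfm produces
 *   vd = { f[rs1], vs2[1], f[rs1], vs2[3] }
 * i.e. active elements take the scalar, inactive elements take vs2.
 */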
4344 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4345 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4346 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4347 
4348 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4349 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4350 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4351 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4352 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4353 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h)
4354 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w)
4355 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d)
4356 
4357 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4358 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4359 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4360 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4361 GEN_VEXT_V_ENV(vfcvt_x_f_v_h)
4362 GEN_VEXT_V_ENV(vfcvt_x_f_v_w)
4363 GEN_VEXT_V_ENV(vfcvt_x_f_v_d)
4364 
4365 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4366 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4367 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4368 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4369 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h)
4370 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w)
4371 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d)
4372 
4373 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4374 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4375 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4376 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4377 GEN_VEXT_V_ENV(vfcvt_f_x_v_h)
4378 GEN_VEXT_V_ENV(vfcvt_f_x_v_w)
4379 GEN_VEXT_V_ENV(vfcvt_f_x_v_d)
4380 
4381 /* Widening Floating-Point/Integer Type-Convert Instructions */
4382 /* (TD, T2, TX2) */
4383 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4384 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4385 #define WOP_UU_W uint64_t, uint32_t, uint32_t
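/*
 * Illustrative reading of the tuples above: WOP_UU_H is
 * (TD, T2, TX2) = (uint32_t, uint16_t, uint16_t), i.e. a widening op on
 * SEW=16 reads 16-bit sources (H2 indexing) and writes 32-bit destination
 * elements (H4 indexing), as in the RVVCALL lines below.
 */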
4386 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4387 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4388 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4389 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h)
4390 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w)
4391 
4392 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4393 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4394 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4395 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h)
4396 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w)
4397 
4398 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4399 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4400 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4401 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4402 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b)
4403 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h)
4404 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w)
4405 
4406 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4407 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4408 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4409 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4410 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b)
4411 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h)
4412 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w)
4413 
4414 /*
4415  * vfwcvt.f.f.v vd, vs2, vm
4416  * Convert single-width float to double-width float.
4417  */
4418 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4419 {
4420     return float16_to_float32(a, true, s);
4421 }
4422 
4423 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4424 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4425 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h)
4426 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w)
4427 
4428 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4429 /* (TD, T2, TX2) */
4430 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4431 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4432 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4433 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4434 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4435 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4436 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4437 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b)
4438 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h)
4439 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w)
4440 
4441 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4442 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4443 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4444 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4445 GEN_VEXT_V_ENV(vfncvt_x_f_w_b)
4446 GEN_VEXT_V_ENV(vfncvt_x_f_w_h)
4447 GEN_VEXT_V_ENV(vfncvt_x_f_w_w)
4448 
4449 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4450 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4451 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4452 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h)
4453 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w)
4454 
4455 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4456 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4457 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4458 GEN_VEXT_V_ENV(vfncvt_f_x_w_h)
4459 GEN_VEXT_V_ENV(vfncvt_f_x_w_w)
4460 
4461 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4462 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4463 {
4464     return float32_to_float16(a, true, s);
4465 }
4466 
4467 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4468 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4469 GEN_VEXT_V_ENV(vfncvt_f_f_w_h)
4470 GEN_VEXT_V_ENV(vfncvt_f_f_w_w)
4471 
4472 /*
4473  *** Vector Reduction Operations
4474  */
4475 /* Vector Single-Width Integer Reduction Instructions */
4476 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4477 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4478         void *vs2, CPURISCVState *env, uint32_t desc)     \
4479 {                                                         \
4480     uint32_t vm = vext_vm(desc);                          \
4481     uint32_t vl = env->vl;                                \
4482     uint32_t i;                                           \
4483     TD s1 =  *((TD *)vs1 + HD(0));                        \
4484                                                           \
4485     for (i = env->vstart; i < vl; i++) {                  \
4486         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4487         if (!vm && !vext_elem_mask(v0, i)) {              \
4488             continue;                                     \
4489         }                                                 \
4490         s1 = OP(s1, (TD)s2);                              \
4491     }                                                     \
4492     *((TD *)vd + HD(0)) = s1;                             \
4493     env->vstart = 0;                                      \
4494 }
4495 
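/*
 * Worked example (illustrative): vredsum.vs with vl = 4, all elements active,
 * vs1[0] = 10 and vs2 = { 1, 2, 3, 4 } stores vd[0] = 10 + 1 + 2 + 3 + 4 = 20.
 * Only element 0 of vd is written by this helper.
 */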
4496 /* vd[0] = sum(vs1[0], vs2[*]) */
4497 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4498 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4499 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4500 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4501 
4502 /* vd[0] = maxu(vs1[0], vs2[*]) */
4503 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4504 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4505 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4506 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4507 
4508 /* vd[0] = max(vs1[0], vs2[*]) */
4509 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4510 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4511 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4512 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4513 
4514 /* vd[0] = minu(vs1[0], vs2[*]) */
4515 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4516 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4517 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4518 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4519 
4520 /* vd[0] = min(vs1[0], vs2[*]) */
4521 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4522 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4523 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4524 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4525 
4526 /* vd[0] = and(vs1[0], vs2[*]) */
4527 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4528 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4529 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4530 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4531 
4532 /* vd[0] = or(vs1[0], vs2[*]) */
4533 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4534 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4535 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4536 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4537 
4538 /* vd[0] = xor(vs1[0], vs2[*]) */
4539 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4540 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4541 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4542 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4543 
4544 /* Vector Widening Integer Reduction Instructions */
4545 /* signed sum reduction into double-width accumulator */
4546 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4547 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4548 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4549 
4550 /* Unsigned sum reduction into double-width accumulator */
4551 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4552 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4553 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4554 
4555 /* Vector Single-Width Floating-Point Reduction Instructions */
4556 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4557 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4558                   void *vs2, CPURISCVState *env,           \
4559                   uint32_t desc)                           \
4560 {                                                          \
4561     uint32_t vm = vext_vm(desc);                           \
4562     uint32_t vl = env->vl;                                 \
4563     uint32_t i;                                            \
4564     TD s1 =  *((TD *)vs1 + HD(0));                         \
4565                                                            \
4566     for (i = env->vstart; i < vl; i++) {                   \
4567         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4568         if (!vm && !vext_elem_mask(v0, i)) {               \
4569             continue;                                      \
4570         }                                                  \
4571         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4572     }                                                      \
4573     *((TD *)vd + HD(0)) = s1;                              \
4574     env->vstart = 0;                                       \
4575 }
4576 
4577 /* Unordered sum */
4578 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4579 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4580 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4581 
4582 /* Maximum value */
4583 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4584 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4585 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4586 
4587 /* Minimum value */
4588 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4589 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4590 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4591 
4592 /* Vector Widening Floating-Point Reduction Instructions */
4593 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4594 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4595                             void *vs2, CPURISCVState *env, uint32_t desc)
4596 {
4597     uint32_t vm = vext_vm(desc);
4598     uint32_t vl = env->vl;
4599     uint32_t i;
4600     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4601 
4602     for (i = env->vstart; i < vl; i++) {
4603         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4604         if (!vm && !vext_elem_mask(v0, i)) {
4605             continue;
4606         }
4607         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4608                          &env->fp_status);
4609     }
4610     *((uint32_t *)vd + H4(0)) = s1;
4611     env->vstart = 0;
4612 }
4613 
4614 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4615                             void *vs2, CPURISCVState *env, uint32_t desc)
4616 {
4617     uint32_t vm = vext_vm(desc);
4618     uint32_t vl = env->vl;
4619     uint32_t i;
4620     uint64_t s1 =  *((uint64_t *)vs1);
4621 
4622     for (i = env->vstart; i < vl; i++) {
4623         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4624         if (!vm && !vext_elem_mask(v0, i)) {
4625             continue;
4626         }
4627         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4628                          &env->fp_status);
4629     }
4630     *((uint64_t *)vd) = s1;
4631     env->vstart = 0;
4632 }
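
/*
 * Illustrative note: both widening sum helpers accumulate in the wider
 * format, e.g. for SEW=16 every active vs2[i] is promoted with
 * float16_to_float32() and added to the float32 accumulator seeded from
 * vs1[0], so each element contributes a single rounding in the wide format.
 */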
4633 
4634 /*
4635  *** Vector Mask Operations
4636  */
4637 /* Vector Mask-Register Logical Instructions */
4638 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4639 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4640                   void *vs2, CPURISCVState *env,          \
4641                   uint32_t desc)                          \
4642 {                                                         \
4643     uint32_t vl = env->vl;                                \
4644     uint32_t i;                                           \
4645     int a, b;                                             \
4646                                                           \
4647     for (i = env->vstart; i < vl; i++) {                  \
4648         a = vext_elem_mask(vs1, i);                       \
4649         b = vext_elem_mask(vs2, i);                       \
4650         vext_set_elem_mask(vd, i, OP(b, a));              \
4651     }                                                     \
4652     env->vstart = 0;                                      \
4653 }
4654 
4655 #define DO_NAND(N, M)  (!(N & M))
4656 #define DO_ANDNOT(N, M)  (N & !M)
4657 #define DO_NOR(N, M)  (!(N | M))
4658 #define DO_ORNOT(N, M)  (N | !M)
4659 #define DO_XNOR(N, M)  (!(N ^ M))
4660 
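/*
 * Note (an observation, not new behaviour): the operands fed to these DO_*
 * macros are single mask bits from vext_elem_mask(), i.e. always 0 or 1, so
 * mixing logical '!' with bitwise '&', '|', '^' is safe here, e.g.
 *   DO_ANDNOT(1, 0) == (1 & !0) == 1
 */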
4661 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4662 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4663 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4664 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4665 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4666 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4667 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4668 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4669 
4670 /* Vector count population in mask vcpop */
4671 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4672                              uint32_t desc)
4673 {
4674     target_ulong cnt = 0;
4675     uint32_t vm = vext_vm(desc);
4676     uint32_t vl = env->vl;
4677     int i;
4678 
4679     for (i = env->vstart; i < vl; i++) {
4680         if (vm || vext_elem_mask(v0, i)) {
4681             if (vext_elem_mask(vs2, i)) {
4682                 cnt++;
4683             }
4684         }
4685     }
4686     env->vstart = 0;
4687     return cnt;
4688 }
4689 
4690 /* vfirst find-first-set mask bit */
4691 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4692                               uint32_t desc)
4693 {
4694     uint32_t vm = vext_vm(desc);
4695     uint32_t vl = env->vl;
4696     int i;
4697 
4698     for (i = env->vstart; i < vl; i++) {
4699         if (vm || vext_elem_mask(v0, i)) {
4700             if (vext_elem_mask(vs2, i)) {
4701                 return i;
4702             }
4703         }
4704     }
4705     env->vstart = 0;
4706     return -1LL;
4707 }
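
/*
 * Worked example (illustrative), unmasked, vl = 8, vs2 mask bits
 * (element 0 first) = 0 0 1 0 1 0 0 0:
 *   vcpop.m  returns 2    (count of set bits among active elements)
 *   vfirst.m returns 2    (index of the first set bit, -1 if there is none)
 */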
4708 
4709 enum set_mask_type {
4710     ONLY_FIRST = 1,
4711     INCLUDE_FIRST,
4712     BEFORE_FIRST,
4713 };
4714 
4715 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4716                    uint32_t desc, enum set_mask_type type)
4717 {
4718     uint32_t vm = vext_vm(desc);
4719     uint32_t vl = env->vl;
4720     int i;
4721     bool first_mask_bit = false;
4722 
4723     for (i = env->vstart; i < vl; i++) {
4724         if (!vm && !vext_elem_mask(v0, i)) {
4725             continue;
4726         }
4727         /* write a zero to all following active elements */
4728         if (first_mask_bit) {
4729             vext_set_elem_mask(vd, i, 0);
4730             continue;
4731         }
4732         if (vext_elem_mask(vs2, i)) {
4733             first_mask_bit = true;
4734             if (type == BEFORE_FIRST) {
4735                 vext_set_elem_mask(vd, i, 0);
4736             } else {
4737                 vext_set_elem_mask(vd, i, 1);
4738             }
4739         } else {
4740             if (type == ONLY_FIRST) {
4741                 vext_set_elem_mask(vd, i, 0);
4742             } else {
4743                 vext_set_elem_mask(vd, i, 1);
4744             }
4745         }
4746     }
4747     env->vstart = 0;
4748 }
4749 
4750 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4751                      uint32_t desc)
4752 {
4753     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4754 }
4755 
4756 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4757                      uint32_t desc)
4758 {
4759     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4760 }
4761 
4762 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4763                      uint32_t desc)
4764 {
4765     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4766 }
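
/*
 * Worked example (illustrative), unmasked, vl = 8, vs2 mask bits
 * (element 0 first) = 0 0 1 0 1 0 0 0, first set bit at element 2:
 *   vmsbf.m -> 1 1 0 0 0 0 0 0    (set Before the First set bit)
 *   vmsif.m -> 1 1 1 0 0 0 0 0    (set up to and Including the First)
 *   vmsof.m -> 0 0 1 0 0 0 0 0    (set Only the First set bit)
 */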
4767 
4768 /* Vector Iota Instruction */
4769 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4770 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4771                   uint32_t desc)                                          \
4772 {                                                                         \
4773     uint32_t vm = vext_vm(desc);                                          \
4774     uint32_t vl = env->vl;                                                \
4775     uint32_t sum = 0;                                                     \
4776     int i;                                                                \
4777                                                                           \
4778     for (i = env->vstart; i < vl; i++) {                                  \
4779         if (!vm && !vext_elem_mask(v0, i)) {                              \
4780             continue;                                                     \
4781         }                                                                 \
4782         *((ETYPE *)vd + H(i)) = sum;                                      \
4783         if (vext_elem_mask(vs2, i)) {                                     \
4784             sum++;                                                        \
4785         }                                                                 \
4786     }                                                                     \
4787     env->vstart = 0;                                                      \
4788 }
4789 
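/*
 * Worked example (illustrative), unmasked, vl = 4, vs2 mask bits
 * (element 0 first) = 1 0 1 1:
 *   viota.m -> vd = { 0, 1, 1, 2 }
 * i.e. each destination element receives the count of set mask bits at
 * indices strictly below it.
 */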
4790 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4791 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4792 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4793 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4794 
4795 /* Vector Element Index Instruction */
4796 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4797 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4798 {                                                                         \
4799     uint32_t vm = vext_vm(desc);                                          \
4800     uint32_t vl = env->vl;                                                \
4801     int i;                                                                \
4802                                                                           \
4803     for (i = env->vstart; i < vl; i++) {                                  \
4804         if (!vm && !vext_elem_mask(v0, i)) {                              \
4805             continue;                                                     \
4806         }                                                                 \
4807         *((ETYPE *)vd + H(i)) = i;                                        \
4808     }                                                                     \
4809     env->vstart = 0;                                                      \
4810 }
4811 
4812 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4813 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4814 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4815 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4816 
4817 /*
4818  *** Vector Permutation Instructions
4819  */
4820 
4821 /* Vector Slide Instructions */
4822 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4823 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4824                   CPURISCVState *env, uint32_t desc)                      \
4825 {                                                                         \
4826     uint32_t vm = vext_vm(desc);                                          \
4827     uint32_t vl = env->vl;                                                \
4828     target_ulong offset = s1, i_min, i;                                   \
4829                                                                           \
4830     i_min = MAX(env->vstart, offset);                                     \
4831     for (i = i_min; i < vl; i++) {                                        \
4832         if (!vm && !vext_elem_mask(v0, i)) {                              \
4833             continue;                                                     \
4834         }                                                                 \
4835         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
4836     }                                                                     \
4837 }
4838 
4839 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4840 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
4841 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4842 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4843 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4844 
4845 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
4846 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4847                   CPURISCVState *env, uint32_t desc)                      \
4848 {                                                                         \
4849     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
4850     uint32_t vm = vext_vm(desc);                                          \
4851     uint32_t vl = env->vl;                                                \
4852     target_ulong i_max, i;                                                \
4853                                                                           \
4854     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
4855     for (i = env->vstart; i < i_max; ++i) {                               \
4856         if (vm || vext_elem_mask(v0, i)) {                                \
4857             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));          \
4858         }                                                                 \
4859     }                                                                     \
4860                                                                           \
4861     for (i = i_max; i < vl; ++i) {                                        \
4862         if (vm || vext_elem_mask(v0, i)) {                                \
4863             *((ETYPE *)vd + H(i)) = 0;                                    \
4864         }                                                                 \
4865     }                                                                     \
4866                                                                           \
4867     env->vstart = 0;                                                      \
4868 }
4869 
4870 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
4871 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
4872 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
4873 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
4874 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
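
/*
 * Worked example (illustrative), unmasked, vl = VLMAX = 4, OFFSET = x[rs1] = 1,
 * vs2 = { a, b, c, d }:
 *   vslideup.vx   -> vd = { vd[0], a, b, c }   (vd[0] is left unchanged)
 *   vslidedown.vx -> vd = { b, c, d, 0 }       (sources beyond VLMAX read as 0)
 */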
4875 
4876 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
4877 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4878                      void *vs2, CPURISCVState *env, uint32_t desc)          \
4879 {                                                                           \
4880     typedef uint##BITWIDTH##_t ETYPE;                                       \
4881     uint32_t vm = vext_vm(desc);                                            \
4882     uint32_t vl = env->vl;                                                  \
4883     uint32_t i;                                                             \
4884                                                                             \
4885     for (i = env->vstart; i < vl; i++) {                                    \
4886         if (!vm && !vext_elem_mask(v0, i)) {                                \
4887             continue;                                                       \
4888         }                                                                   \
4889         if (i == 0) {                                                       \
4890             *((ETYPE *)vd + H(i)) = s1;                                     \
4891         } else {                                                            \
4892             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
4893         }                                                                   \
4894     }                                                                       \
4895     env->vstart = 0;                                                        \
4896 }
4897 
4898 GEN_VEXT_VSLIE1UP(8,  H1)
4899 GEN_VEXT_VSLIE1UP(16, H2)
4900 GEN_VEXT_VSLIE1UP(32, H4)
4901 GEN_VEXT_VSLIE1UP(64, H8)
4902 
4903 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
4904 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4905                   CPURISCVState *env, uint32_t desc)              \
4906 {                                                                 \
4907     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
4908 }
4909 
4910 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
4911 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
4912 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
4913 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
4914 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
4915 
4916 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
4917 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1,       \
4918                        void *vs2, CPURISCVState *env, uint32_t desc)          \
4919 {                                                                             \
4920     typedef uint##BITWIDTH##_t ETYPE;                                         \
4921     uint32_t vm = vext_vm(desc);                                              \
4922     uint32_t vl = env->vl;                                                    \
4923     uint32_t i;                                                               \
4924                                                                               \
4925     for (i = env->vstart; i < vl; i++) {                                      \
4926         if (!vm && !vext_elem_mask(v0, i)) {                                  \
4927             continue;                                                         \
4928         }                                                                     \
4929         if (i == vl - 1) {                                                    \
4930             *((ETYPE *)vd + H(i)) = s1;                                       \
4931         } else {                                                              \
4932             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
4933         }                                                                     \
4934     }                                                                         \
4935     env->vstart = 0;                                                          \
4936 }
4937 
4938 GEN_VEXT_VSLIDE1DOWN(8,  H1)
4939 GEN_VEXT_VSLIDE1DOWN(16, H2)
4940 GEN_VEXT_VSLIDE1DOWN(32, H4)
4941 GEN_VEXT_VSLIDE1DOWN(64, H8)
4942 
4943 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
4944 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4945                   CPURISCVState *env, uint32_t desc)              \
4946 {                                                                 \
4947     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
4948 }
4949 
4950 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
4951 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
4952 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
4953 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
4954 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
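
/*
 * Worked example (illustrative), unmasked, vl = 4, vs2 = { a, b, c, d },
 * scalar x[rs1] = s:
 *   vslide1up.vx   -> vd = { s, a, b, c }
 *   vslide1down.vx -> vd = { b, c, d, s }
 */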
4955 
4956 /* Vector Floating-Point Slide Instructions */
4957 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
4958 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4959                   CPURISCVState *env, uint32_t desc)          \
4960 {                                                             \
4961     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
4962 }
4963 
4964 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
4965 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
4966 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
4967 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
4968 
4969 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
4970 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4971                   CPURISCVState *env, uint32_t desc)          \
4972 {                                                             \
4973     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
4974 }
4975 
4976 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
4977 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
4978 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
4979 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
4980 
4981 /* Vector Register Gather Instruction */
4982 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
4983 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
4984                   CPURISCVState *env, uint32_t desc)                      \
4985 {                                                                         \
4986     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
4987     uint32_t vm = vext_vm(desc);                                          \
4988     uint32_t vl = env->vl;                                                \
4989     uint64_t index;                                                       \
4990     uint32_t i;                                                           \
4991                                                                           \
4992     for (i = env->vstart; i < vl; i++) {                                  \
4993         if (!vm && !vext_elem_mask(v0, i)) {                              \
4994             continue;                                                     \
4995         }                                                                 \
4996         index = *((TS1 *)vs1 + HS1(i));                                   \
4997         if (index >= vlmax) {                                             \
4998             *((TS2 *)vd + HS2(i)) = 0;                                    \
4999         } else {                                                          \
5000             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5001         }                                                                 \
5002     }                                                                     \
5003     env->vstart = 0;                                                      \
5004 }
5005 
5006 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5007 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5008 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5009 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5010 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5011 
5012 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5013 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5014 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5015 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
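
/*
 * Worked example (illustrative), unmasked, vl = VLMAX = 4,
 * vs2 = { a, b, c, d } and index vector vs1 = { 2, 0, 9, 1 }:
 *   vrgather.vv -> vd = { c, a, 0, b }    (index 9 >= VLMAX reads as 0)
 */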
5016 
5017 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5018 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5019                   CPURISCVState *env, uint32_t desc)                      \
5020 {                                                                         \
5021     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5022     uint32_t vm = vext_vm(desc);                                          \
5023     uint32_t vl = env->vl;                                                \
5024     uint64_t index = s1;                                                  \
5025     uint32_t i;                                                           \
5026                                                                           \
5027     for (i = env->vstart; i < vl; i++) {                                  \
5028         if (!vm && !vext_elem_mask(v0, i)) {                              \
5029             continue;                                                     \
5030         }                                                                 \
5031         if (index >= vlmax) {                                             \
5032             *((ETYPE *)vd + H(i)) = 0;                                    \
5033         } else {                                                          \
5034             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5035         }                                                                 \
5036     }                                                                     \
5037     env->vstart = 0;                                                      \
5038 }
5039 
5040 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5041 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5042 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5043 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5044 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5045 
5046 /* Vector Compress Instruction */
5047 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5048 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5049                   CPURISCVState *env, uint32_t desc)                      \
5050 {                                                                         \
5051     uint32_t vl = env->vl;                                                \
5052     uint32_t num = 0, i;                                                  \
5053                                                                           \
5054     for (i = env->vstart; i < vl; i++) {                                  \
5055         if (!vext_elem_mask(vs1, i)) {                                    \
5056             continue;                                                     \
5057         }                                                                 \
5058         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5059         num++;                                                            \
5060     }                                                                     \
5061     env->vstart = 0;                                                      \
5062 }
5063 
5064 /* Compress into vd elements of vs2 where vs1 is enabled */
5065 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5066 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5067 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5068 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
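
/*
 * Worked example (illustrative), vl = 4, vs2 = { a, b, c, d }, mask register
 * vs1 bits (element 0 first) = 0 1 0 1:
 *   vcompress.vm packs the active elements: vd[0] = b, vd[1] = d;
 *   vd[2] and above are not written by this helper.
 */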
5069 
5070 /* Vector Whole Register Move */
5071 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5072 {
5073     /* EEW = SEW */
5074     uint32_t maxsz = simd_maxsz(desc);
5075     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5076     uint32_t startb = env->vstart * sewb;
5077     uint32_t i = startb;
5078 
5079     memcpy((uint8_t *)vd + H1(i),
5080            (uint8_t *)vs2 + H1(i),
5081            maxsz - startb);
5082 
5083     env->vstart = 0;
5084 }
5085 
5086 /* Vector Integer Extension */
5087 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5088 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5089                   CPURISCVState *env, uint32_t desc)             \
5090 {                                                                \
5091     uint32_t vl = env->vl;                                       \
5092     uint32_t vm = vext_vm(desc);                                 \
5093     uint32_t i;                                                  \
5094                                                                  \
5095     for (i = env->vstart; i < vl; i++) {                         \
5096         if (!vm && !vext_elem_mask(v0, i)) {                     \
5097             continue;                                            \
5098         }                                                        \
5099         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5100     }                                                            \
5101     env->vstart = 0;                                             \
5102 }
5103 
5104 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5105 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5106 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5107 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5108 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5109 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5110 
5111 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5112 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5113 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5114 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5115 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5116 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
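
/*
 * Worked example (illustrative): if vs2 holds the bytes 0xFF, 0x02 then
 * vzext.vf2 (vzext_vf2_h) yields uint16_t { 0x00FF, 0x0002 } while
 * vsext.vf2 (vsext_vf2_h) yields int16_t { 0xFFFF, 0x0002 }, i.e. -1 and 2.
 */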
5117