1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "fpu/softfloat.h"
29 #include "tcg/tcg-gvec-desc.h"
30 #include "internals.h"
31 #include "vector_internals.h"
32 #include <math.h>
33 
34 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                             target_ulong s2)
36 {
37     int vlmax, vl;
38     RISCVCPU *cpu = env_archcpu(env);
39     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41     uint16_t sew = 8 << vsew;
42     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43     int xlen = riscv_cpu_xlen(env);
44     bool vill = (s2 >> (xlen - 1)) & 0x1;
45     target_ulong reserved = s2 &
46                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48     uint16_t vlen = cpu->cfg.vlenb << 3;
49     int8_t lmul;
50 
51     if (vlmul & 4) {
52         /*
53          * Fractional LMUL, check:
54          *
55          * VLEN * LMUL >= SEW
56          * VLEN >> (8 - lmul) >= sew
57          * (vlenb << 3) >> (8 - lmul) >= sew
58          */
59         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60             vill = true;
61         }
62     }
63 
64     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65         /* only set vill bit. */
66         env->vill = 1;
67         env->vtype = 0;
68         env->vl = 0;
69         env->vstart = 0;
70         return 0;
71     }
72 
73     /* lmul encoded as in DisasContext::lmul */
74     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76     if (s1 <= vlmax) {
77         vl = s1;
78     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79         vl = (s1 + 1) >> 1;
80     } else {
81         vl = vlmax;
82     }
83     env->vl = vl;
84     env->vtype = s2;
85     env->vstart = 0;
86     env->vill = 0;
87     return vl;
88 }
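/*
 * Illustrative example (numbers are not from the source): with VLEN = 128
 * bits (vlenb = 16), a vtype requesting SEW = 32 (vsew = 2) and LMUL = 2
 * (vlmul = 1) gives vlmax = 2 * 128 / 32 = 8, so an AVL of s1 = 10 yields
 * vl = vlmax = 8 (or ceil(10 / 2) = 5 when rvv_vl_half_avl is enabled).
 * For a fractional LMUL such as 1/2 (vlmul = 7), the fractional-LMUL check
 * above reduces to (128 >> 1) >= SEW, so any SEW up to 64 is legal.
 */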
89 
90 /*
91  * Get the maximum number of elements that can be operated on.
92  *
93  * log2_esz: log2 of element size in bytes.
94  */
95 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
96 {
97     /*
98      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
99      * so vlen in bytes (vlenb) is encoded as maxsz.
100      */
101     uint32_t vlenb = simd_maxsz(desc);
102 
103     /* Return VLMAX */
104     int scale = vext_lmul(desc) - log2_esz;
105     return scale < 0 ? vlenb >> -scale : vlenb << scale;
106 }
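/*
 * For example (illustrative numbers): with vlenb = 16 (VLEN = 128),
 * LMUL = 1 (vext_lmul(desc) = 0) and SEW = 32 (log2_esz = 2), scale = -2
 * and VLMAX = 16 >> 2 = 4 elements.
 */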
107 
108 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
109 {
110     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
111 }
112 
113 /*
114  * This function checks for watchpoints before the real load operation.
115  *
116  * In system mode, the TLB API probe_access is enough for the watchpoint check.
117  * In user mode, there is no watchpoint support for now.
118  *
119  * It will trigger an exception if there is no mapping in the TLB
120  * and the page table walk can't fill the TLB entry. The guest software
121  * can then return here after processing the exception, or never return.
122  */
123 static void probe_pages(CPURISCVState *env, target_ulong addr,
124                         target_ulong len, uintptr_t ra,
125                         MMUAccessType access_type)
126 {
127     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
128     target_ulong curlen = MIN(pagelen, len);
129     int mmu_index = riscv_env_mmu_index(env, false);
130 
131     probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                  mmu_index, ra);
133     if (len > curlen) {
134         addr += curlen;
135         curlen = len - curlen;
136         probe_access(env, adjust_addr(env, addr), curlen, access_type,
137                      mmu_index, ra);
138     }
139 }
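/*
 * The two probes above cover an access that crosses at most one page
 * boundary: e.g. a 16-byte access starting 8 bytes before a page boundary
 * is probed as one 8-byte access in the first page and one 8-byte access
 * in the second (illustrative sizes).
 */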
140 
141 static inline void vext_set_elem_mask(void *v0, int index,
142                                       uint8_t value)
143 {
144     int idx = index / 64;
145     int pos = index % 64;
146     uint64_t old = ((uint64_t *)v0)[idx];
147     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
148 }
149 
150 /* elements operations for load and store */
151 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
152                                    uint32_t idx, void *vd, uintptr_t retaddr);
153 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
154 
155 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
156 static inline QEMU_ALWAYS_INLINE                            \
157 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
158                 uint32_t idx, void *vd, uintptr_t retaddr)  \
159 {                                                           \
160     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
161     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
162 }                                                           \
163                                                             \
164 static inline QEMU_ALWAYS_INLINE                            \
165 void NAME##_host(void *vd, uint32_t idx, void *host)        \
166 {                                                           \
167     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
168     *cur = (ETYPE)LDSUF##_p(host);                          \
169 }
170 
171 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
172 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
173 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
174 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
175 
176 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
177 static inline QEMU_ALWAYS_INLINE                            \
178 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
179                 uint32_t idx, void *vd, uintptr_t retaddr)  \
180 {                                                           \
181     ETYPE data = *((ETYPE *)vd + H(idx));                   \
182     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
183 }                                                           \
184                                                             \
185 static inline QEMU_ALWAYS_INLINE                            \
186 void NAME##_host(void *vd, uint32_t idx, void *host)        \
187 {                                                           \
188     ETYPE data = *((ETYPE *)vd + H(idx));                   \
189     STSUF##_p(host, data);                                  \
190 }
191 
192 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
193 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
194 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
195 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
196 
197 static inline QEMU_ALWAYS_INLINE void
198 vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
199                        void *vd, uint32_t evl, target_ulong addr,
200                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
201                        bool is_load)
202 {
203     uint32_t i;
204     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
205         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
206     }
207 }
208 
209 static inline QEMU_ALWAYS_INLINE void
210 vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
211                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
212                         uint32_t esz, bool is_load)
213 {
214 #if HOST_BIG_ENDIAN
215     for (; reg_start < evl; reg_start++, host += esz) {
216         ldst_host(vd, reg_start, host);
217     }
218 #else
219     if (esz == 1) {
220         uint32_t byte_offset = reg_start * esz;
221         uint32_t size = (evl - reg_start) * esz;
222 
223         if (is_load) {
224             memcpy(vd + byte_offset, host, size);
225         } else {
226             memcpy(host, vd + byte_offset, size);
227         }
228     } else {
229         for (; reg_start < evl; reg_start++, host += esz) {
230             ldst_host(vd, reg_start, host);
231         }
232     }
233 #endif
234 }
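/*
 * Note: the memcpy fast path above is only taken for single-byte elements,
 * where the copy is trivially endian- and layout-neutral; wider elements go
 * through the per-element host accessors.
 */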
235 
236 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
237                                    uint32_t desc, uint32_t nf,
238                                    uint32_t esz, uint32_t max_elems)
239 {
240     uint32_t vta = vext_vta(desc);
241     int k;
242 
243     if (vta == 0) {
244         return;
245     }
246 
247     for (k = 0; k < nf; ++k) {
248         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
249                           (k * max_elems + max_elems) * esz);
250     }
251 }
252 
253 /*
254  * stride: access vector elements from strided memory
255  */
256 static void
257 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
258                  CPURISCVState *env, uint32_t desc, uint32_t vm,
259                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
260                  uintptr_t ra)
261 {
262     uint32_t i, k;
263     uint32_t nf = vext_nf(desc);
264     uint32_t max_elems = vext_max_elems(desc, log2_esz);
265     uint32_t esz = 1 << log2_esz;
266     uint32_t vma = vext_vma(desc);
267 
268     VSTART_CHECK_EARLY_EXIT(env, env->vl);
269 
270     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
271         k = 0;
272         while (k < nf) {
273             if (!vm && !vext_elem_mask(v0, i)) {
274                 /* set masked-off elements to 1s */
275                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
276                                   (i + k * max_elems + 1) * esz);
277                 k++;
278                 continue;
279             }
280             target_ulong addr = base + stride * i + (k << log2_esz);
281             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
282             k++;
283         }
284     }
285     env->vstart = 0;
286 
287     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
288 }
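/*
 * Illustrative address computation: for a strided segment load with
 * stride = 12, SEW = 32 (log2_esz = 2) and nf = 2, field k of element i is
 * accessed at base + 12 * i + 4 * k, matching the addr expression above.
 */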
289 
290 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
291 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
292                   target_ulong stride, CPURISCVState *env,              \
293                   uint32_t desc)                                        \
294 {                                                                       \
295     uint32_t vm = vext_vm(desc);                                        \
296     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
297                      ctzl(sizeof(ETYPE)), GETPC());                     \
298 }
299 
300 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
301 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
302 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
303 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
304 
305 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
306 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
307                   target_ulong stride, CPURISCVState *env,              \
308                   uint32_t desc)                                        \
309 {                                                                       \
310     uint32_t vm = vext_vm(desc);                                        \
311     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
312                      ctzl(sizeof(ETYPE)), GETPC());                     \
313 }
314 
315 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
316 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
317 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
318 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
319 
320 /*
321  * unit-stride: access elements stored contiguously in memory
322  */
323 
324 /* unmasked unit-stride load and store operation */
325 static inline QEMU_ALWAYS_INLINE void
326 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
327                   uint32_t elems, uint32_t nf, uint32_t max_elems,
328                   uint32_t log2_esz, bool is_load, int mmu_index,
329                   vext_ldst_elem_fn_tlb *ldst_tlb,
330                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
331 {
332     void *host;
333     int i, k, flags;
334     uint32_t esz = 1 << log2_esz;
335     uint32_t size = (elems * nf) << log2_esz;
336     uint32_t evl = env->vstart + elems;
337     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
338 
339     /* Check page permission/pmp/watchpoint/etc. */
340     flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
341                                mmu_index, true, &host, ra);
342 
343     if (flags == 0) {
344         if (nf == 1) {
345             vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, host,
346                                     esz, is_load);
347         } else {
348             for (i = env->vstart; i < evl; ++i) {
349                 k = 0;
350                 while (k < nf) {
351                     ldst_host(vd, i + k * max_elems, host);
352                     host += esz;
353                     k++;
354                 }
355             }
356         }
357         env->vstart += elems;
358     } else {
359         if (nf == 1) {
360             vext_continus_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
361                                    ra, esz, is_load);
362         } else {
363             /* load bytes from guest memory */
364             for (i = env->vstart; i < evl; env->vstart = ++i) {
365                 k = 0;
366                 while (k < nf) {
367                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
368                              vd, ra);
369                     addr += esz;
370                     k++;
371                 }
372             }
373         }
374     }
375 }
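/*
 * When probe_access_flags() returns 0, the whole range lies in ordinary RAM
 * already present in the TLB, so elements are accessed directly through the
 * returned host pointer; any other flag value falls back to the slower
 * per-element TLB accessors.
 */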
376 
377 static inline QEMU_ALWAYS_INLINE void
378 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
379              vext_ldst_elem_fn_tlb *ldst_tlb,
380              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
381              uint32_t evl, uintptr_t ra, bool is_load)
382 {
383     uint32_t k;
384     target_ulong page_split, elems, addr;
385     uint32_t nf = vext_nf(desc);
386     uint32_t max_elems = vext_max_elems(desc, log2_esz);
387     uint32_t esz = 1 << log2_esz;
388     uint32_t msize = nf * esz;
389     int mmu_index = riscv_env_mmu_index(env, false);
390 
391     VSTART_CHECK_EARLY_EXIT(env, evl);
392 
393     /* Calculate the page range of first page */
394     addr = base + ((env->vstart * nf) << log2_esz);
395     page_split = -(addr | TARGET_PAGE_MASK);
396     /* Get number of elements */
397     elems = page_split / msize;
398     if (unlikely(env->vstart + elems >= evl)) {
399         elems = evl - env->vstart;
400     }
401 
402     /* Load/store elements in the first page */
403     if (likely(elems)) {
404         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
405                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
406     }
407 
408     /* Load/store elements in the second page */
409     if (unlikely(env->vstart < evl)) {
410         /* Cross page element */
411         if (unlikely(page_split % msize)) {
412             for (k = 0; k < nf; k++) {
413                 addr = base + ((env->vstart * nf + k) << log2_esz);
414                 ldst_tlb(env, adjust_addr(env, addr),
415                         env->vstart + k * max_elems, vd, ra);
416             }
417             env->vstart++;
418         }
419 
420         addr = base + ((env->vstart * nf) << log2_esz);
421         /* Get number of elements of second page */
422         elems = evl - env->vstart;
423 
424         /* Load/store elements in the second page */
425         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
426                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
427     }
428 
429     env->vstart = 0;
430     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
431 }
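/*
 * Page-split example (illustrative): for a unit-stride byte load (nf = 1,
 * esz = 1) of evl = 32 elements whose base address is 12 bytes below a page
 * boundary, the first vext_page_ldst_us() call handles 12 elements and the
 * second call handles the remaining 20 elements in the following page.
 */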
432 
433 /*
434  * A masked unit-stride load or store operation is handled as a special
435  * case of the strided form, with stride = NF * sizeof(ETYPE)
436  */
437 
438 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
439 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
440                          CPURISCVState *env, uint32_t desc)         \
441 {                                                                   \
442     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
443     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
444                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
445 }                                                                   \
446                                                                     \
447 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
448                   CPURISCVState *env, uint32_t desc)                \
449 {                                                                   \
450     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
451                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
452 }
453 
454 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
455 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
456 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
457 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
458 
459 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
460 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
461                          CPURISCVState *env, uint32_t desc)              \
462 {                                                                        \
463     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
464     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
465                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
466 }                                                                        \
467                                                                          \
468 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
469                   CPURISCVState *env, uint32_t desc)                     \
470 {                                                                        \
471     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
472                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
473 }
474 
475 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
476 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
477 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
478 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
479 
480 /*
481  * unit stride mask load and store, EEW = 1
482  */
483 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
484                     CPURISCVState *env, uint32_t desc)
485 {
486     /* evl = ceil(vl/8) */
487     uint8_t evl = (env->vl + 7) >> 3;
488     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
489                  0, evl, GETPC(), true);
490 }
491 
492 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
493                     CPURISCVState *env, uint32_t desc)
494 {
495     /* evl = ceil(vl/8) */
496     uint8_t evl = (env->vl + 7) >> 3;
497     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
498                  0, evl, GETPC(), false);
499 }
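/*
 * For mask loads/stores the effective EEW is 1 bit, so the effective element
 * count is in bytes: e.g. vl = 17 gives evl = (17 + 7) >> 3 = 3 bytes.
 */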
500 
501 /*
502  * index: access vector elements from indexed memory
503  */
504 typedef target_ulong vext_get_index_addr(target_ulong base,
505         uint32_t idx, void *vs2);
506 
507 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
508 static target_ulong NAME(target_ulong base,            \
509                          uint32_t idx, void *vs2)      \
510 {                                                      \
511     return (base + *((ETYPE *)vs2 + H(idx)));          \
512 }
513 
514 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
515 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
516 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
517 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
518 
519 static inline void
520 vext_ldst_index(void *vd, void *v0, target_ulong base,
521                 void *vs2, CPURISCVState *env, uint32_t desc,
522                 vext_get_index_addr get_index_addr,
523                 vext_ldst_elem_fn_tlb *ldst_elem,
524                 uint32_t log2_esz, uintptr_t ra)
525 {
526     uint32_t i, k;
527     uint32_t nf = vext_nf(desc);
528     uint32_t vm = vext_vm(desc);
529     uint32_t max_elems = vext_max_elems(desc, log2_esz);
530     uint32_t esz = 1 << log2_esz;
531     uint32_t vma = vext_vma(desc);
532 
533     VSTART_CHECK_EARLY_EXIT(env, env->vl);
534 
535     /* load bytes from guest memory */
536     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
537         k = 0;
538         while (k < nf) {
539             if (!vm && !vext_elem_mask(v0, i)) {
540                 /* set masked-off elements to 1s */
541                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
542                                   (i + k * max_elems + 1) * esz);
543                 k++;
544                 continue;
545             }
546             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
547             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
548             k++;
549         }
550     }
551     env->vstart = 0;
552 
553     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
554 }
555 
556 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
557 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
558                   void *vs2, CPURISCVState *env, uint32_t desc)            \
559 {                                                                          \
560     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
561                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
562 }
563 
564 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
565 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
566 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
567 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
568 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
569 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
570 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
571 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
572 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
573 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
574 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
575 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
576 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
577 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
578 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
579 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
580 
581 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
582 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
583                   void *vs2, CPURISCVState *env, uint32_t desc)  \
584 {                                                                \
585     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
586                     STORE_FN, ctzl(sizeof(ETYPE)),               \
587                     GETPC());                                    \
588 }
589 
590 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
591 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
592 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
593 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
594 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
595 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
596 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
597 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
598 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
599 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
600 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
601 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
602 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
603 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
604 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
605 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
606 
607 /*
608  * unit-stride fault-only-first load instructions
609  */
610 static inline void
611 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
612           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
613           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
614 {
615     uint32_t i, k, vl = 0;
616     uint32_t nf = vext_nf(desc);
617     uint32_t vm = vext_vm(desc);
618     uint32_t max_elems = vext_max_elems(desc, log2_esz);
619     uint32_t esz = 1 << log2_esz;
620     uint32_t msize = nf * esz;
621     uint32_t vma = vext_vma(desc);
622     target_ulong addr, offset, remain, page_split, elems;
623     int mmu_index = riscv_env_mmu_index(env, false);
624 
625     VSTART_CHECK_EARLY_EXIT(env, env->vl);
626 
627     /* probe every access */
628     for (i = env->vstart; i < env->vl; i++) {
629         if (!vm && !vext_elem_mask(v0, i)) {
630             continue;
631         }
632         addr = adjust_addr(env, base + i * (nf << log2_esz));
633         if (i == 0) {
634             /* Allow fault on first element. */
635             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
636         } else {
637             remain = nf << log2_esz;
638             while (remain > 0) {
639                 void *host;
640                 int flags;
641 
642                 offset = -(addr | TARGET_PAGE_MASK);
643 
644                 /* Probe nonfault on subsequent elements. */
645                 flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
646                                            mmu_index, true, &host, 0);
647 
648                 /*
649                  * Stop if invalid (unmapped) or mmio (transaction may fail).
650                  * Do not stop if watchpoint, as the spec says that
651                  * first-fault should continue to access the same
652                  * elements regardless of any watchpoint.
653                  */
654                 if (flags & ~TLB_WATCHPOINT) {
655                     vl = i;
656                     goto ProbeSuccess;
657                 }
658                 if (remain <= offset) {
659                     break;
660                 }
661                 remain -= offset;
662                 addr = adjust_addr(env, addr + offset);
663             }
664         }
665     }
666 ProbeSuccess:
667     /* load bytes from guest memory */
668     if (vl != 0) {
669         env->vl = vl;
670     }
671 
672     if (env->vstart < env->vl) {
673         if (vm) {
674             /* Calculate the page range of first page */
675             addr = base + ((env->vstart * nf) << log2_esz);
676             page_split = -(addr | TARGET_PAGE_MASK);
677             /* Get number of elements */
678             elems = page_split / msize;
679             if (unlikely(env->vstart + elems >= env->vl)) {
680                 elems = env->vl - env->vstart;
681             }
682 
683             /* Load/store elements in the first page */
684             if (likely(elems)) {
685                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
686                                   log2_esz, true, mmu_index, ldst_tlb,
687                                   ldst_host, ra);
688             }
689 
690             /* Load/store elements in the second page */
691             if (unlikely(env->vstart < env->vl)) {
692                 /* Cross page element */
693                 if (unlikely(page_split % msize)) {
694                     for (k = 0; k < nf; k++) {
695                         addr = base + ((env->vstart * nf + k) << log2_esz);
696                         ldst_tlb(env, adjust_addr(env, addr),
697                                  env->vstart + k * max_elems, vd, ra);
698                     }
699                     env->vstart++;
700                 }
701 
702                 addr = base + ((env->vstart * nf) << log2_esz);
703                 /* Get number of elements of second page */
704                 elems = env->vl - env->vstart;
705 
706                 /* Load/store elements in the second page */
707                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
708                                   log2_esz, true, mmu_index, ldst_tlb,
709                                   ldst_host, ra);
710             }
711         } else {
712             for (i = env->vstart; i < env->vl; i++) {
713                 k = 0;
714                 while (k < nf) {
715                     if (!vext_elem_mask(v0, i)) {
716                         /* set masked-off elements to 1s */
717                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
718                                           (i + k * max_elems + 1) * esz);
719                         k++;
720                         continue;
721                     }
722                     addr = base + ((i * nf + k) << log2_esz);
723                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
724                              vd, ra);
725                     k++;
726                 }
727             }
728         }
729     }
730     env->vstart = 0;
731 
732     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
733 }
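/*
 * Fault-only-first example (illustrative): if element 0 is accessible but
 * the probe for element 5 hits an unmapped page, vl is truncated to 5, the
 * first five elements are loaded normally and no exception is raised.
 */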
734 
735 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
736 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
737                   CPURISCVState *env, uint32_t desc)            \
738 {                                                               \
739     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
740               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
741 }
742 
743 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
744 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
745 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
746 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
747 
748 #define DO_SWAP(N, M) (M)
749 #define DO_AND(N, M)  (N & M)
750 #define DO_XOR(N, M)  (N ^ M)
751 #define DO_OR(N, M)   (N | M)
752 #define DO_ADD(N, M)  (N + M)
753 
754 /* Signed min/max */
755 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
756 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
757 
758 /*
759  * load and store whole register instructions
760  */
761 static inline QEMU_ALWAYS_INLINE void
762 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
763                 vext_ldst_elem_fn_tlb *ldst_tlb,
764                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
765                 uintptr_t ra, bool is_load)
766 {
767     target_ulong page_split, elems, addr;
768     uint32_t nf = vext_nf(desc);
769     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
770     uint32_t max_elems = vlenb >> log2_esz;
771     uint32_t evl = nf * max_elems;
772     uint32_t esz = 1 << log2_esz;
773     int mmu_index = riscv_env_mmu_index(env, false);
774 
775     /* Calculate the page range of first page */
776     addr = base + (env->vstart << log2_esz);
777     page_split = -(addr | TARGET_PAGE_MASK);
778     /* Get number of elements */
779     elems = page_split / esz;
780     if (unlikely(env->vstart + elems >= evl)) {
781         elems = evl - env->vstart;
782     }
783 
784     /* Load/store elements in the first page */
785     if (likely(elems)) {
786         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
787                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
788     }
789 
790     /* Load/store elements in the second page */
791     if (unlikely(env->vstart < evl)) {
792         /* Cross page element */
793         if (unlikely(page_split % esz)) {
794             addr = base + (env->vstart << log2_esz);
795             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
796             env->vstart++;
797         }
798 
799         addr = base + (env->vstart << log2_esz);
800         /* Get number of elements of second page */
801         elems = evl - env->vstart;
802 
803         /* Load/store elements in the second page */
804         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
805                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
806     }
807 
808     env->vstart = 0;
809 }
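/*
 * Whole-register accesses ignore vl: e.g. vl2re32.v with vlenb = 16
 * (illustrative) transfers nf = 2 registers of 16 >> 2 = 4 elements each,
 * i.e. evl = 8 elements, regardless of the current vl setting.
 */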
810 
811 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
812 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
813                   uint32_t desc)                                    \
814 {                                                                   \
815     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
816                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
817 }
818 
819 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
820 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
821 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
822 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
823 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
824 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
825 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
826 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
827 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
828 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
829 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
830 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
831 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
832 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
833 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
834 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
835 
836 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
837 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
838                   uint32_t desc)                                        \
839 {                                                                       \
840     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
841                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
842 }
843 
844 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
845 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
846 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
847 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
848 
849 /*
850  * Vector Integer Arithmetic Instructions
851  */
852 
853 /* (TD, T1, T2, TX1, TX2) */
854 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
855 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
856 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
857 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
858 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
859 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
860 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
861 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
862 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
863 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
864 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
865 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
866 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
867 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
868 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
869 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
870 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
871 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
872 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
873 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
874 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
875 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
876 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
877 
878 #define DO_SUB(N, M) (N - M)
879 #define DO_RSUB(N, M) (M - N)
880 
881 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
882 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
883 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
884 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
885 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
886 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
887 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
888 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
889 
890 GEN_VEXT_VV(vadd_vv_b, 1)
891 GEN_VEXT_VV(vadd_vv_h, 2)
892 GEN_VEXT_VV(vadd_vv_w, 4)
893 GEN_VEXT_VV(vadd_vv_d, 8)
894 GEN_VEXT_VV(vsub_vv_b, 1)
895 GEN_VEXT_VV(vsub_vv_h, 2)
896 GEN_VEXT_VV(vsub_vv_w, 4)
897 GEN_VEXT_VV(vsub_vv_d, 8)
898 
899 
900 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
901 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
902 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
903 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
904 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
905 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
906 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
907 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
908 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
909 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
910 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
911 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
912 
913 GEN_VEXT_VX(vadd_vx_b, 1)
914 GEN_VEXT_VX(vadd_vx_h, 2)
915 GEN_VEXT_VX(vadd_vx_w, 4)
916 GEN_VEXT_VX(vadd_vx_d, 8)
917 GEN_VEXT_VX(vsub_vx_b, 1)
918 GEN_VEXT_VX(vsub_vx_h, 2)
919 GEN_VEXT_VX(vsub_vx_w, 4)
920 GEN_VEXT_VX(vsub_vx_d, 8)
921 GEN_VEXT_VX(vrsub_vx_b, 1)
922 GEN_VEXT_VX(vrsub_vx_h, 2)
923 GEN_VEXT_VX(vrsub_vx_w, 4)
924 GEN_VEXT_VX(vrsub_vx_d, 8)
925 
926 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
927 {
928     intptr_t oprsz = simd_oprsz(desc);
929     intptr_t i;
930 
931     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
932         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
933     }
934 }
935 
936 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
937 {
938     intptr_t oprsz = simd_oprsz(desc);
939     intptr_t i;
940 
941     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
942         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
943     }
944 }
945 
946 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
947 {
948     intptr_t oprsz = simd_oprsz(desc);
949     intptr_t i;
950 
951     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
952         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
953     }
954 }
955 
956 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
957 {
958     intptr_t oprsz = simd_oprsz(desc);
959     intptr_t i;
960 
961     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
962         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
963     }
964 }
965 
966 /* Vector Widening Integer Add/Subtract */
967 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
968 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
969 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
970 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
971 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
972 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
973 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
974 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
975 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
976 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
977 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
978 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
979 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
980 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
981 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
982 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
983 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
984 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
985 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
986 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
987 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
988 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
989 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
990 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
991 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
992 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
993 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
994 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
995 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
996 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
997 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
998 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
999 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1000 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1001 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1002 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1003 GEN_VEXT_VV(vwaddu_vv_b, 2)
1004 GEN_VEXT_VV(vwaddu_vv_h, 4)
1005 GEN_VEXT_VV(vwaddu_vv_w, 8)
1006 GEN_VEXT_VV(vwsubu_vv_b, 2)
1007 GEN_VEXT_VV(vwsubu_vv_h, 4)
1008 GEN_VEXT_VV(vwsubu_vv_w, 8)
1009 GEN_VEXT_VV(vwadd_vv_b, 2)
1010 GEN_VEXT_VV(vwadd_vv_h, 4)
1011 GEN_VEXT_VV(vwadd_vv_w, 8)
1012 GEN_VEXT_VV(vwsub_vv_b, 2)
1013 GEN_VEXT_VV(vwsub_vv_h, 4)
1014 GEN_VEXT_VV(vwsub_vv_w, 8)
1015 GEN_VEXT_VV(vwaddu_wv_b, 2)
1016 GEN_VEXT_VV(vwaddu_wv_h, 4)
1017 GEN_VEXT_VV(vwaddu_wv_w, 8)
1018 GEN_VEXT_VV(vwsubu_wv_b, 2)
1019 GEN_VEXT_VV(vwsubu_wv_h, 4)
1020 GEN_VEXT_VV(vwsubu_wv_w, 8)
1021 GEN_VEXT_VV(vwadd_wv_b, 2)
1022 GEN_VEXT_VV(vwadd_wv_h, 4)
1023 GEN_VEXT_VV(vwadd_wv_w, 8)
1024 GEN_VEXT_VV(vwsub_wv_b, 2)
1025 GEN_VEXT_VV(vwsub_wv_h, 4)
1026 GEN_VEXT_VV(vwsub_wv_w, 8)
1027 
1028 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1029 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1030 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1031 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1032 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1033 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1034 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1035 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1036 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1037 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1038 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1039 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1040 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1041 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1042 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1043 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1044 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1045 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1046 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1047 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1048 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1049 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1050 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1051 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1052 GEN_VEXT_VX(vwaddu_vx_b, 2)
1053 GEN_VEXT_VX(vwaddu_vx_h, 4)
1054 GEN_VEXT_VX(vwaddu_vx_w, 8)
1055 GEN_VEXT_VX(vwsubu_vx_b, 2)
1056 GEN_VEXT_VX(vwsubu_vx_h, 4)
1057 GEN_VEXT_VX(vwsubu_vx_w, 8)
1058 GEN_VEXT_VX(vwadd_vx_b, 2)
1059 GEN_VEXT_VX(vwadd_vx_h, 4)
1060 GEN_VEXT_VX(vwadd_vx_w, 8)
1061 GEN_VEXT_VX(vwsub_vx_b, 2)
1062 GEN_VEXT_VX(vwsub_vx_h, 4)
1063 GEN_VEXT_VX(vwsub_vx_w, 8)
1064 GEN_VEXT_VX(vwaddu_wx_b, 2)
1065 GEN_VEXT_VX(vwaddu_wx_h, 4)
1066 GEN_VEXT_VX(vwaddu_wx_w, 8)
1067 GEN_VEXT_VX(vwsubu_wx_b, 2)
1068 GEN_VEXT_VX(vwsubu_wx_h, 4)
1069 GEN_VEXT_VX(vwsubu_wx_w, 8)
1070 GEN_VEXT_VX(vwadd_wx_b, 2)
1071 GEN_VEXT_VX(vwadd_wx_h, 4)
1072 GEN_VEXT_VX(vwadd_wx_w, 8)
1073 GEN_VEXT_VX(vwsub_wx_b, 2)
1074 GEN_VEXT_VX(vwsub_wx_h, 4)
1075 GEN_VEXT_VX(vwsub_wx_w, 8)
1076 
1077 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1078 #define DO_VADC(N, M, C) (N + M + C)
1079 #define DO_VSBC(N, M, C) (N - M - C)
1080 
1081 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1082 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1083                   CPURISCVState *env, uint32_t desc)          \
1084 {                                                             \
1085     uint32_t vl = env->vl;                                    \
1086     uint32_t esz = sizeof(ETYPE);                             \
1087     uint32_t total_elems =                                    \
1088         vext_get_total_elems(env, desc, esz);                 \
1089     uint32_t vta = vext_vta(desc);                            \
1090     uint32_t i;                                               \
1091                                                               \
1092     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1093                                                               \
1094     for (i = env->vstart; i < vl; i++) {                      \
1095         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1096         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1097         ETYPE carry = vext_elem_mask(v0, i);                  \
1098                                                               \
1099         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1100     }                                                         \
1101     env->vstart = 0;                                          \
1102     /* set tail elements to 1s */                             \
1103     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1104 }
1105 
1106 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1107 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1108 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1109 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1110 
1111 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1112 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1113 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1114 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1115 
1116 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1117 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1118                   CPURISCVState *env, uint32_t desc)                     \
1119 {                                                                        \
1120     uint32_t vl = env->vl;                                               \
1121     uint32_t esz = sizeof(ETYPE);                                        \
1122     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1123     uint32_t vta = vext_vta(desc);                                       \
1124     uint32_t i;                                                          \
1125                                                                          \
1126     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1127                                                                          \
1128     for (i = env->vstart; i < vl; i++) {                                 \
1129         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1130         ETYPE carry = vext_elem_mask(v0, i);                             \
1131                                                                          \
1132         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1133     }                                                                    \
1134     env->vstart = 0;                                                     \
1135     /* set tail elements to 1s */                                        \
1136     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1137 }
1138 
1139 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1140 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1141 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1142 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1143 
1144 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1145 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1146 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1147 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1148 
1149 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1150                           (__typeof(N))(N + M) < N)
1151 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
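/*
 * DO_MADC computes the carry-out of N + M + C entirely in ETYPE arithmetic:
 * e.g. for uint8_t N = 200, M = 100, C = 0, (uint8_t)(N + M) = 44 < 200, so
 * the carry-out is 1.  DO_MSBC likewise yields the borrow-out of N - M - C.
 */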
1152 
1153 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1154 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1155                   CPURISCVState *env, uint32_t desc)          \
1156 {                                                             \
1157     uint32_t vl = env->vl;                                    \
1158     uint32_t vm = vext_vm(desc);                              \
1159     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1160     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1161     uint32_t i;                                               \
1162                                                               \
1163     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1164                                                               \
1165     for (i = env->vstart; i < vl; i++) {                      \
1166         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1167         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1168         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1169         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1170     }                                                         \
1171     env->vstart = 0;                                          \
1172     /*
1173      * the mask destination register is always tail-agnostic;
1174      * set tail elements to 1s
1175      */                                                       \
1176     if (vta_all_1s) {                                         \
1177         for (; i < total_elems; i++) {                        \
1178             vext_set_elem_mask(vd, i, 1);                     \
1179         }                                                     \
1180     }                                                         \
1181 }
1182 
1183 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1184 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1185 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1186 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1187 
1188 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1189 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1190 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1191 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1192 
1193 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1194 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1195                   void *vs2, CPURISCVState *env, uint32_t desc) \
1196 {                                                               \
1197     uint32_t vl = env->vl;                                      \
1198     uint32_t vm = vext_vm(desc);                                \
1199     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1200     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1201     uint32_t i;                                                 \
1202                                                                 \
1203     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1204                                                                 \
1205     for (i = env->vstart; i < vl; i++) {                        \
1206         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1207         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1208         vext_set_elem_mask(vd, i,                               \
1209                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1210     }                                                           \
1211     env->vstart = 0;                                            \
1212     /*
1213      * the mask destination register is always tail-agnostic;
1214      * set tail elements to 1s
1215      */                                                         \
1216     if (vta_all_1s) {                                           \
1217         for (; i < total_elems; i++) {                          \
1218             vext_set_elem_mask(vd, i, 1);                       \
1219         }                                                       \
1220     }                                                           \
1221 }
1222 
1223 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1224 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1225 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1227 
1228 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1229 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1230 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1232 
1233 /* Vector Bitwise Logical Instructions */
1234 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1235 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1236 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1237 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1238 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1239 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1240 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1241 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1242 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1243 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1244 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1245 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1246 GEN_VEXT_VV(vand_vv_b, 1)
1247 GEN_VEXT_VV(vand_vv_h, 2)
1248 GEN_VEXT_VV(vand_vv_w, 4)
1249 GEN_VEXT_VV(vand_vv_d, 8)
1250 GEN_VEXT_VV(vor_vv_b, 1)
1251 GEN_VEXT_VV(vor_vv_h, 2)
1252 GEN_VEXT_VV(vor_vv_w, 4)
1253 GEN_VEXT_VV(vor_vv_d, 8)
1254 GEN_VEXT_VV(vxor_vv_b, 1)
1255 GEN_VEXT_VV(vxor_vv_h, 2)
1256 GEN_VEXT_VV(vxor_vv_w, 4)
1257 GEN_VEXT_VV(vxor_vv_d, 8)
1258 
1259 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1260 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1261 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1262 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1263 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1264 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1265 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1266 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1267 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1268 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1269 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1270 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1271 GEN_VEXT_VX(vand_vx_b, 1)
1272 GEN_VEXT_VX(vand_vx_h, 2)
1273 GEN_VEXT_VX(vand_vx_w, 4)
1274 GEN_VEXT_VX(vand_vx_d, 8)
1275 GEN_VEXT_VX(vor_vx_b, 1)
1276 GEN_VEXT_VX(vor_vx_h, 2)
1277 GEN_VEXT_VX(vor_vx_w, 4)
1278 GEN_VEXT_VX(vor_vx_d, 8)
1279 GEN_VEXT_VX(vxor_vx_b, 1)
1280 GEN_VEXT_VX(vxor_vx_h, 2)
1281 GEN_VEXT_VX(vxor_vx_w, 4)
1282 GEN_VEXT_VX(vxor_vx_d, 8)
1283 
1284 /* Vector Single-Width Bit Shift Instructions */
1285 #define DO_SLL(N, M)  (N << (M))
1286 #define DO_SRL(N, M)  (N >> (M))
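/*
 * Note: there is no separate arithmetic-shift macro; the vsra_* helpers
 * below reuse DO_SRL with a signed source type, relying on the compiler's
 * arithmetic right shift for signed values.  The MASK argument of the
 * generators truncates the shift amount so that only the low log2(SEW)
 * bits are used.
 */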
1287 
1288 /* generate the helpers for shift instructions with two vector operands */
1289 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1290 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1291                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1292 {                                                                         \
1293     uint32_t vm = vext_vm(desc);                                          \
1294     uint32_t vl = env->vl;                                                \
1295     uint32_t esz = sizeof(TS1);                                           \
1296     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1297     uint32_t vta = vext_vta(desc);                                        \
1298     uint32_t vma = vext_vma(desc);                                        \
1299     uint32_t i;                                                           \
1300                                                                           \
1301     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1302                                                                           \
1303     for (i = env->vstart; i < vl; i++) {                                  \
1304         if (!vm && !vext_elem_mask(v0, i)) {                              \
1305             /* set masked-off elements to 1s */                           \
1306             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1307             continue;                                                     \
1308         }                                                                 \
1309         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1310         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1311         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1312     }                                                                     \
1313     env->vstart = 0;                                                      \
1314     /* set tail elements to 1s */                                         \
1315     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1316 }
1317 
1318 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1319 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1320 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1321 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1322 
1323 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1324 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1325 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1326 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1327 
1328 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1329 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1330 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1331 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1332 
1333 /*
1334  * generate the helpers for shift instructions with one vector and one scalar operand
1335  */
1336 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1337 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1338                   void *vs2, CPURISCVState *env,            \
1339                   uint32_t desc)                            \
1340 {                                                           \
1341     uint32_t vm = vext_vm(desc);                            \
1342     uint32_t vl = env->vl;                                  \
1343     uint32_t esz = sizeof(TD);                              \
1344     uint32_t total_elems =                                  \
1345         vext_get_total_elems(env, desc, esz);               \
1346     uint32_t vta = vext_vta(desc);                          \
1347     uint32_t vma = vext_vma(desc);                          \
1348     uint32_t i;                                             \
1349                                                             \
1350     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1351                                                             \
1352     for (i = env->vstart; i < vl; i++) {                    \
1353         if (!vm && !vext_elem_mask(v0, i)) {                \
1354             /* set masked-off elements to 1s */             \
1355             vext_set_elems_1s(vd, vma, i * esz,             \
1356                               (i + 1) * esz);               \
1357             continue;                                       \
1358         }                                                   \
1359         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1360         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1361     }                                                       \
1362     env->vstart = 0;                                        \
1363     /* set tail elements to 1s */                           \
1364     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1365 }
1366 
1367 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1368 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1369 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1370 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1371 
1372 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1373 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1374 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1375 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1376 
1377 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1378 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1379 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1380 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1381 
1382 /* Vector Narrowing Integer Right Shift Instructions */
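/*
 * For the narrowing shifts the source vs2 is 2*SEW wide, so the shift
 * amount is masked to log2(2*SEW) bits and the shifted result is
 * truncated to SEW when written back to vd.
 */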
1383 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1384 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1385 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1386 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1387 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1388 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1389 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1390 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1391 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1392 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1393 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1394 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1395 
1396 /* Vector Integer Comparison Instructions */
1397 #define DO_MSEQ(N, M) (N == M)
1398 #define DO_MSNE(N, M) (N != M)
1399 #define DO_MSLT(N, M) (N < M)
1400 #define DO_MSLE(N, M) (N <= M)
1401 #define DO_MSGT(N, M) (N > M)
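/*
 * The comparisons are evaluated as DO_OP(s2, s1), i.e. vs2 OP vs1 for the
 * .vv forms and vs2 OP rs1 for the .vx forms, and each result is written
 * as a single mask bit of vd.  DO_MSGT is only instantiated for the .vx
 * forms, since vmsgt/vmsgtu only exist in vector-scalar form.
 */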
1402 
1403 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1404 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1405                   CPURISCVState *env, uint32_t desc)          \
1406 {                                                             \
1407     uint32_t vm = vext_vm(desc);                              \
1408     uint32_t vl = env->vl;                                    \
1409     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1410     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1411     uint32_t vma = vext_vma(desc);                            \
1412     uint32_t i;                                               \
1413                                                               \
1414     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1415                                                               \
1416     for (i = env->vstart; i < vl; i++) {                      \
1417         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1418         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1419         if (!vm && !vext_elem_mask(v0, i)) {                  \
1420             /* set masked-off elements to 1s */               \
1421             if (vma) {                                        \
1422                 vext_set_elem_mask(vd, i, 1);                 \
1423             }                                                 \
1424             continue;                                         \
1425         }                                                     \
1426         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1427     }                                                         \
1428     env->vstart = 0;                                          \
1429     /*
1430      * mask destination registers are always tail-agnostic
1431      * set tail elements to 1s
1432      */                                                       \
1433     if (vta_all_1s) {                                         \
1434         for (; i < total_elems; i++) {                        \
1435             vext_set_elem_mask(vd, i, 1);                     \
1436         }                                                     \
1437     }                                                         \
1438 }
1439 
1440 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1441 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1442 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1443 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1444 
1445 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1446 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1447 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1448 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1449 
1450 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1451 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1452 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1453 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1454 
1455 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1456 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1457 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1458 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1459 
1460 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1461 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1462 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1463 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1464 
1465 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1466 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1467 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1468 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1469 
1470 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1471 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1472                   CPURISCVState *env, uint32_t desc)                \
1473 {                                                                   \
1474     uint32_t vm = vext_vm(desc);                                    \
1475     uint32_t vl = env->vl;                                          \
1476     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1477     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1478     uint32_t vma = vext_vma(desc);                                  \
1479     uint32_t i;                                                     \
1480                                                                     \
1481     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1482                                                                     \
1483     for (i = env->vstart; i < vl; i++) {                            \
1484         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1485         if (!vm && !vext_elem_mask(v0, i)) {                        \
1486             /* set masked-off elements to 1s */                     \
1487             if (vma) {                                              \
1488                 vext_set_elem_mask(vd, i, 1);                       \
1489             }                                                       \
1490             continue;                                               \
1491         }                                                           \
1492         vext_set_elem_mask(vd, i,                                   \
1493                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1494     }                                                               \
1495     env->vstart = 0;                                                \
1496     /*
1497      * mask destination registers are always tail-agnostic
1498      * set tail elements to 1s
1499      */                                                             \
1500     if (vta_all_1s) {                                               \
1501         for (; i < total_elems; i++) {                              \
1502             vext_set_elem_mask(vd, i, 1);                           \
1503         }                                                           \
1504     }                                                               \
1505 }
1506 
1507 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1508 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1509 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1510 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1511 
1512 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1513 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1514 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1515 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1516 
1517 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1518 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1519 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1520 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1521 
1522 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1523 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1524 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1525 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1526 
1527 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1528 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1529 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1530 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1531 
1532 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1533 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1534 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1535 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1536 
1537 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1538 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1539 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1540 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1541 
1542 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1543 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1544 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1545 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1546 
1547 /* Vector Integer Min/Max Instructions */
1548 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1549 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1550 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1551 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1552 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1553 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1554 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1555 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1556 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1557 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1558 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1559 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1560 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1561 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1562 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1563 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1564 GEN_VEXT_VV(vminu_vv_b, 1)
1565 GEN_VEXT_VV(vminu_vv_h, 2)
1566 GEN_VEXT_VV(vminu_vv_w, 4)
1567 GEN_VEXT_VV(vminu_vv_d, 8)
1568 GEN_VEXT_VV(vmin_vv_b, 1)
1569 GEN_VEXT_VV(vmin_vv_h, 2)
1570 GEN_VEXT_VV(vmin_vv_w, 4)
1571 GEN_VEXT_VV(vmin_vv_d, 8)
1572 GEN_VEXT_VV(vmaxu_vv_b, 1)
1573 GEN_VEXT_VV(vmaxu_vv_h, 2)
1574 GEN_VEXT_VV(vmaxu_vv_w, 4)
1575 GEN_VEXT_VV(vmaxu_vv_d, 8)
1576 GEN_VEXT_VV(vmax_vv_b, 1)
1577 GEN_VEXT_VV(vmax_vv_h, 2)
1578 GEN_VEXT_VV(vmax_vv_w, 4)
1579 GEN_VEXT_VV(vmax_vv_d, 8)
1580 
1581 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1582 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1583 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1584 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1585 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1586 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1587 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1588 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1589 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1590 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1591 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1592 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1593 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1594 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1595 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1596 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1597 GEN_VEXT_VX(vminu_vx_b, 1)
1598 GEN_VEXT_VX(vminu_vx_h, 2)
1599 GEN_VEXT_VX(vminu_vx_w, 4)
1600 GEN_VEXT_VX(vminu_vx_d, 8)
1601 GEN_VEXT_VX(vmin_vx_b, 1)
1602 GEN_VEXT_VX(vmin_vx_h, 2)
1603 GEN_VEXT_VX(vmin_vx_w, 4)
1604 GEN_VEXT_VX(vmin_vx_d, 8)
1605 GEN_VEXT_VX(vmaxu_vx_b, 1)
1606 GEN_VEXT_VX(vmaxu_vx_h, 2)
1607 GEN_VEXT_VX(vmaxu_vx_w, 4)
1608 GEN_VEXT_VX(vmaxu_vx_d, 8)
1609 GEN_VEXT_VX(vmax_vx_b, 1)
1610 GEN_VEXT_VX(vmax_vx_h, 2)
1611 GEN_VEXT_VX(vmax_vx_w, 4)
1612 GEN_VEXT_VX(vmax_vx_d, 8)
1613 
1614 /* Vector Single-Width Integer Multiply Instructions */
1615 #define DO_MUL(N, M) (N * M)
1616 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1617 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1618 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1619 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1620 GEN_VEXT_VV(vmul_vv_b, 1)
1621 GEN_VEXT_VV(vmul_vv_h, 2)
1622 GEN_VEXT_VV(vmul_vv_w, 4)
1623 GEN_VEXT_VV(vmul_vv_d, 8)
1624 
1625 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1626 {
1627     return (int16_t)s2 * (int16_t)s1 >> 8;
1628 }
1629 
1630 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1631 {
1632     return (int32_t)s2 * (int32_t)s1 >> 16;
1633 }
1634 
1635 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1636 {
1637     return (int64_t)s2 * (int64_t)s1 >> 32;
1638 }
1639 
1640 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1641 {
1642     uint64_t hi_64, lo_64;
1643 
1644     muls64(&lo_64, &hi_64, s1, s2);
1645     return hi_64;
1646 }
1647 
1648 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1649 {
1650     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1651 }
1652 
1653 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1654 {
1655     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1656 }
1657 
1658 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1659 {
1660     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1661 }
1662 
1663 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1664 {
1665     uint64_t hi_64, lo_64;
1666 
1667     mulu64(&lo_64, &hi_64, s2, s1);
1668     return hi_64;
1669 }
1670 
1671 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1672 {
1673     return (int16_t)s2 * (uint16_t)s1 >> 8;
1674 }
1675 
1676 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1677 {
1678     return (int32_t)s2 * (uint32_t)s1 >> 16;
1679 }
1680 
1681 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1682 {
1683     return (int64_t)s2 * (uint64_t)s1 >> 32;
1684 }
1685 
1686 /*
1687  * Let  A = signed operand,
1688  *      B = unsigned operand
1689  *      P = mulu64(A, B), unsigned product
1690  *
1691  * LET  X = 2 ** 64  - A, 2's complement of A
1692  *      SP = signed product
1693  * THEN
1694  *      IF A < 0
1695  *          SP = -X * B
1696  *             = -(2 ** 64 - A) * B
1697  *             = A * B - 2 ** 64 * B
1698  *             = P - 2 ** 64 * B
1699  *      ELSE
1700  *          SP = P
1701  * THEN
1702  *      HI_P -= (A < 0 ? B : 0)
1703  */
1704 
1705 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1706 {
1707     uint64_t hi_64, lo_64;
1708 
1709     mulu64(&lo_64, &hi_64, s2, s1);
1710 
1711     hi_64 -= s2 < 0 ? s1 : 0;
1712     return hi_64;
1713 }
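/*
 * For example, at 8-bit width: s2 = -2 (bits 0xFE), s1 = 3.  The unsigned
 * product is 0xFE * 3 = 0x02FA, so HI = 0x02; subtracting s1 because
 * s2 < 0 gives 0x02 - 3 = -1 = 0xFF, the correct high byte of -6.
 */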
1714 
1715 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1716 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1717 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1718 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1719 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1720 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1721 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1722 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1723 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1724 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1725 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1726 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1727 GEN_VEXT_VV(vmulh_vv_b, 1)
1728 GEN_VEXT_VV(vmulh_vv_h, 2)
1729 GEN_VEXT_VV(vmulh_vv_w, 4)
1730 GEN_VEXT_VV(vmulh_vv_d, 8)
1731 GEN_VEXT_VV(vmulhu_vv_b, 1)
1732 GEN_VEXT_VV(vmulhu_vv_h, 2)
1733 GEN_VEXT_VV(vmulhu_vv_w, 4)
1734 GEN_VEXT_VV(vmulhu_vv_d, 8)
1735 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1736 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1737 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1738 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1739 
1740 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1741 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1742 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1743 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1744 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1745 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1746 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1747 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1748 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1749 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1750 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1751 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1752 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1753 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1754 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1755 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1756 GEN_VEXT_VX(vmul_vx_b, 1)
1757 GEN_VEXT_VX(vmul_vx_h, 2)
1758 GEN_VEXT_VX(vmul_vx_w, 4)
1759 GEN_VEXT_VX(vmul_vx_d, 8)
1760 GEN_VEXT_VX(vmulh_vx_b, 1)
1761 GEN_VEXT_VX(vmulh_vx_h, 2)
1762 GEN_VEXT_VX(vmulh_vx_w, 4)
1763 GEN_VEXT_VX(vmulh_vx_d, 8)
1764 GEN_VEXT_VX(vmulhu_vx_b, 1)
1765 GEN_VEXT_VX(vmulhu_vx_h, 2)
1766 GEN_VEXT_VX(vmulhu_vx_w, 4)
1767 GEN_VEXT_VX(vmulhu_vx_d, 8)
1768 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1769 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1770 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1771 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1772 
1773 /* Vector Integer Divide Instructions */
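/*
 * Division by zero yields all ones for the quotient and returns the
 * dividend for the remainder; signed overflow (the most negative value
 * divided by -1) yields the dividend with remainder 0.  (N == -N) is
 * true only for the most negative value of the type, so it detects the
 * overflow case without naming the type's minimum explicitly.
 */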
1774 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1775 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1776 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1777         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1778 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1779         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1780 
1781 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1782 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1783 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1784 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1785 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1786 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1787 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1788 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1789 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1790 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1791 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1792 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1793 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1794 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1795 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1796 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1797 GEN_VEXT_VV(vdivu_vv_b, 1)
1798 GEN_VEXT_VV(vdivu_vv_h, 2)
1799 GEN_VEXT_VV(vdivu_vv_w, 4)
1800 GEN_VEXT_VV(vdivu_vv_d, 8)
1801 GEN_VEXT_VV(vdiv_vv_b, 1)
1802 GEN_VEXT_VV(vdiv_vv_h, 2)
1803 GEN_VEXT_VV(vdiv_vv_w, 4)
1804 GEN_VEXT_VV(vdiv_vv_d, 8)
1805 GEN_VEXT_VV(vremu_vv_b, 1)
1806 GEN_VEXT_VV(vremu_vv_h, 2)
1807 GEN_VEXT_VV(vremu_vv_w, 4)
1808 GEN_VEXT_VV(vremu_vv_d, 8)
1809 GEN_VEXT_VV(vrem_vv_b, 1)
1810 GEN_VEXT_VV(vrem_vv_h, 2)
1811 GEN_VEXT_VV(vrem_vv_w, 4)
1812 GEN_VEXT_VV(vrem_vv_d, 8)
1813 
1814 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1815 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1816 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1817 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1818 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1819 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1820 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1821 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1822 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1823 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1824 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1825 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1826 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1827 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1828 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1829 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1830 GEN_VEXT_VX(vdivu_vx_b, 1)
1831 GEN_VEXT_VX(vdivu_vx_h, 2)
1832 GEN_VEXT_VX(vdivu_vx_w, 4)
1833 GEN_VEXT_VX(vdivu_vx_d, 8)
1834 GEN_VEXT_VX(vdiv_vx_b, 1)
1835 GEN_VEXT_VX(vdiv_vx_h, 2)
1836 GEN_VEXT_VX(vdiv_vx_w, 4)
1837 GEN_VEXT_VX(vdiv_vx_d, 8)
1838 GEN_VEXT_VX(vremu_vx_b, 1)
1839 GEN_VEXT_VX(vremu_vx_h, 2)
1840 GEN_VEXT_VX(vremu_vx_w, 4)
1841 GEN_VEXT_VX(vremu_vx_d, 8)
1842 GEN_VEXT_VX(vrem_vx_b, 1)
1843 GEN_VEXT_VX(vrem_vx_h, 2)
1844 GEN_VEXT_VX(vrem_vx_w, 4)
1845 GEN_VEXT_VX(vrem_vx_d, 8)
1846 
1847 /* Vector Widening Integer Multiply Instructions */
1848 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1849 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1850 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1851 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1852 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1853 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1854 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1855 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1856 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1857 GEN_VEXT_VV(vwmul_vv_b, 2)
1858 GEN_VEXT_VV(vwmul_vv_h, 4)
1859 GEN_VEXT_VV(vwmul_vv_w, 8)
1860 GEN_VEXT_VV(vwmulu_vv_b, 2)
1861 GEN_VEXT_VV(vwmulu_vv_h, 4)
1862 GEN_VEXT_VV(vwmulu_vv_w, 8)
1863 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1864 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1865 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1866 
1867 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1868 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1869 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1870 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1871 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1872 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1873 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1874 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1875 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1876 GEN_VEXT_VX(vwmul_vx_b, 2)
1877 GEN_VEXT_VX(vwmul_vx_h, 4)
1878 GEN_VEXT_VX(vwmul_vx_w, 8)
1879 GEN_VEXT_VX(vwmulu_vx_b, 2)
1880 GEN_VEXT_VX(vwmulu_vx_h, 4)
1881 GEN_VEXT_VX(vwmulu_vx_w, 8)
1882 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1883 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1884 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1885 
1886 /* Vector Single-Width Integer Multiply-Add Instructions */
1887 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1888 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1889 {                                                                  \
1890     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1891     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1892     TD d = *((TD *)vd + HD(i));                                    \
1893     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1894 }
1895 
1896 #define DO_MACC(N, M, D) (M * N + D)
1897 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1898 #define DO_MADD(N, M, D) (M * D + N)
1899 #define DO_NMSUB(N, M, D) (-(M * D) + N)
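/*
 * MACC/NMSAC accumulate into the destination: vd = +/-(vs1 * vs2) + vd,
 * while MADD/NMSUB overwrite it: vd = +/-(vs1 * vd) + vs2.
 */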
1900 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1901 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1902 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1903 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1904 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1905 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1906 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1907 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1908 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1909 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1910 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1911 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1912 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1913 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1914 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1915 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1916 GEN_VEXT_VV(vmacc_vv_b, 1)
1917 GEN_VEXT_VV(vmacc_vv_h, 2)
1918 GEN_VEXT_VV(vmacc_vv_w, 4)
1919 GEN_VEXT_VV(vmacc_vv_d, 8)
1920 GEN_VEXT_VV(vnmsac_vv_b, 1)
1921 GEN_VEXT_VV(vnmsac_vv_h, 2)
1922 GEN_VEXT_VV(vnmsac_vv_w, 4)
1923 GEN_VEXT_VV(vnmsac_vv_d, 8)
1924 GEN_VEXT_VV(vmadd_vv_b, 1)
1925 GEN_VEXT_VV(vmadd_vv_h, 2)
1926 GEN_VEXT_VV(vmadd_vv_w, 4)
1927 GEN_VEXT_VV(vmadd_vv_d, 8)
1928 GEN_VEXT_VV(vnmsub_vv_b, 1)
1929 GEN_VEXT_VV(vnmsub_vv_h, 2)
1930 GEN_VEXT_VV(vnmsub_vv_w, 4)
1931 GEN_VEXT_VV(vnmsub_vv_d, 8)
1932 
1933 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1934 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1935 {                                                                   \
1936     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1937     TD d = *((TD *)vd + HD(i));                                     \
1938     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1939 }
1940 
1941 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1942 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1943 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1944 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1945 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1946 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1947 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1948 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1949 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1950 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1951 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1952 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1953 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1954 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1955 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1956 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1957 GEN_VEXT_VX(vmacc_vx_b, 1)
1958 GEN_VEXT_VX(vmacc_vx_h, 2)
1959 GEN_VEXT_VX(vmacc_vx_w, 4)
1960 GEN_VEXT_VX(vmacc_vx_d, 8)
1961 GEN_VEXT_VX(vnmsac_vx_b, 1)
1962 GEN_VEXT_VX(vnmsac_vx_h, 2)
1963 GEN_VEXT_VX(vnmsac_vx_w, 4)
1964 GEN_VEXT_VX(vnmsac_vx_d, 8)
1965 GEN_VEXT_VX(vmadd_vx_b, 1)
1966 GEN_VEXT_VX(vmadd_vx_h, 2)
1967 GEN_VEXT_VX(vmadd_vx_w, 4)
1968 GEN_VEXT_VX(vmadd_vx_d, 8)
1969 GEN_VEXT_VX(vnmsub_vx_b, 1)
1970 GEN_VEXT_VX(vnmsub_vx_h, 2)
1971 GEN_VEXT_VX(vnmsub_vx_w, 4)
1972 GEN_VEXT_VX(vnmsub_vx_d, 8)
1973 
1974 /* Vector Widening Integer Multiply-Add Instructions */
1975 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1976 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1977 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1978 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1979 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1980 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1981 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1982 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1983 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1984 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1985 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1986 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1987 GEN_VEXT_VV(vwmacc_vv_b, 2)
1988 GEN_VEXT_VV(vwmacc_vv_h, 4)
1989 GEN_VEXT_VV(vwmacc_vv_w, 8)
1990 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1991 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1992 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1993 
1994 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1995 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1996 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1997 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1998 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1999 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2000 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2001 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2002 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2003 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2004 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2005 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2006 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2007 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2008 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2009 GEN_VEXT_VX(vwmacc_vx_b, 2)
2010 GEN_VEXT_VX(vwmacc_vx_h, 4)
2011 GEN_VEXT_VX(vwmacc_vx_w, 8)
2012 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2013 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2014 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2015 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2016 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2017 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2018 
2019 /* Vector Integer Merge and Move Instructions */
2020 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2021 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2022                   uint32_t desc)                                     \
2023 {                                                                    \
2024     uint32_t vl = env->vl;                                           \
2025     uint32_t esz = sizeof(ETYPE);                                    \
2026     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2027     uint32_t vta = vext_vta(desc);                                   \
2028     uint32_t i;                                                      \
2029                                                                      \
2030     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2031                                                                      \
2032     for (i = env->vstart; i < vl; i++) {                             \
2033         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2034         *((ETYPE *)vd + H(i)) = s1;                                  \
2035     }                                                                \
2036     env->vstart = 0;                                                 \
2037     /* set tail elements to 1s */                                    \
2038     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2039 }
2040 
2041 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2042 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2043 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2044 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2045 
2046 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2047 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2048                   uint32_t desc)                                     \
2049 {                                                                    \
2050     uint32_t vl = env->vl;                                           \
2051     uint32_t esz = sizeof(ETYPE);                                    \
2052     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2053     uint32_t vta = vext_vta(desc);                                   \
2054     uint32_t i;                                                      \
2055                                                                      \
2056     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2057                                                                      \
2058     for (i = env->vstart; i < vl; i++) {                             \
2059         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2060     }                                                                \
2061     env->vstart = 0;                                                 \
2062     /* set tail elements to 1s */                                    \
2063     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2064 }
2065 
2066 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2067 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2068 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2069 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2070 
2071 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2072 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2073                   CPURISCVState *env, uint32_t desc)                 \
2074 {                                                                    \
2075     uint32_t vl = env->vl;                                           \
2076     uint32_t esz = sizeof(ETYPE);                                    \
2077     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2078     uint32_t vta = vext_vta(desc);                                   \
2079     uint32_t i;                                                      \
2080                                                                      \
2081     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2082                                                                      \
2083     for (i = env->vstart; i < vl; i++) {                             \
2084         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2085         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2086     }                                                                \
2087     env->vstart = 0;                                                 \
2088     /* set tail elements to 1s */                                    \
2089     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2090 }
2091 
2092 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2093 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2094 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2095 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2096 
2097 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2098 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2099                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2100 {                                                                    \
2101     uint32_t vl = env->vl;                                           \
2102     uint32_t esz = sizeof(ETYPE);                                    \
2103     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2104     uint32_t vta = vext_vta(desc);                                   \
2105     uint32_t i;                                                      \
2106                                                                      \
2107     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2108                                                                      \
2109     for (i = env->vstart; i < vl; i++) {                             \
2110         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2111         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2112                    (ETYPE)(target_long)s1);                          \
2113         *((ETYPE *)vd + H(i)) = d;                                   \
2114     }                                                                \
2115     env->vstart = 0;                                                 \
2116     /* set tail elements to 1s */                                    \
2117     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2118 }
2119 
2120 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2121 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2122 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2123 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2124 
2125 /*
2126  * Vector Fixed-Point Arithmetic Instructions
2127  */
2128 
2129 /* Vector Single-Width Saturating Add and Subtract */
2130 
2131 /*
2132  * Fixed-point instructions have a rounding mode and may saturate, so
2133  * define the common macros for fixed point here.
2134  */
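/*
 * The rounding mode (vxrm) is threaded through to every per-element
 * function, even for operations such as saturating add/subtract that
 * ignore it, so a single set of generators covers all fixed-point ops.
 */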
2135 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2136                           CPURISCVState *env, int vxrm);
2137 
2138 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2139 static inline void                                                  \
2140 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2141           CPURISCVState *env, int vxrm)                             \
2142 {                                                                   \
2143     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2144     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2145     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2146 }
2147 
2148 static inline void
2149 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2150              CPURISCVState *env,
2151              uint32_t vl, uint32_t vm, int vxrm,
2152              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2153 {
2154     for (uint32_t i = env->vstart; i < vl; i++) {
2155         if (!vm && !vext_elem_mask(v0, i)) {
2156             /* set masked-off elements to 1s */
2157             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2158             continue;
2159         }
2160         fn(vd, vs1, vs2, i, env, vxrm);
2161     }
2162     env->vstart = 0;
2163 }
2164 
2165 static inline void
2166 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2167              CPURISCVState *env,
2168              uint32_t desc,
2169              opivv2_rm_fn *fn, uint32_t esz)
2170 {
2171     uint32_t vm = vext_vm(desc);
2172     uint32_t vl = env->vl;
2173     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2174     uint32_t vta = vext_vta(desc);
2175     uint32_t vma = vext_vma(desc);
2176 
2177     VSTART_CHECK_EARLY_EXIT(env, vl);
2178 
2179     switch (env->vxrm) {
2180     case 0: /* rnu */
2181         vext_vv_rm_1(vd, v0, vs1, vs2,
2182                      env, vl, vm, 0, fn, vma, esz);
2183         break;
2184     case 1: /* rne */
2185         vext_vv_rm_1(vd, v0, vs1, vs2,
2186                      env, vl, vm, 1, fn, vma, esz);
2187         break;
2188     case 2: /* rdn */
2189         vext_vv_rm_1(vd, v0, vs1, vs2,
2190                      env, vl, vm, 2, fn, vma, esz);
2191         break;
2192     default: /* rod */
2193         vext_vv_rm_1(vd, v0, vs1, vs2,
2194                      env, vl, vm, 3, fn, vma, esz);
2195         break;
2196     }
2197     /* set tail elements to 1s */
2198     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2199 }
2200 
2201 /* generate helpers for fixed point instructions with OPIVV format */
2202 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2203 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2204                   CPURISCVState *env, uint32_t desc)            \
2205 {                                                               \
2206     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2207                  do_##NAME, ESZ);                               \
2208 }
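/*
 * For example, GEN_VEXT_VV_RM(vsaddu_vv_b, 1) expands to:
 *
 *     void HELPER(vsaddu_vv_b)(void *vd, void *v0, void *vs1, void *vs2,
 *                              CPURISCVState *env, uint32_t desc)
 *     {
 *         vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *     }
 */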
2209 
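/*
 * Unsigned saturating add: on wrap-around the sum is smaller than either
 * operand, so "res < a" detects overflow; the result then saturates to
 * the type maximum and vxsat is set.
 */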
2210 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2211                              uint8_t b)
2212 {
2213     uint8_t res = a + b;
2214     if (res < a) {
2215         res = UINT8_MAX;
2216         env->vxsat = 0x1;
2217     }
2218     return res;
2219 }
2220 
2221 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2222                                uint16_t b)
2223 {
2224     uint16_t res = a + b;
2225     if (res < a) {
2226         res = UINT16_MAX;
2227         env->vxsat = 0x1;
2228     }
2229     return res;
2230 }
2231 
2232 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2233                                uint32_t b)
2234 {
2235     uint32_t res = a + b;
2236     if (res < a) {
2237         res = UINT32_MAX;
2238         env->vxsat = 0x1;
2239     }
2240     return res;
2241 }
2242 
2243 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2244                                uint64_t b)
2245 {
2246     uint64_t res = a + b;
2247     if (res < a) {
2248         res = UINT64_MAX;
2249         env->vxsat = 0x1;
2250     }
2251     return res;
2252 }
2253 
2254 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2255 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2256 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2257 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2258 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2259 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2260 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2261 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2262 
2263 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2264                           CPURISCVState *env, int vxrm);
2265 
2266 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2267 static inline void                                                  \
2268 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2269           CPURISCVState *env, int vxrm)                             \
2270 {                                                                   \
2271     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2272     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2273 }
2274 
2275 static inline void
2276 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2277              CPURISCVState *env,
2278              uint32_t vl, uint32_t vm, int vxrm,
2279              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2280 {
2281     for (uint32_t i = env->vstart; i < vl; i++) {
2282         if (!vm && !vext_elem_mask(v0, i)) {
2283             /* set masked-off elements to 1s */
2284             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2285             continue;
2286         }
2287         fn(vd, s1, vs2, i, env, vxrm);
2288     }
2289     env->vstart = 0;
2290 }
2291 
2292 static inline void
2293 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2294              CPURISCVState *env,
2295              uint32_t desc,
2296              opivx2_rm_fn *fn, uint32_t esz)
2297 {
2298     uint32_t vm = vext_vm(desc);
2299     uint32_t vl = env->vl;
2300     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2301     uint32_t vta = vext_vta(desc);
2302     uint32_t vma = vext_vma(desc);
2303 
2304     VSTART_CHECK_EARLY_EXIT(env, vl);
2305 
2306     switch (env->vxrm) {
2307     case 0: /* rnu */
2308         vext_vx_rm_1(vd, v0, s1, vs2,
2309                      env, vl, vm, 0, fn, vma, esz);
2310         break;
2311     case 1: /* rne */
2312         vext_vx_rm_1(vd, v0, s1, vs2,
2313                      env, vl, vm, 1, fn, vma, esz);
2314         break;
2315     case 2: /* rdn */
2316         vext_vx_rm_1(vd, v0, s1, vs2,
2317                      env, vl, vm, 2, fn, vma, esz);
2318         break;
2319     default: /* rod */
2320         vext_vx_rm_1(vd, v0, s1, vs2,
2321                      env, vl, vm, 3, fn, vma, esz);
2322         break;
2323     }
2324     /* set tail elements to 1s */
2325     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2326 }
2327 
2328 /* generate helpers for fixed point instructions with OPIVX format */
2329 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2330 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2331                   void *vs2, CPURISCVState *env,          \
2332                   uint32_t desc)                          \
2333 {                                                         \
2334     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2335                  do_##NAME, ESZ);                         \
2336 }
2337 
2338 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2339 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2340 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2341 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2342 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2343 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2344 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2345 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2346 
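/*
 * Signed saturating add: overflow occurred iff the result's sign differs
 * from the sign of both operands, which is what the expression
 * "(res ^ a) & (res ^ b) & INT_MIN" tests; saturate towards the sign of
 * the operands.
 */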
2347 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2348 {
2349     int8_t res = a + b;
2350     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2351         res = a > 0 ? INT8_MAX : INT8_MIN;
2352         env->vxsat = 0x1;
2353     }
2354     return res;
2355 }
2356 
2357 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2358                              int16_t b)
2359 {
2360     int16_t res = a + b;
2361     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2362         res = a > 0 ? INT16_MAX : INT16_MIN;
2363         env->vxsat = 0x1;
2364     }
2365     return res;
2366 }
2367 
2368 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2369                              int32_t b)
2370 {
2371     int32_t res = a + b;
2372     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2373         res = a > 0 ? INT32_MAX : INT32_MIN;
2374         env->vxsat = 0x1;
2375     }
2376     return res;
2377 }
2378 
2379 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2380                              int64_t b)
2381 {
2382     int64_t res = a + b;
2383     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2384         res = a > 0 ? INT64_MAX : INT64_MIN;
2385         env->vxsat = 0x1;
2386     }
2387     return res;
2388 }
2389 
2390 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2391 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2392 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2393 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2394 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2395 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2396 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2397 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2398 
2399 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2400 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2401 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2402 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2403 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2404 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2405 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2406 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2407 
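/*
 * Unsigned saturating subtract: a borrow makes the result larger than
 * the minuend, so "res > a" detects underflow and the result clamps to 0.
 */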
2408 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2409                              uint8_t b)
2410 {
2411     uint8_t res = a - b;
2412     if (res > a) {
2413         res = 0;
2414         env->vxsat = 0x1;
2415     }
2416     return res;
2417 }
2418 
2419 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2420                                uint16_t b)
2421 {
2422     uint16_t res = a - b;
2423     if (res > a) {
2424         res = 0;
2425         env->vxsat = 0x1;
2426     }
2427     return res;
2428 }
2429 
2430 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2431                                uint32_t b)
2432 {
2433     uint32_t res = a - b;
2434     if (res > a) {
2435         res = 0;
2436         env->vxsat = 0x1;
2437     }
2438     return res;
2439 }
2440 
2441 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2442                                uint64_t b)
2443 {
2444     uint64_t res = a - b;
2445     if (res > a) {
2446         res = 0;
2447         env->vxsat = 0x1;
2448     }
2449     return res;
2450 }
2451 
2452 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2453 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2454 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2455 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2456 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2457 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2458 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2459 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2460 
2461 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2462 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2463 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2464 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2465 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2466 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2467 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2468 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2469 
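/*
 * Signed saturating subtract: overflow requires operands of opposite
 * sign and a result whose sign differs from a's, hence the
 * (res ^ a) & (a ^ b) & INT8_MIN test.  Saturation follows the sign
 * of a, e.g. ssub8(env, vxrm, -128, 1) wraps to 127, overflows, and
 * saturates to INT8_MIN with vxsat set.
 */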
2470 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2471 {
2472     int8_t res = a - b;
2473     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2474         res = a >= 0 ? INT8_MAX : INT8_MIN;
2475         env->vxsat = 0x1;
2476     }
2477     return res;
2478 }
2479 
2480 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2481                              int16_t b)
2482 {
2483     int16_t res = a - b;
2484     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2485         res = a >= 0 ? INT16_MAX : INT16_MIN;
2486         env->vxsat = 0x1;
2487     }
2488     return res;
2489 }
2490 
2491 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2492                              int32_t b)
2493 {
2494     int32_t res = a - b;
2495     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2496         res = a >= 0 ? INT32_MAX : INT32_MIN;
2497         env->vxsat = 0x1;
2498     }
2499     return res;
2500 }
2501 
2502 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2503                              int64_t b)
2504 {
2505     int64_t res = a - b;
2506     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2507         res = a >= 0 ? INT64_MAX : INT64_MIN;
2508         env->vxsat = 0x1;
2509     }
2510     return res;
2511 }
2512 
2513 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2514 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2515 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2516 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2517 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2518 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2519 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2520 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2521 
2522 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2523 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2524 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2525 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2526 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2527 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2528 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2529 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2530 
2531 /* Vector Single-Width Averaging Add and Subtract */
2532 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2533 {
2534     uint8_t d = extract64(v, shift, 1);
2535     uint8_t d1;
2536     uint64_t D1, D2;
2537 
2538     if (shift == 0 || shift > 64) {
2539         return 0;
2540     }
2541 
2542     d1 = extract64(v, shift - 1, 1);
2543     D1 = extract64(v, 0, shift);
2544     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2545         return d1;
2546     } else if (vxrm == 1) { /* round-to-nearest-even */
2547         if (shift > 1) {
2548             D2 = extract64(v, 0, shift - 1);
2549             return d1 & ((D2 != 0) | d);
2550         } else {
2551             return d1 & d;
2552         }
2553     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2554         return !d & (D1 != 0);
2555     }
2556     return 0; /* round-down (truncate) */
2557 }
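/*
 * Worked example: get_round() returns the increment to add after the
 * low `shift` bits of v are discarded.  With v = 0b0110 and shift = 2
 * (true value 1.5): d (new LSB) = 1, d1 (MSB of the discarded bits)
 * = 1, D1 (all discarded bits) = 0b10, D2 = 0, so
 *   rnu (0): d1                   = 1  ->  (v >> 2) + 1 = 2
 *   rne (1): d1 & ((D2 != 0) | d) = 1  ->  2
 *   rdn (2): 0                         ->  1
 *   rod (3): !d & (D1 != 0)       = 0  ->  1 (LSB already odd)
 */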
2558 
2559 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2560                              int32_t b)
2561 {
2562     int64_t res = (int64_t)a + b;
2563     uint8_t round = get_round(vxrm, res, 1);
2564 
2565     return (res >> 1) + round;
2566 }
2567 
2568 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2569                              int64_t b)
2570 {
2571     int64_t res = a + b;
2572     uint8_t round = get_round(vxrm, res, 1);
2573     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2574 
2575     /* With signed overflow, bit 64 is inverse of bit 63. */
2576     return ((res >> 1) ^ over) + round;
2577 }
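/*
 * Example of the correction above: aadd64(env, 0, INT64_MAX, INT64_MAX)
 * wraps res to -2, so over = INT64_MIN; (res >> 1) ^ over restores the
 * true bit 63 of the 65-bit sum, giving INT64_MAX, and round (bit 0 of
 * res) is 0, so the average is INT64_MAX as expected.
 */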
2578 
2579 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2580 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2581 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2582 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2583 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2584 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2585 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2586 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2587 
2588 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2589 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2590 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2591 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2592 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2593 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2594 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2595 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2596 
2597 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2598                                uint32_t a, uint32_t b)
2599 {
2600     uint64_t res = (uint64_t)a + b;
2601     uint8_t round = get_round(vxrm, res, 1);
2602 
2603     return (res >> 1) + round;
2604 }
2605 
2606 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2607                                uint64_t a, uint64_t b)
2608 {
2609     uint64_t res = a + b;
2610     uint8_t round = get_round(vxrm, res, 1);
2611     uint64_t over = (uint64_t)(res < a) << 63;
2612 
2613     return ((res >> 1) | over) + round;
2614 }
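/*
 * In the unsigned case the carry out of bit 63 (res < a) is bit 64 of
 * the true 65-bit sum; shifting right by one moves it into bit 63,
 * hence the OR above.
 */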
2615 
2616 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2617 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2618 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2619 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2620 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2621 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2622 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2623 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2624 
2625 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2626 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2627 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2628 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2629 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2630 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2631 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2632 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2633 
2634 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2635                              int32_t b)
2636 {
2637     int64_t res = (int64_t)a - b;
2638     uint8_t round = get_round(vxrm, res, 1);
2639 
2640     return (res >> 1) + round;
2641 }
2642 
2643 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2644                              int64_t b)
2645 {
2646     int64_t res = (int64_t)a - b;
2647     uint8_t round = get_round(vxrm, res, 1);
2648     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2649 
2650     /* With signed overflow, bit 64 is inverse of bit 63. */
2651     return ((res >> 1) ^ over) + round;
2652 }
2653 
2654 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2655 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2656 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2657 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2658 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2659 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2660 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2661 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2662 
2663 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2664 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2665 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2666 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2667 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2668 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2669 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2670 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2671 
2672 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2673                                uint32_t a, uint32_t b)
2674 {
2675     int64_t res = (int64_t)a - b;
2676     uint8_t round = get_round(vxrm, res, 1);
2677 
2678     return (res >> 1) + round;
2679 }
2680 
2681 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2682                                uint64_t a, uint64_t b)
2683 {
2684     uint64_t res = (uint64_t)a - b;
2685     uint8_t round = get_round(vxrm, res, 1);
2686     uint64_t over = (uint64_t)(res > a) << 63;
2687 
2688     return ((res >> 1) | over) + round;
2689 }
2690 
2691 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2692 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2693 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2694 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2695 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2696 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2697 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2698 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2699 
2700 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2701 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2702 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2703 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2704 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2705 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2706 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2707 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2708 
2709 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
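/*
 * The operands are treated as fixed-point fractions with SEW-1
 * fractional bits, so the double-width product carries 2*(SEW-1)
 * fractional bits and is shifted right by SEW-1 (with vxrm rounding)
 * to return to the source format.  The only product that cannot be
 * represented after the shift is (-1.0) * (-1.0), i.e.
 * INT_MIN * INT_MIN, which saturates to INT_MAX with vxsat set.
 */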
2710 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2711 {
2712     uint8_t round;
2713     int16_t res;
2714 
2715     res = (int16_t)a * (int16_t)b;
2716     round = get_round(vxrm, res, 7);
2717     res = (res >> 7) + round;
2718 
2719     if (res > INT8_MAX) {
2720         env->vxsat = 0x1;
2721         return INT8_MAX;
2722     } else if (res < INT8_MIN) {
2723         env->vxsat = 0x1;
2724         return INT8_MIN;
2725     } else {
2726         return res;
2727     }
2728 }
2729 
2730 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2731 {
2732     uint8_t round;
2733     int32_t res;
2734 
2735     res = (int32_t)a * (int32_t)b;
2736     round = get_round(vxrm, res, 15);
2737     res = (res >> 15) + round;
2738 
2739     if (res > INT16_MAX) {
2740         env->vxsat = 0x1;
2741         return INT16_MAX;
2742     } else if (res < INT16_MIN) {
2743         env->vxsat = 0x1;
2744         return INT16_MIN;
2745     } else {
2746         return res;
2747     }
2748 }
2749 
2750 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2751 {
2752     uint8_t round;
2753     int64_t res;
2754 
2755     res = (int64_t)a * (int64_t)b;
2756     round = get_round(vxrm, res, 31);
2757     res = (res >> 31) + round;
2758 
2759     if (res > INT32_MAX) {
2760         env->vxsat = 0x1;
2761         return INT32_MAX;
2762     } else if (res < INT32_MIN) {
2763         env->vxsat = 0x1;
2764         return INT32_MIN;
2765     } else {
2766         return res;
2767     }
2768 }
2769 
2770 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2771 {
2772     uint8_t round;
2773     uint64_t hi_64, lo_64;
2774     int64_t res;
2775 
2776     if (a == INT64_MIN && b == INT64_MIN) {
2777         env->vxsat = 1;
2778         return INT64_MAX;
2779     }
2780 
2781     muls64(&lo_64, &hi_64, a, b);
2782     round = get_round(vxrm, lo_64, 63);
2783     /*
2784      * Cannot overflow, as there are always
2785      * 2 sign bits after multiply.
2786      */
2787     res = (hi_64 << 1) | (lo_64 >> 63);
2788     if (round) {
2789         if (res == INT64_MAX) {
2790             env->vxsat = 1;
2791         } else {
2792             res += 1;
2793         }
2794     }
2795     return res;
2796 }
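/*
 * Here the full 128-bit product comes from muls64();
 * (hi_64 << 1) | (lo_64 >> 63) is that product shifted right by 63,
 * and the rounding increment is applied separately so it can be
 * checked for overflow against INT64_MAX.
 */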
2797 
2798 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2799 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2800 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2801 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2802 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2803 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2804 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2805 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2806 
2807 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2808 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2809 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2810 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2811 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2812 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2813 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2814 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2815 
2816 /* Vector Single-Width Scaling Shift Instructions */
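/*
 * The shift amount is masked to log2(SEW) bits and the shifted-out
 * bits feed back in through get_round(); no saturation is needed
 * since a right shift never widens the value.
 */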
2817 static inline uint8_t
2818 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2819 {
2820     uint8_t round, shift = b & 0x7;
2821     uint8_t res;
2822 
2823     round = get_round(vxrm, a, shift);
2824     res = (a >> shift) + round;
2825     return res;
2826 }
2827 static inline uint16_t
2828 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2829 {
2830     uint8_t round, shift = b & 0xf;
2831 
2832     round = get_round(vxrm, a, shift);
2833     return (a >> shift) + round;
2834 }
2835 static inline uint32_t
2836 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2837 {
2838     uint8_t round, shift = b & 0x1f;
2839 
2840     round = get_round(vxrm, a, shift);
2841     return (a >> shift) + round;
2842 }
2843 static inline uint64_t
2844 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2845 {
2846     uint8_t round, shift = b & 0x3f;
2847 
2848     round = get_round(vxrm, a, shift);
2849     return (a >> shift) + round;
2850 }
2851 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2852 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2853 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2854 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2855 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2856 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2857 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2858 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2859 
2860 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2861 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2862 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2863 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2864 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2865 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2866 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2867 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2868 
2869 static inline int8_t
2870 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2871 {
2872     uint8_t round, shift = b & 0x7;
2873 
2874     round = get_round(vxrm, a, shift);
2875     return (a >> shift) + round;
2876 }
2877 static inline int16_t
2878 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2879 {
2880     uint8_t round, shift = b & 0xf;
2881 
2882     round = get_round(vxrm, a, shift);
2883     return (a >> shift) + round;
2884 }
2885 static inline int32_t
2886 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2887 {
2888     uint8_t round, shift = b & 0x1f;
2889 
2890     round = get_round(vxrm, a, shift);
2891     return (a >> shift) + round;
2892 }
2893 static inline int64_t
2894 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2895 {
2896     uint8_t round, shift = b & 0x3f;
2897 
2898     round = get_round(vxrm, a, shift);
2899     return (a >> shift) + round;
2900 }
2901 
2902 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2903 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2904 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2905 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2906 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2907 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2908 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2909 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2910 
2911 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2912 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2913 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2914 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2915 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2916 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2917 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2918 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2919 
2920 /* Vector Narrowing Fixed-Point Clip Instructions */
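/*
 * These take a 2*SEW-wide source, shift it right by up to 2*SEW-1
 * bits with vxrm rounding, then saturate the result into the SEW-wide
 * destination, setting vxsat when it does not fit.
 */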
2921 static inline int8_t
2922 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2923 {
2924     uint8_t round, shift = b & 0xf;
2925     int16_t res;
2926 
2927     round = get_round(vxrm, a, shift);
2928     res = (a >> shift) + round;
2929     if (res > INT8_MAX) {
2930         env->vxsat = 0x1;
2931         return INT8_MAX;
2932     } else if (res < INT8_MIN) {
2933         env->vxsat = 0x1;
2934         return INT8_MIN;
2935     } else {
2936         return res;
2937     }
2938 }
2939 
2940 static inline int16_t
2941 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2942 {
2943     uint8_t round, shift = b & 0x1f;
2944     int32_t res;
2945 
2946     round = get_round(vxrm, a, shift);
2947     res = (a >> shift) + round;
2948     if (res > INT16_MAX) {
2949         env->vxsat = 0x1;
2950         return INT16_MAX;
2951     } else if (res < INT16_MIN) {
2952         env->vxsat = 0x1;
2953         return INT16_MIN;
2954     } else {
2955         return res;
2956     }
2957 }
2958 
2959 static inline int32_t
2960 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2961 {
2962     uint8_t round, shift = b & 0x3f;
2963     int64_t res;
2964 
2965     round = get_round(vxrm, a, shift);
2966     res = (a >> shift) + round;
2967     if (res > INT32_MAX) {
2968         env->vxsat = 0x1;
2969         return INT32_MAX;
2970     } else if (res < INT32_MIN) {
2971         env->vxsat = 0x1;
2972         return INT32_MIN;
2973     } else {
2974         return res;
2975     }
2976 }
2977 
2978 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2979 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2980 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2981 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2982 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2983 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2984 
2985 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2986 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2987 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2988 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2989 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2990 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2991 
2992 static inline uint8_t
2993 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2994 {
2995     uint8_t round, shift = b & 0xf;
2996     uint16_t res;
2997 
2998     round = get_round(vxrm, a, shift);
2999     res = (a >> shift) + round;
3000     if (res > UINT8_MAX) {
3001         env->vxsat = 0x1;
3002         return UINT8_MAX;
3003     } else {
3004         return res;
3005     }
3006 }
3007 
3008 static inline uint16_t
3009 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3010 {
3011     uint8_t round, shift = b & 0x1f;
3012     uint32_t res;
3013 
3014     round = get_round(vxrm, a, shift);
3015     res = (a >> shift) + round;
3016     if (res > UINT16_MAX) {
3017         env->vxsat = 0x1;
3018         return UINT16_MAX;
3019     } else {
3020         return res;
3021     }
3022 }
3023 
3024 static inline uint32_t
3025 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3026 {
3027     uint8_t round, shift = b & 0x3f;
3028     uint64_t res;
3029 
3030     round = get_round(vxrm, a, shift);
3031     res = (a >> shift) + round;
3032     if (res > UINT32_MAX) {
3033         env->vxsat = 0x1;
3034         return UINT32_MAX;
3035     } else {
3036         return res;
3037     }
3038 }
3039 
3040 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3041 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3042 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3043 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3044 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3045 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3046 
3047 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3048 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3049 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3050 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3051 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3052 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3053 
3054 /*
3055  * Vector Floating-Point Arithmetic Instructions
3056  */
3057 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3058 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3059 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3060                       CPURISCVState *env)                      \
3061 {                                                              \
3062     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3063     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3064     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3065 }
3066 
3067 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3068 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3069                   void *vs2, CPURISCVState *env,          \
3070                   uint32_t desc)                          \
3071 {                                                         \
3072     uint32_t vm = vext_vm(desc);                          \
3073     uint32_t vl = env->vl;                                \
3074     uint32_t total_elems =                                \
3075         vext_get_total_elems(env, desc, ESZ);             \
3076     uint32_t vta = vext_vta(desc);                        \
3077     uint32_t vma = vext_vma(desc);                        \
3078     uint32_t i;                                           \
3079                                                           \
3080     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3081                                                           \
3082     for (i = env->vstart; i < vl; i++) {                  \
3083         if (!vm && !vext_elem_mask(v0, i)) {              \
3084             /* set masked-off elements to 1s */           \
3085             vext_set_elems_1s(vd, vma, i * ESZ,           \
3086                               (i + 1) * ESZ);             \
3087             continue;                                     \
3088         }                                                 \
3089         do_##NAME(vd, vs1, vs2, i, env);                  \
3090     }                                                     \
3091     env->vstart = 0;                                      \
3092     /* set tail elements to 1s */                         \
3093     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3094                       total_elems * ESZ);                 \
3095 }
3096 
3097 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3098 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3099 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3100 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3101 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3102 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
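/*
 * For reference, a sketch of what the two macros above generate for
 * vfadd_vv_h (assuming OP_UUU_H supplies uint16_t for every operand
 * type, as its name suggests):
 *
 *   static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                             CPURISCVState *env)
 *   {
 *       uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *       uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *       *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *   }
 *
 * HELPER(vfadd_vv_h) then walks elements vstart..vl-1, writing 1s over
 * masked-off elements when vma is set, calling do_vfadd_vv_h() for the
 * active ones, and finally applying the tail policy.
 */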
3103 
3104 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3105 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3106                       CPURISCVState *env)                      \
3107 {                                                              \
3108     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3109     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3110 }
3111 
3112 #define GEN_VEXT_VF(NAME, ESZ)                            \
3113 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3114                   void *vs2, CPURISCVState *env,          \
3115                   uint32_t desc)                          \
3116 {                                                         \
3117     uint32_t vm = vext_vm(desc);                          \
3118     uint32_t vl = env->vl;                                \
3119     uint32_t total_elems =                                \
3120         vext_get_total_elems(env, desc, ESZ);             \
3121     uint32_t vta = vext_vta(desc);                        \
3122     uint32_t vma = vext_vma(desc);                        \
3123     uint32_t i;                                           \
3124                                                           \
3125     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3126                                                           \
3127     for (i = env->vstart; i < vl; i++) {                  \
3128         if (!vm && !vext_elem_mask(v0, i)) {              \
3129             /* set masked-off elements to 1s */           \
3130             vext_set_elems_1s(vd, vma, i * ESZ,           \
3131                               (i + 1) * ESZ);             \
3132             continue;                                     \
3133         }                                                 \
3134         do_##NAME(vd, s1, vs2, i, env);                   \
3135     }                                                     \
3136     env->vstart = 0;                                      \
3137     /* set tail elements to 1s */                         \
3138     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3139                       total_elems * ESZ);                 \
3140 }
3141 
3142 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3143 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3144 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3145 GEN_VEXT_VF(vfadd_vf_h, 2)
3146 GEN_VEXT_VF(vfadd_vf_w, 4)
3147 GEN_VEXT_VF(vfadd_vf_d, 8)
3148 
3149 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3150 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3151 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3152 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3153 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3154 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3155 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3156 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3157 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3158 GEN_VEXT_VF(vfsub_vf_h, 2)
3159 GEN_VEXT_VF(vfsub_vf_w, 4)
3160 GEN_VEXT_VF(vfsub_vf_d, 8)
3161 
3162 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3163 {
3164     return float16_sub(b, a, s);
3165 }
3166 
3167 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3168 {
3169     return float32_sub(b, a, s);
3170 }
3171 
3172 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3173 {
3174     return float64_sub(b, a, s);
3175 }
3176 
3177 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3178 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3179 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3180 GEN_VEXT_VF(vfrsub_vf_h, 2)
3181 GEN_VEXT_VF(vfrsub_vf_w, 4)
3182 GEN_VEXT_VF(vfrsub_vf_d, 8)
3183 
3184 /* Vector Widening Floating-Point Add/Subtract Instructions */
3185 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3186 {
3187     return float32_add(float16_to_float32(a, true, s),
3188                        float16_to_float32(b, true, s), s);
3189 }
3190 
3191 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3192 {
3193     return float64_add(float32_to_float64(a, s),
3194                        float32_to_float64(b, s), s);
3195 
3196 }
3197 
3198 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3199 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3200 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3201 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3202 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3203 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3204 GEN_VEXT_VF(vfwadd_vf_h, 4)
3205 GEN_VEXT_VF(vfwadd_vf_w, 8)
3206 
3207 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3208 {
3209     return float32_sub(float16_to_float32(a, true, s),
3210                        float16_to_float32(b, true, s), s);
3211 }
3212 
3213 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3214 {
3215     return float64_sub(float32_to_float64(a, s),
3216                        float32_to_float64(b, s), s);
3217 
3218 }
3219 
3220 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3221 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3222 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3223 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3224 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3225 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3226 GEN_VEXT_VF(vfwsub_vf_h, 4)
3227 GEN_VEXT_VF(vfwsub_vf_w, 8)
3228 
3229 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3230 {
3231     return float32_add(a, float16_to_float32(b, true, s), s);
3232 }
3233 
3234 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3235 {
3236     return float64_add(a, float32_to_float64(b, s), s);
3237 }
3238 
3239 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3240 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3241 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3242 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3243 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3244 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3245 GEN_VEXT_VF(vfwadd_wf_h, 4)
3246 GEN_VEXT_VF(vfwadd_wf_w, 8)
3247 
3248 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3249 {
3250     return float32_sub(a, float16_to_float32(b, true, s), s);
3251 }
3252 
3253 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3254 {
3255     return float64_sub(a, float32_to_float64(b, s), s);
3256 }
3257 
3258 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3259 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3260 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3261 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3262 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3263 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3264 GEN_VEXT_VF(vfwsub_wf_h, 4)
3265 GEN_VEXT_VF(vfwsub_wf_w, 8)
3266 
3267 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3268 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3269 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3270 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3271 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3272 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3273 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3274 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3275 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3276 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3277 GEN_VEXT_VF(vfmul_vf_h, 2)
3278 GEN_VEXT_VF(vfmul_vf_w, 4)
3279 GEN_VEXT_VF(vfmul_vf_d, 8)
3280 
3281 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3282 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3283 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3284 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3285 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3286 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3287 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3288 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3289 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3290 GEN_VEXT_VF(vfdiv_vf_h, 2)
3291 GEN_VEXT_VF(vfdiv_vf_w, 4)
3292 GEN_VEXT_VF(vfdiv_vf_d, 8)
3293 
3294 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3295 {
3296     return float16_div(b, a, s);
3297 }
3298 
3299 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3300 {
3301     return float32_div(b, a, s);
3302 }
3303 
3304 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3305 {
3306     return float64_div(b, a, s);
3307 }
3308 
3309 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3310 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3311 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3312 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3313 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3314 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3315 
3316 /* Vector Widening Floating-Point Multiply */
3317 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3318 {
3319     return float32_mul(float16_to_float32(a, true, s),
3320                        float16_to_float32(b, true, s), s);
3321 }
3322 
3323 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3324 {
3325     return float64_mul(float32_to_float64(a, s),
3326                        float32_to_float64(b, s), s);
3327 
3328 }
3329 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3330 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3331 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3332 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3333 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3334 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3335 GEN_VEXT_VF(vfwmul_vf_h, 4)
3336 GEN_VEXT_VF(vfwmul_vf_w, 8)
3337 
3338 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3339 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3340 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3341                       CPURISCVState *env)                          \
3342 {                                                                  \
3343     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3344     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3345     TD d = *((TD *)vd + HD(i));                                    \
3346     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3347 }
3348 
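/*
 * OPFVV3/OPFVF3 pass (s2, s1, d) to OP.  The *macc/*msac helpers below
 * therefore multiply vs1 (or rs1) by vs2 and accumulate into vd, while
 * the *madd/*msub helpers further down multiply vd by vs1 and add vs2,
 * which they express by handing (d, b, a) to the softfloat muladd
 * routines; the n-prefixed forms negate the product and/or the addend
 * via the float_muladd_negate_* flags.
 */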
3349 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3350 {
3351     return float16_muladd(a, b, d, 0, s);
3352 }
3353 
3354 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3355 {
3356     return float32_muladd(a, b, d, 0, s);
3357 }
3358 
3359 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3360 {
3361     return float64_muladd(a, b, d, 0, s);
3362 }
3363 
3364 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3365 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3366 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3367 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3368 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3369 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3370 
3371 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3372 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3373                       CPURISCVState *env)                         \
3374 {                                                                 \
3375     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3376     TD d = *((TD *)vd + HD(i));                                   \
3377     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3378 }
3379 
3380 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3381 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3382 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3383 GEN_VEXT_VF(vfmacc_vf_h, 2)
3384 GEN_VEXT_VF(vfmacc_vf_w, 4)
3385 GEN_VEXT_VF(vfmacc_vf_d, 8)
3386 
3387 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3388 {
3389     return float16_muladd(a, b, d, float_muladd_negate_c |
3390                                    float_muladd_negate_product, s);
3391 }
3392 
3393 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3394 {
3395     return float32_muladd(a, b, d, float_muladd_negate_c |
3396                                    float_muladd_negate_product, s);
3397 }
3398 
3399 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3400 {
3401     return float64_muladd(a, b, d, float_muladd_negate_c |
3402                                    float_muladd_negate_product, s);
3403 }
3404 
3405 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3406 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3407 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3408 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3409 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3410 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3411 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3412 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3413 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3414 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3415 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3416 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3417 
3418 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3419 {
3420     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3421 }
3422 
3423 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3424 {
3425     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3426 }
3427 
3428 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3429 {
3430     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3431 }
3432 
3433 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3434 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3435 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3436 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3437 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3438 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3439 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3440 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3441 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3442 GEN_VEXT_VF(vfmsac_vf_h, 2)
3443 GEN_VEXT_VF(vfmsac_vf_w, 4)
3444 GEN_VEXT_VF(vfmsac_vf_d, 8)
3445 
3446 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3447 {
3448     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3449 }
3450 
3451 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3452 {
3453     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3454 }
3455 
3456 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3457 {
3458     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3459 }
3460 
3461 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3462 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3463 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3464 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3465 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3466 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3467 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3468 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3469 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3470 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3471 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3472 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3473 
3474 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3475 {
3476     return float16_muladd(d, b, a, 0, s);
3477 }
3478 
3479 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3480 {
3481     return float32_muladd(d, b, a, 0, s);
3482 }
3483 
3484 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3485 {
3486     return float64_muladd(d, b, a, 0, s);
3487 }
3488 
3489 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3490 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3491 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3492 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3493 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3494 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3495 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3496 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3497 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3498 GEN_VEXT_VF(vfmadd_vf_h, 2)
3499 GEN_VEXT_VF(vfmadd_vf_w, 4)
3500 GEN_VEXT_VF(vfmadd_vf_d, 8)
3501 
3502 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3503 {
3504     return float16_muladd(d, b, a, float_muladd_negate_c |
3505                                    float_muladd_negate_product, s);
3506 }
3507 
3508 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3509 {
3510     return float32_muladd(d, b, a, float_muladd_negate_c |
3511                                    float_muladd_negate_product, s);
3512 }
3513 
3514 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3515 {
3516     return float64_muladd(d, b, a, float_muladd_negate_c |
3517                                    float_muladd_negate_product, s);
3518 }
3519 
3520 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3521 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3522 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3523 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3524 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3525 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3526 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3527 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3528 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3529 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3530 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3531 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3532 
3533 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3534 {
3535     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3536 }
3537 
3538 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3539 {
3540     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3541 }
3542 
3543 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3544 {
3545     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3546 }
3547 
3548 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3549 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3550 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3551 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3552 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3553 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3554 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3555 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3556 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3557 GEN_VEXT_VF(vfmsub_vf_h, 2)
3558 GEN_VEXT_VF(vfmsub_vf_w, 4)
3559 GEN_VEXT_VF(vfmsub_vf_d, 8)
3560 
3561 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3562 {
3563     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3564 }
3565 
3566 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3567 {
3568     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3569 }
3570 
3571 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3572 {
3573     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3574 }
3575 
3576 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3577 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3578 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3579 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3580 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3581 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3582 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3583 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3584 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3585 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3586 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3587 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3588 
3589 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3590 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3591 {
3592     return float32_muladd(float16_to_float32(a, true, s),
3593                           float16_to_float32(b, true, s), d, 0, s);
3594 }
3595 
3596 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3597 {
3598     return float64_muladd(float32_to_float64(a, s),
3599                           float32_to_float64(b, s), d, 0, s);
3600 }
3601 
3602 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3603 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3604 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3605 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3606 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3607 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3608 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3609 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3610 
3611 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3612 {
3613     return float32_muladd(bfloat16_to_float32(a, s),
3614                           bfloat16_to_float32(b, s), d, 0, s);
3615 }
3616 
3617 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3618 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3619 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3620 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3621 
3622 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3623 {
3624     return float32_muladd(float16_to_float32(a, true, s),
3625                           float16_to_float32(b, true, s), d,
3626                           float_muladd_negate_c | float_muladd_negate_product,
3627                           s);
3628 }
3629 
3630 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3631 {
3632     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3633                           d, float_muladd_negate_c |
3634                              float_muladd_negate_product, s);
3635 }
3636 
3637 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3638 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3639 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3640 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3641 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3642 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3643 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3644 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3645 
3646 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3647 {
3648     return float32_muladd(float16_to_float32(a, true, s),
3649                           float16_to_float32(b, true, s), d,
3650                           float_muladd_negate_c, s);
3651 }
3652 
3653 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3654 {
3655     return float64_muladd(float32_to_float64(a, s),
3656                           float32_to_float64(b, s), d,
3657                           float_muladd_negate_c, s);
3658 }
3659 
3660 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3661 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3662 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3663 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3664 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3665 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3666 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3667 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3668 
3669 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3670 {
3671     return float32_muladd(float16_to_float32(a, true, s),
3672                           float16_to_float32(b, true, s), d,
3673                           float_muladd_negate_product, s);
3674 }
3675 
3676 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3677 {
3678     return float64_muladd(float32_to_float64(a, s),
3679                           float32_to_float64(b, s), d,
3680                           float_muladd_negate_product, s);
3681 }
3682 
3683 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3684 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3685 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3686 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3687 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3688 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3689 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3690 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3691 
3692 /* Vector Floating-Point Square-Root Instruction */
3693 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3694 static void do_##NAME(void *vd, void *vs2, int i,      \
3695                       CPURISCVState *env)              \
3696 {                                                      \
3697     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3698     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3699 }
3700 
3701 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3702 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3703                   CPURISCVState *env, uint32_t desc)   \
3704 {                                                      \
3705     uint32_t vm = vext_vm(desc);                       \
3706     uint32_t vl = env->vl;                             \
3707     uint32_t total_elems =                             \
3708         vext_get_total_elems(env, desc, ESZ);          \
3709     uint32_t vta = vext_vta(desc);                     \
3710     uint32_t vma = vext_vma(desc);                     \
3711     uint32_t i;                                        \
3712                                                        \
3713     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3714                                                        \
3715     if (vl == 0) {                                     \
3716         return;                                        \
3717     }                                                  \
3718     for (i = env->vstart; i < vl; i++) {               \
3719         if (!vm && !vext_elem_mask(v0, i)) {           \
3720             /* set masked-off elements to 1s */        \
3721             vext_set_elems_1s(vd, vma, i * ESZ,        \
3722                               (i + 1) * ESZ);          \
3723             continue;                                  \
3724         }                                              \
3725         do_##NAME(vd, vs2, i, env);                    \
3726     }                                                  \
3727     env->vstart = 0;                                   \
3728     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3729                       total_elems * ESZ);              \
3730 }
3731 
3732 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3733 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3734 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3735 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3736 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3737 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3738 
3739 /*
3740  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3741  *
3742  * Adapted from riscv-v-spec recip.c:
3743  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3744  */
3745 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3746 {
3747     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3748     uint64_t exp = extract64(f, frac_size, exp_size);
3749     uint64_t frac = extract64(f, 0, frac_size);
3750 
3751     const uint8_t lookup_table[] = {
3752         52, 51, 50, 48, 47, 46, 44, 43,
3753         42, 41, 40, 39, 38, 36, 35, 34,
3754         33, 32, 31, 30, 30, 29, 28, 27,
3755         26, 25, 24, 23, 23, 22, 21, 20,
3756         19, 19, 18, 17, 16, 16, 15, 14,
3757         14, 13, 12, 12, 11, 10, 10, 9,
3758         9, 8, 7, 7, 6, 6, 5, 4,
3759         4, 3, 3, 2, 2, 1, 1, 0,
3760         127, 125, 123, 121, 119, 118, 116, 114,
3761         113, 111, 109, 108, 106, 105, 103, 102,
3762         100, 99, 97, 96, 95, 93, 92, 91,
3763         90, 88, 87, 86, 85, 84, 83, 82,
3764         80, 79, 78, 77, 76, 75, 74, 73,
3765         72, 71, 70, 70, 69, 68, 67, 66,
3766         65, 64, 63, 63, 62, 61, 60, 59,
3767         59, 58, 57, 56, 56, 55, 54, 53
3768     };
3769     const int precision = 7;
3770 
3771     if (exp == 0 && frac != 0) { /* subnormal */
3772         /* Normalize the subnormal. */
3773         while (extract64(frac, frac_size - 1, 1) == 0) {
3774             exp--;
3775             frac <<= 1;
3776         }
3777 
3778         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3779     }
3780 
3781     int idx = ((exp & 1) << (precision - 1)) |
3782               (frac >> (frac_size - precision + 1));
3783     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3784                         (frac_size - precision);
3785     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3786 
3787     uint64_t val = 0;
3788     val = deposit64(val, 0, frac_size, out_frac);
3789     val = deposit64(val, frac_size, exp_size, out_exp);
3790     val = deposit64(val, frac_size + exp_size, 1, sign);
3791     return val;
3792 }
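
/*
 * Worked example (single precision, for illustration): frsqrt7(4.0f) has
 * sign = 0, exp = 129 and frac = 0, so idx = ((exp & 1) << 6) | 0 = 64 and
 * lookup_table[64] = 127.  Because ~exp wraps modulo 2^64, out_exp reduces
 * to (3 * 127 - 1 - exp) / 2 = (380 - 129) / 2 = 125, giving 0x3eff0000
 * (~0.498), a 7-bit estimate of 1/sqrt(4.0) = 0.5.
 */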
3793 
3794 static float16 frsqrt7_h(float16 f, float_status *s)
3795 {
3796     int exp_size = 5, frac_size = 10;
3797     bool sign = float16_is_neg(f);
3798 
3799     /*
3800      * frsqrt7(sNaN) = canonical NaN
3801      * frsqrt7(-inf) = canonical NaN
3802      * frsqrt7(-normal) = canonical NaN
3803      * frsqrt7(-subnormal) = canonical NaN
3804      */
3805     if (float16_is_signaling_nan(f, s) ||
3806         (float16_is_infinity(f) && sign) ||
3807         (float16_is_normal(f) && sign) ||
3808         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3809         s->float_exception_flags |= float_flag_invalid;
3810         return float16_default_nan(s);
3811     }
3812 
3813     /* frsqrt7(qNaN) = canonical NaN */
3814     if (float16_is_quiet_nan(f, s)) {
3815         return float16_default_nan(s);
3816     }
3817 
3818     /* frsqrt7(+-0) = +-inf */
3819     if (float16_is_zero(f)) {
3820         s->float_exception_flags |= float_flag_divbyzero;
3821         return float16_set_sign(float16_infinity, sign);
3822     }
3823 
3824     /* frsqrt7(+inf) = +0 */
3825     if (float16_is_infinity(f) && !sign) {
3826         return float16_set_sign(float16_zero, sign);
3827     }
3828 
3829     /* +normal, +subnormal */
3830     uint64_t val = frsqrt7(f, exp_size, frac_size);
3831     return make_float16(val);
3832 }
3833 
3834 static float32 frsqrt7_s(float32 f, float_status *s)
3835 {
3836     int exp_size = 8, frac_size = 23;
3837     bool sign = float32_is_neg(f);
3838 
3839     /*
3840      * frsqrt7(sNaN) = canonical NaN
3841      * frsqrt7(-inf) = canonical NaN
3842      * frsqrt7(-normal) = canonical NaN
3843      * frsqrt7(-subnormal) = canonical NaN
3844      */
3845     if (float32_is_signaling_nan(f, s) ||
3846         (float32_is_infinity(f) && sign) ||
3847         (float32_is_normal(f) && sign) ||
3848         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3849         s->float_exception_flags |= float_flag_invalid;
3850         return float32_default_nan(s);
3851     }
3852 
3853     /* frsqrt7(qNaN) = canonical NaN */
3854     if (float32_is_quiet_nan(f, s)) {
3855         return float32_default_nan(s);
3856     }
3857 
3858     /* frsqrt7(+-0) = +-inf */
3859     if (float32_is_zero(f)) {
3860         s->float_exception_flags |= float_flag_divbyzero;
3861         return float32_set_sign(float32_infinity, sign);
3862     }
3863 
3864     /* frsqrt7(+inf) = +0 */
3865     if (float32_is_infinity(f) && !sign) {
3866         return float32_set_sign(float32_zero, sign);
3867     }
3868 
3869     /* +normal, +subnormal */
3870     uint64_t val = frsqrt7(f, exp_size, frac_size);
3871     return make_float32(val);
3872 }
3873 
3874 static float64 frsqrt7_d(float64 f, float_status *s)
3875 {
3876     int exp_size = 11, frac_size = 52;
3877     bool sign = float64_is_neg(f);
3878 
3879     /*
3880      * frsqrt7(sNaN) = canonical NaN
3881      * frsqrt7(-inf) = canonical NaN
3882      * frsqrt7(-normal) = canonical NaN
3883      * frsqrt7(-subnormal) = canonical NaN
3884      */
3885     if (float64_is_signaling_nan(f, s) ||
3886         (float64_is_infinity(f) && sign) ||
3887         (float64_is_normal(f) && sign) ||
3888         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3889         s->float_exception_flags |= float_flag_invalid;
3890         return float64_default_nan(s);
3891     }
3892 
3893     /* frsqrt7(qNaN) = canonical NaN */
3894     if (float64_is_quiet_nan(f, s)) {
3895         return float64_default_nan(s);
3896     }
3897 
3898     /* frsqrt7(+-0) = +-inf */
3899     if (float64_is_zero(f)) {
3900         s->float_exception_flags |= float_flag_divbyzero;
3901         return float64_set_sign(float64_infinity, sign);
3902     }
3903 
3904     /* frsqrt7(+inf) = +0 */
3905     if (float64_is_infinity(f) && !sign) {
3906         return float64_set_sign(float64_zero, sign);
3907     }
3908 
3909     /* +normal, +subnormal */
3910     uint64_t val = frsqrt7(f, exp_size, frac_size);
3911     return make_float64(val);
3912 }
3913 
3914 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3915 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3916 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3917 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3918 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3919 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3920 
3921 /*
3922  * Vector Floating-Point Reciprocal Estimate Instruction
3923  *
3924  * Adapted from riscv-v-spec recip.c:
3925  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3926  */
3927 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3928                       float_status *s)
3929 {
3930     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3931     uint64_t exp = extract64(f, frac_size, exp_size);
3932     uint64_t frac = extract64(f, 0, frac_size);
3933 
3934     const uint8_t lookup_table[] = {
3935         127, 125, 123, 121, 119, 117, 116, 114,
3936         112, 110, 109, 107, 105, 104, 102, 100,
3937         99, 97, 96, 94, 93, 91, 90, 88,
3938         87, 85, 84, 83, 81, 80, 79, 77,
3939         76, 75, 74, 72, 71, 70, 69, 68,
3940         66, 65, 64, 63, 62, 61, 60, 59,
3941         58, 57, 56, 55, 54, 53, 52, 51,
3942         50, 49, 48, 47, 46, 45, 44, 43,
3943         42, 41, 40, 40, 39, 38, 37, 36,
3944         35, 35, 34, 33, 32, 31, 31, 30,
3945         29, 28, 28, 27, 26, 25, 25, 24,
3946         23, 23, 22, 21, 21, 20, 19, 19,
3947         18, 17, 17, 16, 15, 15, 14, 14,
3948         13, 12, 12, 11, 11, 10, 9, 9,
3949         8, 8, 7, 7, 6, 5, 5, 4,
3950         4, 3, 3, 2, 2, 1, 1, 0
3951     };
3952     const int precision = 7;
3953 
3954     if (exp == 0 && frac != 0) { /* subnormal */
3955         /* Normalize the subnormal. */
3956         while (extract64(frac, frac_size - 1, 1) == 0) {
3957             exp--;
3958             frac <<= 1;
3959         }
3960 
3961         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3962 
3963         if (exp != 0 && exp != UINT64_MAX) {
3964             /*
3965              * Overflow to inf or max value of same sign,
3966              * depending on sign and rounding mode.
3967              */
3968             s->float_exception_flags |= (float_flag_inexact |
3969                                          float_flag_overflow);
3970 
3971             if ((s->float_rounding_mode == float_round_to_zero) ||
3972                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3973                 ((s->float_rounding_mode == float_round_up) && sign)) {
3974                 /* Return greatest/negative finite value. */
3975                 return (sign << (exp_size + frac_size)) |
3976                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3977             } else {
3978                 /* Return +-inf. */
3979                 return (sign << (exp_size + frac_size)) |
3980                        MAKE_64BIT_MASK(frac_size, exp_size);
3981             }
3982         }
3983     }
3984 
3985     int idx = frac >> (frac_size - precision);
3986     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3987                         (frac_size - precision);
3988     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3989 
3990     if (out_exp == 0 || out_exp == UINT64_MAX) {
3991         /*
3992          * The result is subnormal, but don't raise the underflow exception,
3993          * because there's no additional loss of precision.
3994          */
3995         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3996         if (out_exp == UINT64_MAX) {
3997             out_frac >>= 1;
3998             out_exp = 0;
3999         }
4000     }
4001 
4002     uint64_t val = 0;
4003     val = deposit64(val, 0, frac_size, out_frac);
4004     val = deposit64(val, frac_size, exp_size, out_exp);
4005     val = deposit64(val, frac_size + exp_size, 1, sign);
4006     return val;
4007 }
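
/*
 * Worked example (single precision, for illustration): frec7(2.0f) has
 * exp = 128 and frac = 0, so idx = 0 and lookup_table[0] = 127.  With the
 * modulo-2^64 wrap of ~exp, out_exp = 2 * 127 - 1 - exp = 125, giving
 * 0x3eff0000 (~0.498) as the 7-bit estimate of 1/2.0.  When out_exp works
 * out to 0 or all-ones, the estimate is squeezed into a subnormal instead,
 * which is what the out_frac shifts above implement.
 */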
4008 
4009 static float16 frec7_h(float16 f, float_status *s)
4010 {
4011     int exp_size = 5, frac_size = 10;
4012     bool sign = float16_is_neg(f);
4013 
4014     /* frec7(+-inf) = +-0 */
4015     if (float16_is_infinity(f)) {
4016         return float16_set_sign(float16_zero, sign);
4017     }
4018 
4019     /* frec7(+-0) = +-inf */
4020     if (float16_is_zero(f)) {
4021         s->float_exception_flags |= float_flag_divbyzero;
4022         return float16_set_sign(float16_infinity, sign);
4023     }
4024 
4025     /* frec7(sNaN) = canonical NaN */
4026     if (float16_is_signaling_nan(f, s)) {
4027         s->float_exception_flags |= float_flag_invalid;
4028         return float16_default_nan(s);
4029     }
4030 
4031     /* frec7(qNaN) = canonical NaN */
4032     if (float16_is_quiet_nan(f, s)) {
4033         return float16_default_nan(s);
4034     }
4035 
4036     /* +-normal, +-subnormal */
4037     uint64_t val = frec7(f, exp_size, frac_size, s);
4038     return make_float16(val);
4039 }
4040 
4041 static float32 frec7_s(float32 f, float_status *s)
4042 {
4043     int exp_size = 8, frac_size = 23;
4044     bool sign = float32_is_neg(f);
4045 
4046     /* frec7(+-inf) = +-0 */
4047     if (float32_is_infinity(f)) {
4048         return float32_set_sign(float32_zero, sign);
4049     }
4050 
4051     /* frec7(+-0) = +-inf */
4052     if (float32_is_zero(f)) {
4053         s->float_exception_flags |= float_flag_divbyzero;
4054         return float32_set_sign(float32_infinity, sign);
4055     }
4056 
4057     /* frec7(sNaN) = canonical NaN */
4058     if (float32_is_signaling_nan(f, s)) {
4059         s->float_exception_flags |= float_flag_invalid;
4060         return float32_default_nan(s);
4061     }
4062 
4063     /* frec7(qNaN) = canonical NaN */
4064     if (float32_is_quiet_nan(f, s)) {
4065         return float32_default_nan(s);
4066     }
4067 
4068     /* +-normal, +-subnormal */
4069     uint64_t val = frec7(f, exp_size, frac_size, s);
4070     return make_float32(val);
4071 }
4072 
4073 static float64 frec7_d(float64 f, float_status *s)
4074 {
4075     int exp_size = 11, frac_size = 52;
4076     bool sign = float64_is_neg(f);
4077 
4078     /* frec7(+-inf) = +-0 */
4079     if (float64_is_infinity(f)) {
4080         return float64_set_sign(float64_zero, sign);
4081     }
4082 
4083     /* frec7(+-0) = +-inf */
4084     if (float64_is_zero(f)) {
4085         s->float_exception_flags |= float_flag_divbyzero;
4086         return float64_set_sign(float64_infinity, sign);
4087     }
4088 
4089     /* frec7(sNaN) = canonical NaN */
4090     if (float64_is_signaling_nan(f, s)) {
4091         s->float_exception_flags |= float_flag_invalid;
4092         return float64_default_nan(s);
4093     }
4094 
4095     /* frec7(qNaN) = canonical NaN */
4096     if (float64_is_quiet_nan(f, s)) {
4097         return float64_default_nan(s);
4098     }
4099 
4100     /* +-normal, +-subnormal */
4101     uint64_t val = frec7(f, exp_size, frac_size, s);
4102     return make_float64(val);
4103 }
4104 
4105 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4106 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4107 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4108 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4109 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4110 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4111 
4112 /* Vector Floating-Point MIN/MAX Instructions */
4113 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4114 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4115 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4116 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4117 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4118 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4119 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4120 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4121 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4122 GEN_VEXT_VF(vfmin_vf_h, 2)
4123 GEN_VEXT_VF(vfmin_vf_w, 4)
4124 GEN_VEXT_VF(vfmin_vf_d, 8)
4125 
4126 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4127 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4128 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4129 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4130 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4131 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4132 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4133 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4134 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4135 GEN_VEXT_VF(vfmax_vf_h, 2)
4136 GEN_VEXT_VF(vfmax_vf_w, 4)
4137 GEN_VEXT_VF(vfmax_vf_d, 8)
4138 
4139 /* Vector Floating-Point Sign-Injection Instructions */
4140 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4141 {
4142     return deposit64(b, 0, 15, a);
4143 }
4144 
4145 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4146 {
4147     return deposit64(b, 0, 31, a);
4148 }
4149 
4150 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4151 {
4152     return deposit64(b, 0, 63, a);
4153 }
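
/*
 * Each helper builds the result from b's sign bit and a's exponent/mantissa:
 * deposit64(b, 0, width - 1, a) keeps only the top (sign) bit of b.  The
 * fsgnjn variants below negate that sign via ~b, and the fsgnjx variants
 * xor the two signs via b ^ a.
 */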
4154 
4155 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4156 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4157 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4158 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4159 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4160 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4161 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4162 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4163 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4164 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4165 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4166 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4167 
4168 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4169 {
4170     return deposit64(~b, 0, 15, a);
4171 }
4172 
4173 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4174 {
4175     return deposit64(~b, 0, 31, a);
4176 }
4177 
4178 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4179 {
4180     return deposit64(~b, 0, 63, a);
4181 }
4182 
4183 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4184 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4185 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4186 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4187 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4188 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4189 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4190 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4191 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4192 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4193 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4194 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4195 
4196 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4197 {
4198     return deposit64(b ^ a, 0, 15, a);
4199 }
4200 
4201 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4202 {
4203     return deposit64(b ^ a, 0, 31, a);
4204 }
4205 
4206 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4207 {
4208     return deposit64(b ^ a, 0, 63, a);
4209 }
4210 
4211 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4212 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4213 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4214 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4215 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4216 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4217 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4218 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4219 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4220 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4221 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4222 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4223 
4224 /* Vector Floating-Point Compare Instructions */
4225 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4226 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4227                   CPURISCVState *env, uint32_t desc)          \
4228 {                                                             \
4229     uint32_t vm = vext_vm(desc);                              \
4230     uint32_t vl = env->vl;                                    \
4231     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4232     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4233     uint32_t vma = vext_vma(desc);                            \
4234     uint32_t i;                                               \
4235                                                               \
4236     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4237                                                               \
4238     for (i = env->vstart; i < vl; i++) {                      \
4239         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4240         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4241         if (!vm && !vext_elem_mask(v0, i)) {                  \
4242             /* set masked-off elements to 1s */               \
4243             if (vma) {                                        \
4244                 vext_set_elem_mask(vd, i, 1);                 \
4245             }                                                 \
4246             continue;                                         \
4247         }                                                     \
4248         vext_set_elem_mask(vd, i,                             \
4249                            DO_OP(s2, s1, &env->fp_status));   \
4250     }                                                         \
4251     env->vstart = 0;                                          \
4252     /*
4253      * mask destination register is always tail-agnostic
4254      * set tail elements to 1s
4255      */                                                       \
4256     if (vta_all_1s) {                                         \
4257         for (; i < total_elems; i++) {                        \
4258             vext_set_elem_mask(vd, i, 1);                     \
4259         }                                                     \
4260     }                                                         \
4261 }
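
/*
 * The compare helpers write one mask bit per element.  vmfeq/vmfne use the
 * quiet softfloat comparisons (invalid is raised only for signaling NaNs),
 * while vmflt/vmfle/vmfgt/vmfge below use the signaling ones, so any NaN
 * operand raises the invalid flag, matching the IEEE semantics required for
 * these compares.
 */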
4262 
4263 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4264 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4265 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4266 
4267 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4268 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4269                   CPURISCVState *env, uint32_t desc)                \
4270 {                                                                   \
4271     uint32_t vm = vext_vm(desc);                                    \
4272     uint32_t vl = env->vl;                                          \
4273     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4274     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4275     uint32_t vma = vext_vma(desc);                                  \
4276     uint32_t i;                                                     \
4277                                                                     \
4278     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4279                                                                     \
4280     for (i = env->vstart; i < vl; i++) {                            \
4281         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4282         if (!vm && !vext_elem_mask(v0, i)) {                        \
4283             /* set masked-off elements to 1s */                     \
4284             if (vma) {                                              \
4285                 vext_set_elem_mask(vd, i, 1);                       \
4286             }                                                       \
4287             continue;                                               \
4288         }                                                           \
4289         vext_set_elem_mask(vd, i,                                   \
4290                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4291     }                                                               \
4292     env->vstart = 0;                                                \
4293     /*
4294      * mask destination register is always tail-agnostic
4295      * set tail elements to 1s
4296      */                                                             \
4297     if (vta_all_1s) {                                               \
4298         for (; i < total_elems; i++) {                              \
4299             vext_set_elem_mask(vd, i, 1);                           \
4300         }                                                           \
4301     }                                                               \
4302 }
4303 
4304 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4305 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4306 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4307 
4308 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4309 {
4310     FloatRelation compare = float16_compare_quiet(a, b, s);
4311     return compare != float_relation_equal;
4312 }
4313 
4314 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4315 {
4316     FloatRelation compare = float32_compare_quiet(a, b, s);
4317     return compare != float_relation_equal;
4318 }
4319 
4320 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4321 {
4322     FloatRelation compare = float64_compare_quiet(a, b, s);
4323     return compare != float_relation_equal;
4324 }
4325 
4326 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4327 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4328 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4329 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4330 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4331 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4332 
4333 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4334 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4335 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4336 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4337 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4338 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4339 
4340 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4341 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4342 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4343 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4344 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4345 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4346 
4347 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4348 {
4349     FloatRelation compare = float16_compare(a, b, s);
4350     return compare == float_relation_greater;
4351 }
4352 
4353 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4354 {
4355     FloatRelation compare = float32_compare(a, b, s);
4356     return compare == float_relation_greater;
4357 }
4358 
4359 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4360 {
4361     FloatRelation compare = float64_compare(a, b, s);
4362     return compare == float_relation_greater;
4363 }
4364 
4365 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4366 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4367 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4368 
4369 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4370 {
4371     FloatRelation compare = float16_compare(a, b, s);
4372     return compare == float_relation_greater ||
4373            compare == float_relation_equal;
4374 }
4375 
4376 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4377 {
4378     FloatRelation compare = float32_compare(a, b, s);
4379     return compare == float_relation_greater ||
4380            compare == float_relation_equal;
4381 }
4382 
4383 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4384 {
4385     FloatRelation compare = float64_compare(a, b, s);
4386     return compare == float_relation_greater ||
4387            compare == float_relation_equal;
4388 }
4389 
4390 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4391 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4392 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4393 
4394 /* Vector Floating-Point Classify Instruction */
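/*
 * The fclass result is a 10-bit mask: bit 0 = -inf, 1 = -normal,
 * 2 = -subnormal, 3 = -0, 4 = +0, 5 = +subnormal, 6 = +normal, 7 = +inf,
 * 8 = signaling NaN, 9 = quiet NaN.
 */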
4395 target_ulong fclass_h(uint64_t frs1)
4396 {
4397     float16 f = frs1;
4398     bool sign = float16_is_neg(f);
4399 
4400     if (float16_is_infinity(f)) {
4401         return sign ? 1 << 0 : 1 << 7;
4402     } else if (float16_is_zero(f)) {
4403         return sign ? 1 << 3 : 1 << 4;
4404     } else if (float16_is_zero_or_denormal(f)) {
4405         return sign ? 1 << 2 : 1 << 5;
4406     } else if (float16_is_any_nan(f)) {
4407         float_status s = { }; /* for snan_bit_is_one */
4408         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4409     } else {
4410         return sign ? 1 << 1 : 1 << 6;
4411     }
4412 }
4413 
4414 target_ulong fclass_s(uint64_t frs1)
4415 {
4416     float32 f = frs1;
4417     bool sign = float32_is_neg(f);
4418 
4419     if (float32_is_infinity(f)) {
4420         return sign ? 1 << 0 : 1 << 7;
4421     } else if (float32_is_zero(f)) {
4422         return sign ? 1 << 3 : 1 << 4;
4423     } else if (float32_is_zero_or_denormal(f)) {
4424         return sign ? 1 << 2 : 1 << 5;
4425     } else if (float32_is_any_nan(f)) {
4426         float_status s = { }; /* for snan_bit_is_one */
4427         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4428     } else {
4429         return sign ? 1 << 1 : 1 << 6;
4430     }
4431 }
4432 
4433 target_ulong fclass_d(uint64_t frs1)
4434 {
4435     float64 f = frs1;
4436     bool sign = float64_is_neg(f);
4437 
4438     if (float64_is_infinity(f)) {
4439         return sign ? 1 << 0 : 1 << 7;
4440     } else if (float64_is_zero(f)) {
4441         return sign ? 1 << 3 : 1 << 4;
4442     } else if (float64_is_zero_or_denormal(f)) {
4443         return sign ? 1 << 2 : 1 << 5;
4444     } else if (float64_is_any_nan(f)) {
4445         float_status s = { }; /* for snan_bit_is_one */
4446         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4447     } else {
4448         return sign ? 1 << 1 : 1 << 6;
4449     }
4450 }
4451 
4452 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4453 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4454 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4455 GEN_VEXT_V(vfclass_v_h, 2)
4456 GEN_VEXT_V(vfclass_v_w, 4)
4457 GEN_VEXT_V(vfclass_v_d, 8)
4458 
4459 /* Vector Floating-Point Merge Instruction */
4460 
4461 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4462 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4463                   CPURISCVState *env, uint32_t desc)          \
4464 {                                                             \
4465     uint32_t vm = vext_vm(desc);                              \
4466     uint32_t vl = env->vl;                                    \
4467     uint32_t esz = sizeof(ETYPE);                             \
4468     uint32_t total_elems =                                    \
4469         vext_get_total_elems(env, desc, esz);                 \
4470     uint32_t vta = vext_vta(desc);                            \
4471     uint32_t i;                                               \
4472                                                               \
4473     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4474                                                               \
4475     for (i = env->vstart; i < vl; i++) {                      \
4476         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4477         *((ETYPE *)vd + H(i)) =                               \
4478             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4479     }                                                         \
4480     env->vstart = 0;                                          \
4481     /* set tail elements to 1s */                             \
4482     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4483 }
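
/*
 * Active elements receive the scalar s1, masked-off elements pass vs2[i]
 * through unchanged; e.g. with vl = 4 and mask bits {1, 0, 1, 0} the result
 * is {s1, vs2[1], s1, vs2[3]}.
 */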
4484 
4485 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4486 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4487 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4488 
4489 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4490 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4491 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4492 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4493 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4494 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4495 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4496 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4497 
4498 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4499 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4500 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4501 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4502 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4503 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4504 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4505 
4506 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4507 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4508 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4509 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4510 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4511 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4512 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4513 
4514 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4515 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4516 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4517 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4518 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4519 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4520 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4521 
4522 /* Widening Floating-Point/Integer Type-Convert Instructions */
4523 /* (TD, T2, TX2) */
4524 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4525 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4526 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4527 /*
4528  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4529  */
4530 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4531 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4532 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4533 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4534 
4535 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4536 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4537 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4538 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4539 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4540 
4541 /*
4542  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4543  */
4544 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4545 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4546 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4547 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4548 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4549 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4550 
4551 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4552 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4553 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4554 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4555 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4556 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4557 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4558 
4559 /*
4560  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4561  */
4562 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4563 {
4564     return float16_to_float32(a, true, s);
4565 }
4566 
4567 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4568 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4569 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4570 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4571 
4572 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4573 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4574 
4575 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4576 /* (TD, T2, TX2) */
4577 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4578 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4579 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4580 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4581 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4582 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4583 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4584 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4585 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4586 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4587 
4588 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4589 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4590 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4591 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4592 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4593 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4594 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4595 
4596 /*
4597  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4598  */
4599 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4600 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4601 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4602 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4603 
4604 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4605 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4606 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4607 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4608 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4609 
4610 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4611 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4612 {
4613     return float32_to_float16(a, true, s);
4614 }
4615 
4616 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4617 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4618 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4619 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4620 
4621 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4622 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4623 
4624 /*
4625  * Vector Reduction Operations
4626  */
4627 /* Vector Single-Width Integer Reduction Instructions */
4628 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4629 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4630                   void *vs2, CPURISCVState *env,          \
4631                   uint32_t desc)                          \
4632 {                                                         \
4633     uint32_t vm = vext_vm(desc);                          \
4634     uint32_t vl = env->vl;                                \
4635     uint32_t esz = sizeof(TD);                            \
4636     uint32_t vlenb = simd_maxsz(desc);                    \
4637     uint32_t vta = vext_vta(desc);                        \
4638     uint32_t i;                                           \
4639     TD s1 =  *((TD *)vs1 + HD(0));                        \
4640                                                           \
4641     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4642                                                           \
4643     for (i = env->vstart; i < vl; i++) {                  \
4644         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4645         if (!vm && !vext_elem_mask(v0, i)) {              \
4646             continue;                                     \
4647         }                                                 \
4648         s1 = OP(s1, (TD)s2);                              \
4649     }                                                     \
4650     if (vl > 0) {                                         \
4651         *((TD *)vd + HD(0)) = s1;                         \
4652     }                                                     \
4653     env->vstart = 0;                                      \
4654     /* set tail elements to 1s */                         \
4655     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4656 }
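
/*
 * Example: vredsum.vs with vl = 4, vs1[0] = 10 and vs2 = {1, 2, 3, 4} (all
 * elements active) writes vd[0] = 10 + 1 + 2 + 3 + 4 = 20; masked-off
 * elements are simply skipped, and vd[0] is left untouched when vl == 0.
 */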
4657 
4658 /* vd[0] = sum(vs1[0], vs2[*]) */
4659 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4660 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4661 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4662 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4663 
4664 /* vd[0] = maxu(vs1[0], vs2[*]) */
4665 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4666 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4667 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4668 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4669 
4670 /* vd[0] = max(vs1[0], vs2[*]) */
4671 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4672 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4673 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4674 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4675 
4676 /* vd[0] = minu(vs1[0], vs2[*]) */
4677 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4678 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4679 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4680 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4681 
4682 /* vd[0] = min(vs1[0], vs2[*]) */
4683 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4684 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4685 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4686 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4687 
4688 /* vd[0] = and(vs1[0], vs2[*]) */
4689 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4690 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4691 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4692 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4693 
4694 /* vd[0] = or(vs1[0], vs2[*]) */
4695 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4696 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4697 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4698 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4699 
4700 /* vd[0] = xor(vs1[0], vs2[*]) */
4701 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4702 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4703 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4704 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4705 
4706 /* Vector Widening Integer Reduction Instructions */
4707 /* signed sum reduction into double-width accumulator */
4708 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4709 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4710 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4711 
4712 /* Unsigned sum reduction into double-width accumulator */
4713 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4714 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4715 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4716 
4717 /* Vector Single-Width Floating-Point Reduction Instructions */
4718 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4719 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4720                   void *vs2, CPURISCVState *env,           \
4721                   uint32_t desc)                           \
4722 {                                                          \
4723     uint32_t vm = vext_vm(desc);                           \
4724     uint32_t vl = env->vl;                                 \
4725     uint32_t esz = sizeof(TD);                             \
4726     uint32_t vlenb = simd_maxsz(desc);                     \
4727     uint32_t vta = vext_vta(desc);                         \
4728     uint32_t i;                                            \
4729     TD s1 =  *((TD *)vs1 + HD(0));                         \
4730                                                            \
4731     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4732                                                            \
4733     for (i = env->vstart; i < vl; i++) {                   \
4734         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4735         if (!vm && !vext_elem_mask(v0, i)) {               \
4736             continue;                                      \
4737         }                                                  \
4738         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4739     }                                                      \
4740     if (vl > 0) {                                          \
4741         *((TD *)vd + HD(0)) = s1;                          \
4742     }                                                      \
4743     env->vstart = 0;                                       \
4744     /* set tail elements to 1s */                          \
4745     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4746 }
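
/*
 * Note that vfredusum and vfredosum are generated from the same macro with
 * the same add function, i.e. the "unordered" sum is also evaluated in
 * element order here; the association order of the unordered reduction is
 * left unspecified, so this is a valid implementation choice.
 */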
4747 
4748 /* Unordered sum */
4749 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4750 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4751 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4752 
4753 /* Ordered sum */
4754 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4755 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4756 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4757 
4758 /* Maximum value */
4759 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4760               float16_maximum_number)
4761 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4762               float32_maximum_number)
4763 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4764               float64_maximum_number)
4765 
4766 /* Minimum value */
4767 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4768               float16_minimum_number)
4769 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4770               float32_minimum_number)
4771 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4772               float64_minimum_number)
4773 
4774 /* Vector Widening Floating-Point Add Instructions */
4775 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4776 {
4777     return float32_add(a, float16_to_float32(b, true, s), s);
4778 }
4779 
4780 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4781 {
4782     return float64_add(a, float32_to_float64(b, s), s);
4783 }
4784 
4785 /* Vector Widening Floating-Point Reduction Instructions */
4786 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4787 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4788 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4789 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4790 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4791 
4792 /*
4793  * Vector Mask Operations
4794  */
4795 /* Vector Mask-Register Logical Instructions */
4796 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4797 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4798                   void *vs2, CPURISCVState *env,          \
4799                   uint32_t desc)                          \
4800 {                                                         \
4801     uint32_t vl = env->vl;                                \
4802     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4803     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4804     uint32_t i;                                           \
4805     int a, b;                                             \
4806                                                           \
4807     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4808                                                           \
4809     for (i = env->vstart; i < vl; i++) {                  \
4810         a = vext_elem_mask(vs1, i);                       \
4811         b = vext_elem_mask(vs2, i);                       \
4812         vext_set_elem_mask(vd, i, OP(b, a));              \
4813     }                                                     \
4814     env->vstart = 0;                                      \
4815     /*
4816      * mask destination register is always tail-agnostic
4817      * set tail elements to 1s
4818      */                                                   \
4819     if (vta_all_1s) {                                     \
4820         for (; i < total_elems; i++) {                    \
4821             vext_set_elem_mask(vd, i, 1);                 \
4822         }                                                 \
4823     }                                                     \
4824 }
4825 
4826 #define DO_NAND(N, M)  (!(N & M))
4827 #define DO_ANDNOT(N, M)  (N & !M)
4828 #define DO_NOR(N, M)  (!(N | M))
4829 #define DO_ORNOT(N, M)  (N | !M)
4830 #define DO_XNOR(N, M)  (!(N ^ M))
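
/*
 * The operands are single mask bits (0 or 1) read via vext_elem_mask(), so
 * logical '!' acts as a one-bit complement; e.g. DO_ANDNOT(N, M) = N & !M
 * computes N AND NOT M.
 */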
4831 
4832 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4833 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4834 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4835 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4836 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4837 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4838 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4839 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4840 
4841 /* Vector count population in mask vcpop */
4842 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4843                              uint32_t desc)
4844 {
4845     target_ulong cnt = 0;
4846     uint32_t vm = vext_vm(desc);
4847     uint32_t vl = env->vl;
4848     int i;
4849 
4850     for (i = env->vstart; i < vl; i++) {
4851         if (vm || vext_elem_mask(v0, i)) {
4852             if (vext_elem_mask(vs2, i)) {
4853                 cnt++;
4854             }
4855         }
4856     }
4857     env->vstart = 0;
4858     return cnt;
4859 }
4860 
4861 /* vfirst find-first-set mask bit */
4862 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4863                               uint32_t desc)
4864 {
4865     uint32_t vm = vext_vm(desc);
4866     uint32_t vl = env->vl;
4867     int i;
4868 
4869     for (i = env->vstart; i < vl; i++) {
4870         if (vm || vext_elem_mask(v0, i)) {
4871             if (vext_elem_mask(vs2, i)) {
4872                 return i;
4873             }
4874         }
4875     }
4876     env->vstart = 0;
4877     return -1LL;
4878 }
4879 
4880 enum set_mask_type {
4881     ONLY_FIRST = 1,
4882     INCLUDE_FIRST,
4883     BEFORE_FIRST,
4884 };
4885 
4886 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4887                    uint32_t desc, enum set_mask_type type)
4888 {
4889     uint32_t vm = vext_vm(desc);
4890     uint32_t vl = env->vl;
4891     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4892     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4893     uint32_t vma = vext_vma(desc);
4894     int i;
4895     bool first_mask_bit = false;
4896 
4897     VSTART_CHECK_EARLY_EXIT(env, vl);
4898 
4899     for (i = env->vstart; i < vl; i++) {
4900         if (!vm && !vext_elem_mask(v0, i)) {
4901             /* set masked-off elements to 1s */
4902             if (vma) {
4903                 vext_set_elem_mask(vd, i, 1);
4904             }
4905             continue;
4906         }
4907         /* write a zero to all following active elements */
4908         if (first_mask_bit) {
4909             vext_set_elem_mask(vd, i, 0);
4910             continue;
4911         }
4912         if (vext_elem_mask(vs2, i)) {
4913             first_mask_bit = true;
4914             if (type == BEFORE_FIRST) {
4915                 vext_set_elem_mask(vd, i, 0);
4916             } else {
4917                 vext_set_elem_mask(vd, i, 1);
4918             }
4919         } else {
4920             if (type == ONLY_FIRST) {
4921                 vext_set_elem_mask(vd, i, 0);
4922             } else {
4923                 vext_set_elem_mask(vd, i, 1);
4924             }
4925         }
4926     }
4927     env->vstart = 0;
4928     /*
4929      * mask destination register is always tail-agnostic
4930      * set tail elements to 1s
4931      */
4932     if (vta_all_1s) {
4933         for (; i < total_elems; i++) {
4934             vext_set_elem_mask(vd, i, 1);
4935         }
4936     }
4937 }
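
/*
 * Example with all elements active and vs2 mask bits 0,0,1,0,1,... (first
 * set bit at element 2): vmsbf.m yields 1,1,0,0,0,..., vmsif.m yields
 * 1,1,1,0,0,... and vmsof.m yields 0,0,1,0,0,...
 */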
4938 
4939 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4940                      uint32_t desc)
4941 {
4942     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4943 }
4944 
4945 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4946                      uint32_t desc)
4947 {
4948     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4949 }
4950 
4951 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4952                      uint32_t desc)
4953 {
4954     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4955 }
4956 
4957 /* Vector Iota Instruction */
4958 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4959 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4960                   uint32_t desc)                                          \
4961 {                                                                         \
4962     uint32_t vm = vext_vm(desc);                                          \
4963     uint32_t vl = env->vl;                                                \
4964     uint32_t esz = sizeof(ETYPE);                                         \
4965     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4966     uint32_t vta = vext_vta(desc);                                        \
4967     uint32_t vma = vext_vma(desc);                                        \
4968     uint32_t sum = 0;                                                     \
4969     int i;                                                                \
4970                                                                           \
4971     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
4972                                                                           \
4973     for (i = env->vstart; i < vl; i++) {                                  \
4974         if (!vm && !vext_elem_mask(v0, i)) {                              \
4975             /* set masked-off elements to 1s */                           \
4976             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4977             continue;                                                     \
4978         }                                                                 \
4979         *((ETYPE *)vd + H(i)) = sum;                                      \
4980         if (vext_elem_mask(vs2, i)) {                                     \
4981             sum++;                                                        \
4982         }                                                                 \
4983     }                                                                     \
4984     env->vstart = 0;                                                      \
4985     /* set tail elements to 1s */                                         \
4986     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4987 }
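
/*
 * viota writes to each active element the count of vs2 mask bits set at
 * lower-numbered active positions; e.g. vs2 bits {1, 0, 0, 1, 1} with all
 * elements active give vd = {0, 1, 1, 1, 2}.
 */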
4988 
4989 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4990 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4991 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4992 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4993 
4994 /* Vector Element Index Instruction */
4995 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4996 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4997 {                                                                         \
4998     uint32_t vm = vext_vm(desc);                                          \
4999     uint32_t vl = env->vl;                                                \
5000     uint32_t esz = sizeof(ETYPE);                                         \
5001     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5002     uint32_t vta = vext_vta(desc);                                        \
5003     uint32_t vma = vext_vma(desc);                                        \
5004     int i;                                                                \
5005                                                                           \
5006     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5007                                                                           \
5008     for (i = env->vstart; i < vl; i++) {                                  \
5009         if (!vm && !vext_elem_mask(v0, i)) {                              \
5010             /* set masked-off elements to 1s */                           \
5011             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5012             continue;                                                     \
5013         }                                                                 \
5014         *((ETYPE *)vd + H(i)) = i;                                        \
5015     }                                                                     \
5016     env->vstart = 0;                                                      \
5017     /* set tail elements to 1s */                                         \
5018     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5019 }
5020 
5021 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5022 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5023 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5024 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
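/*
 * Illustrative example: vid.v writes each active element's own index, so
 * with vl = 4 and no masking vd becomes 0,1,2,3 (the instruction takes no
 * vector source operand).
 */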
5025 
5026 /*
5027  * Vector Permutation Instructions
5028  */
5029 
5030 /* Vector Slide Instructions */
5031 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5032 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5033                   CPURISCVState *env, uint32_t desc)                      \
5034 {                                                                         \
5035     uint32_t vm = vext_vm(desc);                                          \
5036     uint32_t vl = env->vl;                                                \
5037     uint32_t esz = sizeof(ETYPE);                                         \
5038     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5039     uint32_t vta = vext_vta(desc);                                        \
5040     uint32_t vma = vext_vma(desc);                                        \
5041     target_ulong offset = s1, i_min, i;                                   \
5042                                                                           \
5043     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5044                                                                           \
5045     i_min = MAX(env->vstart, offset);                                     \
5046     for (i = i_min; i < vl; i++) {                                        \
5047         if (!vm && !vext_elem_mask(v0, i)) {                              \
5048             /* set masked-off elements to 1s */                           \
5049             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5050             continue;                                                     \
5051         }                                                                 \
5052         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5053     }                                                                     \
5054     env->vstart = 0;                                                      \
5055     /* set tail elements to 1s */                                         \
5056     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5057 }
5058 
5059 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5060 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5061 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5062 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5063 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
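/*
 * Illustrative example: with OFFSET = x[rs1] = 2, vl = 4 and
 * vs2 = {a,b,c,d}, the loop above yields vd[2] = a and vd[3] = b, while
 * vd[0] and vd[1] are left untouched -- the loop starts at
 * MAX(vstart, OFFSET), so destination elements below the offset keep
 * their previous values.
 */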
5064 
5065 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5066 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5067                   CPURISCVState *env, uint32_t desc)                      \
5068 {                                                                         \
5069     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5070     uint32_t vm = vext_vm(desc);                                          \
5071     uint32_t vl = env->vl;                                                \
5072     uint32_t esz = sizeof(ETYPE);                                         \
5073     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5074     uint32_t vta = vext_vta(desc);                                        \
5075     uint32_t vma = vext_vma(desc);                                        \
5076     target_ulong i_max, i_min, i;                                         \
5077                                                                           \
5078     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5079                                                                           \
5080     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5081     i_max = MAX(i_min, env->vstart);                                      \
5082     for (i = env->vstart; i < i_max; ++i) {                               \
5083         if (!vm && !vext_elem_mask(v0, i)) {                              \
5084             /* set masked-off elements to 1s */                           \
5085             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5086             continue;                                                     \
5087         }                                                                 \
5088         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5089     }                                                                     \
5090                                                                           \
5091     for (i = i_max; i < vl; ++i) {                                        \
5092         if (vm || vext_elem_mask(v0, i)) {                                \
5093             *((ETYPE *)vd + H(i)) = 0;                                    \
5094         }                                                                 \
5095     }                                                                     \
5096                                                                           \
5097     env->vstart = 0;                                                      \
5098     /* set tail elements to 1s */                                         \
5099     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5100 }
5101 
5102 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5103 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5104 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5105 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5106 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
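/*
 * Illustrative example: with VLMAX = vl = 4, x[rs1] = 1 and vs2 = {a,b,c,d},
 * the first loop above writes vd[0..2] = b,c,d and the second loop zeroes
 * vd[3], because a source index i + x[rs1] at or beyond VLMAX reads as zero.
 */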
5107 
5108 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5109 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5110                                  void *vs2, CPURISCVState *env,             \
5111                                  uint32_t desc)                             \
5112 {                                                                           \
5113     typedef uint##BITWIDTH##_t ETYPE;                                       \
5114     uint32_t vm = vext_vm(desc);                                            \
5115     uint32_t vl = env->vl;                                                  \
5116     uint32_t esz = sizeof(ETYPE);                                           \
5117     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5118     uint32_t vta = vext_vta(desc);                                          \
5119     uint32_t vma = vext_vma(desc);                                          \
5120     uint32_t i;                                                             \
5121                                                                             \
5122     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5123                                                                             \
5124     for (i = env->vstart; i < vl; i++) {                                    \
5125         if (!vm && !vext_elem_mask(v0, i)) {                                \
5126             /* set masked-off elements to 1s */                             \
5127             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5128             continue;                                                       \
5129         }                                                                   \
5130         if (i == 0) {                                                       \
5131             *((ETYPE *)vd + H(i)) = s1;                                     \
5132         } else {                                                            \
5133             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5134         }                                                                   \
5135     }                                                                       \
5136     env->vstart = 0;                                                        \
5137     /* set tail elements to 1s */                                           \
5138     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5139 }
5140 
5141 GEN_VEXT_VSLIE1UP(8,  H1)
5142 GEN_VEXT_VSLIE1UP(16, H2)
5143 GEN_VEXT_VSLIE1UP(32, H4)
5144 GEN_VEXT_VSLIE1UP(64, H8)
5145 
5146 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5147 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5148                   CPURISCVState *env, uint32_t desc)              \
5149 {                                                                 \
5150     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5151 }
5152 
5153 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5154 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5155 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5156 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5157 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
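/*
 * Illustrative example: with vl = 4, vs2 = {a,b,c,d} and x[rs1] = X, the
 * helper produces vd = {X,a,b,c}: element 0 takes the scalar and every other
 * active element takes vs2[i - 1].
 */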
5158 
5159 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5160 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5161                                    void *vs2, CPURISCVState *env,             \
5162                                    uint32_t desc)                             \
5163 {                                                                             \
5164     typedef uint##BITWIDTH##_t ETYPE;                                         \
5165     uint32_t vm = vext_vm(desc);                                              \
5166     uint32_t vl = env->vl;                                                    \
5167     uint32_t esz = sizeof(ETYPE);                                             \
5168     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5169     uint32_t vta = vext_vta(desc);                                            \
5170     uint32_t vma = vext_vma(desc);                                            \
5171     uint32_t i;                                                               \
5172                                                                               \
5173     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5174                                                                               \
5175     for (i = env->vstart; i < vl; i++) {                                      \
5176         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5177             /* set masked-off elements to 1s */                               \
5178             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5179             continue;                                                         \
5180         }                                                                     \
5181         if (i == vl - 1) {                                                    \
5182             *((ETYPE *)vd + H(i)) = s1;                                       \
5183         } else {                                                              \
5184             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5185         }                                                                     \
5186     }                                                                         \
5187     env->vstart = 0;                                                          \
5188     /* set tail elements to 1s */                                             \
5189     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5190 }
5191 
5192 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5193 GEN_VEXT_VSLIDE1DOWN(16, H2)
5194 GEN_VEXT_VSLIDE1DOWN(32, H4)
5195 GEN_VEXT_VSLIDE1DOWN(64, H8)
5196 
5197 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5198 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5199                   CPURISCVState *env, uint32_t desc)              \
5200 {                                                                 \
5201     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5202 }
5203 
5204 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5205 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5206 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5207 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5208 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
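/*
 * Illustrative example: with vl = 4, vs2 = {a,b,c,d} and x[rs1] = X, the
 * helper produces vd = {b,c,d,X}: every active element but the last takes
 * vs2[i + 1], and element vl - 1 takes the scalar.
 */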
5209 
5210 /* Vector Floating-Point Slide Instructions */
5211 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5212 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5213                   CPURISCVState *env, uint32_t desc)          \
5214 {                                                             \
5215     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5216 }
5217 
5218 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5219 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5220 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5221 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5222 
5223 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5224 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5225                   CPURISCVState *env, uint32_t desc)          \
5226 {                                                             \
5227     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5228 }
5229 
5230 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5231 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5232 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5233 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
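/*
 * Note: the floating-point slide1 variants reuse the integer slide1 bodies
 * above; the scalar f[rs1] is passed in s1 as a raw bit pattern, so the
 * helpers only move bits and involve no FP arithmetic or exception flags.
 */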
5234 
5235 /* Vector Register Gather Instruction */
5236 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5237 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5238                   CPURISCVState *env, uint32_t desc)                      \
5239 {                                                                         \
5240     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5241     uint32_t vm = vext_vm(desc);                                          \
5242     uint32_t vl = env->vl;                                                \
5243     uint32_t esz = sizeof(TS2);                                           \
5244     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5245     uint32_t vta = vext_vta(desc);                                        \
5246     uint32_t vma = vext_vma(desc);                                        \
5247     uint64_t index;                                                       \
5248     uint32_t i;                                                           \
5249                                                                           \
5250     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5251                                                                           \
5252     for (i = env->vstart; i < vl; i++) {                                  \
5253         if (!vm && !vext_elem_mask(v0, i)) {                              \
5254             /* set masked-off elements to 1s */                           \
5255             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5256             continue;                                                     \
5257         }                                                                 \
5258         index = *((TS1 *)vs1 + HS1(i));                                   \
5259         if (index >= vlmax) {                                             \
5260             *((TS2 *)vd + HS2(i)) = 0;                                    \
5261         } else {                                                          \
5262             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5263         }                                                                 \
5264     }                                                                     \
5265     env->vstart = 0;                                                      \
5266     /* set tail elements to 1s */                                         \
5267     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5268 }
5269 
5270 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5271 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5272 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5273 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5274 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5275 
5276 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5277 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5278 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5279 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
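/*
 * Illustrative example: with VLMAX = 4, vs2 = {a,b,c,d} and index vector
 * vs1 = {3,0,0,9}, vrgather.vv produces vd = {d,a,a,0} -- out-of-range
 * indices select zero. The vrgatherei16 forms differ only in that the index
 * elements (TS1) are always 16 bits wide, independent of SEW.
 */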
5280 
5281 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5282 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5283                   CPURISCVState *env, uint32_t desc)                      \
5284 {                                                                         \
5285     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5286     uint32_t vm = vext_vm(desc);                                          \
5287     uint32_t vl = env->vl;                                                \
5288     uint32_t esz = sizeof(ETYPE);                                         \
5289     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5290     uint32_t vta = vext_vta(desc);                                        \
5291     uint32_t vma = vext_vma(desc);                                        \
5292     uint64_t index = s1;                                                  \
5293     uint32_t i;                                                           \
5294                                                                           \
5295     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5296                                                                           \
5297     for (i = env->vstart; i < vl; i++) {                                  \
5298         if (!vm && !vext_elem_mask(v0, i)) {                              \
5299             /* set masked-off elements to 1s */                           \
5300             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5301             continue;                                                     \
5302         }                                                                 \
5303         if (index >= vlmax) {                                             \
5304             *((ETYPE *)vd + H(i)) = 0;                                    \
5305         } else {                                                          \
5306             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5307         }                                                                 \
5308     }                                                                     \
5309     env->vstart = 0;                                                      \
5310     /* set tail elements to 1s */                                         \
5311     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5312 }
5313 
5314 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5315 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5316 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5317 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5318 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
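/*
 * Illustrative example: vrgather.vx broadcasts a single source element; with
 * x[rs1] = 2 and vs2 = {a,b,c,d}, every active destination element becomes c.
 * An index at or beyond VLMAX broadcasts zero instead.
 */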
5319 
5320 /* Vector Compress Instruction */
5321 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5322 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5323                   CPURISCVState *env, uint32_t desc)                      \
5324 {                                                                         \
5325     uint32_t vl = env->vl;                                                \
5326     uint32_t esz = sizeof(ETYPE);                                         \
5327     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5328     uint32_t vta = vext_vta(desc);                                        \
5329     uint32_t num = 0, i;                                                  \
5330                                                                           \
5331     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5332                                                                           \
5333     for (i = env->vstart; i < vl; i++) {                                  \
5334         if (!vext_elem_mask(vs1, i)) {                                    \
5335             continue;                                                     \
5336         }                                                                 \
5337         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5338         num++;                                                            \
5339     }                                                                     \
5340     env->vstart = 0;                                                      \
5341     /* set tail elements to 1s */                                         \
5342     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5343 }
5344 
5345 /* Compress the vs2 elements whose vs1 mask bit is set into contiguous elements of vd */
5346 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5347 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5348 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5349 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
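/*
 * Illustrative example: with vl = 4, vs1 mask bits 1,0,1,1 and
 * vs2 = {a,b,c,d}, the loop packs vd[0] = a, vd[1] = c, vd[2] = d; the
 * destination elements from index num onward are treated as tail and
 * handled by the vta policy in the macro above.
 */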
5350 
5351 /* Vector Whole Register Move */
5352 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5353 {
5354     /* EEW = SEW */
5355     uint32_t maxsz = simd_maxsz(desc);
5356     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5357     uint32_t startb = env->vstart * sewb;
5358     uint32_t i = startb;
5359 
5360     if (startb >= maxsz) {
5361         env->vstart = 0;
5362         return;
5363     }
5364 
5365     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5366         uint32_t j = ROUND_UP(i, 8);
5367         memcpy((uint8_t *)vd + H1(j - 1),
5368                (uint8_t *)vs2 + H1(j - 1),
5369                j - i);
5370         i = j;
5371     }
5372 
5373     memcpy((uint8_t *)vd + H1(i),
5374            (uint8_t *)vs2 + H1(i),
5375            maxsz - i);
5376 
5377     env->vstart = 0;
5378 }
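/*
 * Note: vmv<nr>r.v copies the whole register group -- maxsz bytes, taken
 * from simd_maxsz(desc) -- irrespective of vl. vstart is counted in elements
 * of the current SEW, hence the startb = vstart * sewb conversion; on
 * big-endian hosts a resume point that is not 8-byte aligned first copies
 * the remainder of its 64-bit chunk via H1 addressing before the bulk
 * memcpy.
 */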
5379 
5380 /* Vector Integer Extension */
5381 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5382 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5383                   CPURISCVState *env, uint32_t desc)             \
5384 {                                                                \
5385     uint32_t vl = env->vl;                                       \
5386     uint32_t vm = vext_vm(desc);                                 \
5387     uint32_t esz = sizeof(ETYPE);                                \
5388     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5389     uint32_t vta = vext_vta(desc);                               \
5390     uint32_t vma = vext_vma(desc);                               \
5391     uint32_t i;                                                  \
5392                                                                  \
5393     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5394                                                                  \
5395     for (i = env->vstart; i < vl; i++) {                         \
5396         if (!vm && !vext_elem_mask(v0, i)) {                     \
5397             /* set masked-off elements to 1s */                  \
5398             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5399             continue;                                            \
5400         }                                                        \
5401         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5402     }                                                            \
5403     env->vstart = 0;                                             \
5404     /* set tail elements to 1s */                                \
5405     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5406 }
5407 
5408 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5409 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5410 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5411 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5412 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5413 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5414 
5415 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5416 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5417 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5418 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5419 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5420 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
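/*
 * Illustrative example: vzext.vf2 widens each source element to twice its
 * width with zero extension (uint8_t 0x80 becomes uint16_t 0x0080), while
 * vsext.vf2 sign-extends (int8_t 0x80, i.e. -128, becomes int16_t 0xFF80);
 * the plain C assignment from DTYPE to ETYPE above performs the conversion.
 */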
5421