1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "accel/tcg/cpu-ldst.h"
25 #include "accel/tcg/probe.h"
26 #include "exec/page-protection.h"
27 #include "exec/helper-proto.h"
28 #include "exec/tlb-flags.h"
29 #include "exec/target_page.h"
30 #include "fpu/softfloat.h"
31 #include "tcg/tcg-gvec-desc.h"
32 #include "internals.h"
33 #include "vector_internals.h"
34 #include <math.h>
35 
36 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
37                             target_ulong s2, target_ulong x0)
38 {
39     int vlmax, vl;
40     RISCVCPU *cpu = env_archcpu(env);
41     uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
42     uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
43     uint16_t sew = 8 << vsew;
44     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
45     int xlen = riscv_cpu_xlen(env);
46     bool vill = (s2 >> (xlen - 1)) & 0x1;
47     target_ulong reserved = s2 &
48                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
49                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
50     uint16_t vlen = cpu->cfg.vlenb << 3;
51     int8_t lmul;
52 
53     if (vlmul & 4) {
54         /*
55          * Fractional LMUL, check:
56          *
57          * VLEN * LMUL >= SEW
58          * VLEN >> (8 - lmul) >= sew
59          * (vlenb << 3) >> (8 - lmul) >= sew
60          */
61         if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
62             vill = true;
63         }
64     }
65 
66     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
67         /* only set vill bit. */
68         env->vill = 1;
69         env->vtype = 0;
70         env->vl = 0;
71         env->vstart = 0;
72         return 0;
73     }
74 
75     /* lmul encoded as in DisasContext::lmul */
76     lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
77     vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
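    /*
     * Map the requested AVL (s1) to the new vl: vl = AVL when AVL <= VLMAX,
     * optionally ceil(AVL / 2) when VLMAX < AVL < 2 * VLMAX (the
     * rvv_vl_half_avl CPU property), and VLMAX otherwise.
     */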
78     if (s1 <= vlmax) {
79         vl = s1;
80     } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
81         vl = (s1 + 1) >> 1;
82     } else {
83         vl = vlmax;
84     }
85 
86     if (cpu->cfg.rvv_vsetvl_x0_vill && x0 && (env->vl != vl)) {
87         /* only set vill bit. */
88         env->vill = 1;
89         env->vtype = 0;
90         env->vl = 0;
91         env->vstart = 0;
92         return 0;
93     }
94 
95     env->vl = vl;
96     env->vtype = s2;
97     env->vstart = 0;
98     env->vill = 0;
99     return vl;
100 }
101 
102 /*
103  * Get the maximum number of elements that can be operated on.
104  *
105  * log2_esz: log2 of element size in bytes.
106  */
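/*
 * Worked example (illustrative): with vlenb = 16 (VLEN = 128 bits),
 * LMUL = 2 (vext_lmul(desc) = 1) and SEW = 32 (log2_esz = 2), the scale
 * below is 1 - 2 = -1, giving vlenb >> 1 = 8 elements, which matches
 * VLMAX = VLEN * LMUL / SEW = 128 * 2 / 32.
 */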
107 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
108 {
109     /*
110      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
111      * so vlen in bytes (vlenb) is encoded as maxsz.
112      */
113     uint32_t vlenb = simd_maxsz(desc);
114 
115     /* Return VLMAX */
116     int scale = vext_lmul(desc) - log2_esz;
117     return scale < 0 ? vlenb >> -scale : vlenb << scale;
118 }
119 
120 /*
121  * This function checks watchpoints before the real load operation.
122  *
123  * In system mode, the TLB API probe_access is enough for the watchpoint
124  * check. In user mode, there is no watchpoint support for now.
125  *
126  * It will trigger an exception if there is no mapping in the TLB
127  * and the page table walk can't fill the TLB entry. Then the guest
128  * software can return here after processing the exception, or never return.
129  *
130  * This function can also be used when direct access to probe_access_flags is
131  * needed in order to access the flags. If a pointer to a flags operand is
132  * provided, the function calls probe_access_flags instead, using nonfault
133  * and updating host and flags.
134  */
135 static void probe_pages(CPURISCVState *env, target_ulong addr, target_ulong len,
136                         uintptr_t ra, MMUAccessType access_type, int mmu_index,
137                         void **host, int *flags, bool nonfault)
138 {
139     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
140     target_ulong curlen = MIN(pagelen, len);
141 
142     if (flags != NULL) {
143         *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
144                                     access_type, mmu_index, nonfault, host, ra);
145     } else {
146         probe_access(env, adjust_addr(env, addr), curlen, access_type,
147                      mmu_index, ra);
148     }
149 
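    /* If the access crosses a page boundary, probe the second page as well. */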
150     if (len > curlen) {
151         addr += curlen;
152         curlen = len - curlen;
153         if (flags != NULL) {
154             *flags = probe_access_flags(env, adjust_addr(env, addr), curlen,
155                                         access_type, mmu_index, nonfault,
156                                         host, ra);
157         } else {
158             probe_access(env, adjust_addr(env, addr), curlen, access_type,
159                          mmu_index, ra);
160         }
161     }
162 }
163 
164 
165 static inline void vext_set_elem_mask(void *v0, int index,
166                                       uint8_t value)
167 {
168     int idx = index / 64;
169     int pos = index % 64;
170     uint64_t old = ((uint64_t *)v0)[idx];
171     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
172 }
173 
174 /* element operations for load and store */
175 typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
176                                    uint32_t idx, void *vd, uintptr_t retaddr);
177 typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
178 
179 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
180 static inline QEMU_ALWAYS_INLINE                            \
181 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
182                 uint32_t idx, void *vd, uintptr_t retaddr)  \
183 {                                                           \
184     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
185     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
186 }                                                           \
187                                                             \
188 static inline QEMU_ALWAYS_INLINE                            \
189 void NAME##_host(void *vd, uint32_t idx, void *host)        \
190 {                                                           \
191     ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
192     *cur = (ETYPE)LDSUF##_p(host);                          \
193 }
194 
195 GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
196 GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
197 GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
198 GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
199 
200 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
201 static inline QEMU_ALWAYS_INLINE                            \
202 void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
203                 uint32_t idx, void *vd, uintptr_t retaddr)  \
204 {                                                           \
205     ETYPE data = *((ETYPE *)vd + H(idx));                   \
206     cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
207 }                                                           \
208                                                             \
209 static inline QEMU_ALWAYS_INLINE                            \
210 void NAME##_host(void *vd, uint32_t idx, void *host)        \
211 {                                                           \
212     ETYPE data = *((ETYPE *)vd + H(idx));                   \
213     STSUF##_p(host, data);                                  \
214 }
215 
216 GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
217 GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
218 GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
219 GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
220 
221 static inline QEMU_ALWAYS_INLINE void
222 vext_continuous_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
223                        void *vd, uint32_t evl, target_ulong addr,
224                        uint32_t reg_start, uintptr_t ra, uint32_t esz,
225                        bool is_load)
226 {
227     uint32_t i;
228     for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
229         ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
230     }
231 }
232 
233 static inline QEMU_ALWAYS_INLINE void
234 vext_continuous_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
235                         void *vd, uint32_t evl, uint32_t reg_start, void *host,
236                         uint32_t esz, bool is_load)
237 {
238 #if HOST_BIG_ENDIAN
239     for (; reg_start < evl; reg_start++, host += esz) {
240         ldst_host(vd, reg_start, host);
241     }
242 #else
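    /*
     * On a little-endian host, single-byte elements need no byte swapping
     * and are laid out contiguously in the vector register, so the whole
     * range can be copied with one memcpy() instead of the per-element loop.
     */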
243     if (esz == 1) {
244         uint32_t byte_offset = reg_start * esz;
245         uint32_t size = (evl - reg_start) * esz;
246 
247         if (is_load) {
248             memcpy(vd + byte_offset, host, size);
249         } else {
250             memcpy(host, vd + byte_offset, size);
251         }
252     } else {
253         for (; reg_start < evl; reg_start++, host += esz) {
254             ldst_host(vd, reg_start, host);
255         }
256     }
257 #endif
258 }
259 
260 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
261                                    uint32_t desc, uint32_t nf,
262                                    uint32_t esz, uint32_t max_elems)
263 {
264     uint32_t vta = vext_vta(desc);
265     int k;
266 
267     if (vta == 0) {
268         return;
269     }
270 
271     for (k = 0; k < nf; ++k) {
272         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
273                           (k * max_elems + max_elems) * esz);
274     }
275 }
276 
277 /*
278  * stride: access vector elements from strided memory
279  */
280 static void
281 vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
282                  CPURISCVState *env, uint32_t desc, uint32_t vm,
283                  vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
284                  uintptr_t ra)
285 {
286     uint32_t i, k;
287     uint32_t nf = vext_nf(desc);
288     uint32_t max_elems = vext_max_elems(desc, log2_esz);
289     uint32_t esz = 1 << log2_esz;
290     uint32_t vma = vext_vma(desc);
291 
292     VSTART_CHECK_EARLY_EXIT(env, env->vl);
293 
294     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
295         k = 0;
296         while (k < nf) {
297             if (!vm && !vext_elem_mask(v0, i)) {
298                 /* set masked-off elements to 1s */
299                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
300                                   (i + k * max_elems + 1) * esz);
301                 k++;
302                 continue;
303             }
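            /*
             * Field k of element i is at base + i * stride + k * esz;
             * the nf fields of a segment are contiguous within one stride.
             */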
304             target_ulong addr = base + stride * i + (k << log2_esz);
305             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
306             k++;
307         }
308     }
309     env->vstart = 0;
310 
311     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
312 }
313 
314 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
315 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
316                   target_ulong stride, CPURISCVState *env,              \
317                   uint32_t desc)                                        \
318 {                                                                       \
319     uint32_t vm = vext_vm(desc);                                        \
320     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
321                      ctzl(sizeof(ETYPE)), GETPC());                     \
322 }
323 
324 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
325 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
326 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
327 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
328 
329 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
330 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
331                   target_ulong stride, CPURISCVState *env,              \
332                   uint32_t desc)                                        \
333 {                                                                       \
334     uint32_t vm = vext_vm(desc);                                        \
335     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
336                      ctzl(sizeof(ETYPE)), GETPC());                     \
337 }
338 
339 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
340 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
341 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
342 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
343 
344 /*
345  * unit-stride: access elements stored contiguously in memory
346  */
347 
348 /* unmasked unit-stride load and store operation */
349 static inline QEMU_ALWAYS_INLINE void
350 vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
351                   uint32_t elems, uint32_t nf, uint32_t max_elems,
352                   uint32_t log2_esz, bool is_load, int mmu_index,
353                   vext_ldst_elem_fn_tlb *ldst_tlb,
354                   vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
355 {
356     void *host;
357     int i, k, flags;
358     uint32_t esz = 1 << log2_esz;
359     uint32_t size = (elems * nf) << log2_esz;
360     uint32_t evl = env->vstart + elems;
361     MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
362 
363     /* Check page permission/pmp/watchpoint/etc. */
364     probe_pages(env, addr, size, ra, access_type, mmu_index, &host, &flags,
365                 true);
366 
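    /*
     * flags == 0 means the page is plain RAM with a usable host pointer,
     * so elements can be accessed directly through 'host'; otherwise fall
     * back to the per-element TLB helpers.
     */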
367     if (flags == 0) {
368         if (nf == 1) {
369             vext_continuous_ldst_host(env, ldst_host, vd, evl, env->vstart,
370                                       host, esz, is_load);
371         } else {
372             for (i = env->vstart; i < evl; ++i) {
373                 k = 0;
374                 while (k < nf) {
375                     ldst_host(vd, i + k * max_elems, host);
376                     host += esz;
377                     k++;
378                 }
379             }
380         }
381         env->vstart += elems;
382     } else {
383         if (nf == 1) {
384             vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
385                                    ra, esz, is_load);
386         } else {
387             /* load bytes from guest memory */
388             for (i = env->vstart; i < evl; env->vstart = ++i) {
389                 k = 0;
390                 while (k < nf) {
391                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
392                              vd, ra);
393                     addr += esz;
394                     k++;
395                 }
396             }
397         }
398     }
399 }
400 
401 static inline QEMU_ALWAYS_INLINE void
402 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
403              vext_ldst_elem_fn_tlb *ldst_tlb,
404              vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
405              uint32_t evl, uintptr_t ra, bool is_load)
406 {
407     uint32_t k;
408     target_ulong page_split, elems, addr;
409     uint32_t nf = vext_nf(desc);
410     uint32_t max_elems = vext_max_elems(desc, log2_esz);
411     uint32_t esz = 1 << log2_esz;
412     uint32_t msize = nf * esz;
413     int mmu_index = riscv_env_mmu_index(env, false);
414 
415     VSTART_CHECK_EARLY_EXIT(env, evl);
416 
417 #if defined(CONFIG_USER_ONLY)
418     /*
419      * For data sizes <= 6 bytes we get better performance by simply calling
420      * vext_continuous_ldst_tlb
421      */
422     if (nf == 1 && (evl << log2_esz) <= 6) {
423         addr = base + (env->vstart << log2_esz);
424         vext_continuous_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart, ra,
425                                  esz, is_load);
426 
427         env->vstart = 0;
428         vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
429         return;
430     }
431 #endif
432 
433     /* Calculate the page range of first page */
434     addr = base + ((env->vstart * nf) << log2_esz);
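    /* -(addr | TARGET_PAGE_MASK) is the number of bytes left on this page. */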
435     page_split = -(addr | TARGET_PAGE_MASK);
436     /* Get number of elements */
437     elems = page_split / msize;
438     if (unlikely(env->vstart + elems >= evl)) {
439         elems = evl - env->vstart;
440     }
441 
442     /* Load/store elements in the first page */
443     if (likely(elems)) {
444         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
445                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
446     }
447 
448     /* Load/store elements in the second page */
449     if (unlikely(env->vstart < evl)) {
450         /* Cross page element */
451         if (unlikely(page_split % msize)) {
452             for (k = 0; k < nf; k++) {
453                 addr = base + ((env->vstart * nf + k) << log2_esz);
454                 ldst_tlb(env, adjust_addr(env, addr),
455                         env->vstart + k * max_elems, vd, ra);
456             }
457             env->vstart++;
458         }
459 
460         addr = base + ((env->vstart * nf) << log2_esz);
461         /* Get number of elements of second page */
462         elems = evl - env->vstart;
463 
464         /* Load/store elements in the second page */
465         vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
466                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
467     }
468 
469     env->vstart = 0;
470     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
471 }
472 
473 /*
474  * masked unit-stride load and store operations are handled as a special
475  * case of strided access, with stride = NF * sizeof(ETYPE)
476  */
477 
478 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
479 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
480                          CPURISCVState *env, uint32_t desc)         \
481 {                                                                   \
482     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
483     vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
484                      LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
485 }                                                                   \
486                                                                     \
487 void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
488                   CPURISCVState *env, uint32_t desc)                \
489 {                                                                   \
490     vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
491                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
492 }
493 
494 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
495 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
496 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
497 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
498 
499 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
500 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
501                          CPURISCVState *env, uint32_t desc)              \
502 {                                                                        \
503     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
504     vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
505                      STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
506 }                                                                        \
507                                                                          \
508 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
509                   CPURISCVState *env, uint32_t desc)                     \
510 {                                                                        \
511     vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
512                  ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
513 }
514 
515 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
516 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
517 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
518 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
519 
520 /*
521  * unit stride mask load and store, EEW = 1
522  */
523 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
524                     CPURISCVState *env, uint32_t desc)
525 {
526     /* evl = ceil(vl/8) */
527     uint8_t evl = (env->vl + 7) >> 3;
528     vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
529                  0, evl, GETPC(), true);
530 }
531 
532 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
533                     CPURISCVState *env, uint32_t desc)
534 {
535     /* evl = ceil(vl/8) */
536     uint8_t evl = (env->vl + 7) >> 3;
537     vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
538                  0, evl, GETPC(), false);
539 }
540 
541 /*
542  * index: access vector elements from indexed memory
543  */
544 typedef target_ulong vext_get_index_addr(target_ulong base,
545         uint32_t idx, void *vs2);
546 
547 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
548 static target_ulong NAME(target_ulong base,            \
549                          uint32_t idx, void *vs2)      \
550 {                                                      \
551     return (base + *((ETYPE *)vs2 + H(idx)));          \
552 }
553 
554 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
555 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
556 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
557 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
558 
559 static inline void
560 vext_ldst_index(void *vd, void *v0, target_ulong base,
561                 void *vs2, CPURISCVState *env, uint32_t desc,
562                 vext_get_index_addr get_index_addr,
563                 vext_ldst_elem_fn_tlb *ldst_elem,
564                 uint32_t log2_esz, uintptr_t ra)
565 {
566     uint32_t i, k;
567     uint32_t nf = vext_nf(desc);
568     uint32_t vm = vext_vm(desc);
569     uint32_t max_elems = vext_max_elems(desc, log2_esz);
570     uint32_t esz = 1 << log2_esz;
571     uint32_t vma = vext_vma(desc);
572 
573     VSTART_CHECK_EARLY_EXIT(env, env->vl);
574 
575     /* load bytes from guest memory */
576     for (i = env->vstart; i < env->vl; env->vstart = ++i) {
577         k = 0;
578         while (k < nf) {
579             if (!vm && !vext_elem_mask(v0, i)) {
580                 /* set masked-off elements to 1s */
581                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
582                                   (i + k * max_elems + 1) * esz);
583                 k++;
584                 continue;
585             }
586             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
587             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
588             k++;
589         }
590     }
591     env->vstart = 0;
592 
593     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
594 }
595 
596 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
597 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
598                   void *vs2, CPURISCVState *env, uint32_t desc)            \
599 {                                                                          \
600     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
601                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
602 }
603 
604 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
605 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
606 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
607 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
608 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
609 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
610 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
611 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
612 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
613 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
614 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
615 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
616 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
617 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
618 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
619 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
620 
621 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
622 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
623                   void *vs2, CPURISCVState *env, uint32_t desc)  \
624 {                                                                \
625     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
626                     STORE_FN, ctzl(sizeof(ETYPE)),               \
627                     GETPC());                                    \
628 }
629 
630 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
631 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
632 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
633 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
634 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
635 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
636 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
637 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
638 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
639 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
640 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
641 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
642 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
643 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
644 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
645 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
646 
647 /*
648  * unit-stride fault-only-first load instructions
649  */
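/*
 * Only a fault on element 0 raises an exception; a fault on any later
 * element instead truncates vl to the number of elements that can be
 * accessed without faulting, per the fault-only-first semantics.
 */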
650 static inline void
651 vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
652           uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
653           vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
654 {
655     uint32_t i, k, vl = 0;
656     uint32_t nf = vext_nf(desc);
657     uint32_t vm = vext_vm(desc);
658     uint32_t max_elems = vext_max_elems(desc, log2_esz);
659     uint32_t esz = 1 << log2_esz;
660     uint32_t msize = nf * esz;
661     uint32_t vma = vext_vma(desc);
662     target_ulong addr, addr_probe, addr_i, offset, remain, page_split, elems;
663     int mmu_index = riscv_env_mmu_index(env, false);
664     int flags, probe_flags;
665     void *host;
666 
667     VSTART_CHECK_EARLY_EXIT(env, env->vl);
668 
669     addr = base + ((env->vstart * nf) << log2_esz);
670     page_split = -(addr | TARGET_PAGE_MASK);
671     /* Get number of elements */
672     elems = page_split / msize;
673     if (unlikely(env->vstart + elems >= env->vl)) {
674         elems = env->vl - env->vstart;
675     }
676 
677     /* Check page permission/pmp/watchpoint/etc. */
678     probe_pages(env, addr, elems * msize, ra, MMU_DATA_LOAD, mmu_index, &host,
679                 &flags, true);
680 
681     /* If we are crossing a page check also the second page. */
682     if (env->vl > elems) {
683         addr_probe = addr + (elems << log2_esz);
684         probe_pages(env, addr_probe, elems * msize, ra, MMU_DATA_LOAD,
685                     mmu_index, &host, &probe_flags, true);
686         flags |= probe_flags;
687     }
688 
689     if (flags & ~TLB_WATCHPOINT) {
690         /* probe every access */
691         for (i = env->vstart; i < env->vl; i++) {
692             if (!vm && !vext_elem_mask(v0, i)) {
693                 continue;
694             }
695             addr_i = adjust_addr(env, base + i * (nf << log2_esz));
696             if (i == 0) {
697                 /* Allow fault on first element. */
698                 probe_pages(env, addr_i, nf << log2_esz, ra, MMU_DATA_LOAD,
699                             mmu_index, &host, NULL, false);
700             } else {
701                 remain = nf << log2_esz;
702                 while (remain > 0) {
703                     offset = -(addr_i | TARGET_PAGE_MASK);
704 
705                     /* Probe nonfault on subsequent elements. */
706                     probe_pages(env, addr_i, offset, 0, MMU_DATA_LOAD,
707                                 mmu_index, &host, &flags, true);
708 
709                     /*
710                      * Stop if invalid (unmapped) or mmio (transaction may
711                      * fail). Do not stop if watchpoint, as the spec says that
712                      * first-fault should continue to access the same
713                      * elements regardless of any watchpoint.
714                      */
715                     if (flags & ~TLB_WATCHPOINT) {
716                         vl = i;
717                         goto ProbeSuccess;
718                     }
719                     if (remain <= offset) {
720                         break;
721                     }
722                     remain -= offset;
723                     addr_i = adjust_addr(env, addr_i + offset);
724                 }
725             }
726         }
727     }
728 ProbeSuccess:
729     /* load bytes from guest memory */
730     if (vl != 0) {
731         env->vl = vl;
732     }
733 
734     if (env->vstart < env->vl) {
735         if (vm) {
736             /* Load/store elements in the first page */
737             if (likely(elems)) {
738                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
739                                   log2_esz, true, mmu_index, ldst_tlb,
740                                   ldst_host, ra);
741             }
742 
743             /* Load/store elements in the second page */
744             if (unlikely(env->vstart < env->vl)) {
745                 /* Cross page element */
746                 if (unlikely(page_split % msize)) {
747                     for (k = 0; k < nf; k++) {
748                         addr = base + ((env->vstart * nf + k) << log2_esz);
749                         ldst_tlb(env, adjust_addr(env, addr),
750                                  env->vstart + k * max_elems, vd, ra);
751                     }
752                     env->vstart++;
753                 }
754 
755                 addr = base + ((env->vstart * nf) << log2_esz);
756                 /* Get number of elements of second page */
757                 elems = env->vl - env->vstart;
758 
759                 /* Load/store elements in the second page */
760                 vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
761                                   log2_esz, true, mmu_index, ldst_tlb,
762                                   ldst_host, ra);
763             }
764         } else {
765             for (i = env->vstart; i < env->vl; i++) {
766                 k = 0;
767                 while (k < nf) {
768                     if (!vext_elem_mask(v0, i)) {
769                         /* set masked-off elements to 1s */
770                         vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
771                                           (i + k * max_elems + 1) * esz);
772                         k++;
773                         continue;
774                     }
775                     addr = base + ((i * nf + k) << log2_esz);
776                     ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
777                              vd, ra);
778                     k++;
779                 }
780             }
781         }
782     }
783     env->vstart = 0;
784 
785     vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
786 }
787 
788 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
789 void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
790                   CPURISCVState *env, uint32_t desc)            \
791 {                                                               \
792     vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
793               LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
794 }
795 
796 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
797 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
798 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
799 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
800 
801 #define DO_SWAP(N, M) (M)
802 #define DO_AND(N, M)  (N & M)
803 #define DO_XOR(N, M)  (N ^ M)
804 #define DO_OR(N, M)   (N | M)
805 #define DO_ADD(N, M)  (N + M)
806 
807 /* Signed min/max */
808 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
809 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
810 
811 /*
812  * load and store whole register instructions
813  */
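/*
 * These transfer NFIELDS whole vector registers (nf * VLEN bits) regardless
 * of the current vtype and vl settings; only vstart is honoured so that a
 * trapped access can be resumed.
 */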
814 static inline QEMU_ALWAYS_INLINE void
815 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
816                 vext_ldst_elem_fn_tlb *ldst_tlb,
817                 vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
818                 uintptr_t ra, bool is_load)
819 {
820     target_ulong page_split, elems, addr;
821     uint32_t nf = vext_nf(desc);
822     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
823     uint32_t max_elems = vlenb >> log2_esz;
824     uint32_t evl = nf * max_elems;
825     uint32_t esz = 1 << log2_esz;
826     int mmu_index = riscv_env_mmu_index(env, false);
827 
828     /* Calculate the page range of first page */
829     addr = base + (env->vstart << log2_esz);
830     page_split = -(addr | TARGET_PAGE_MASK);
831     /* Get number of elements */
832     elems = page_split / esz;
833     if (unlikely(env->vstart + elems >= evl)) {
834         elems = evl - env->vstart;
835     }
836 
837     /* Load/store elements in the first page */
838     if (likely(elems)) {
839         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
840                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
841     }
842 
843     /* Load/store elements in the second page */
844     if (unlikely(env->vstart < evl)) {
845         /* Cross page element */
846         if (unlikely(page_split % esz)) {
847             addr = base + (env->vstart << log2_esz);
848             ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
849             env->vstart++;
850         }
851 
852         addr = base + (env->vstart << log2_esz);
853         /* Get number of elements of second page */
854         elems = evl - env->vstart;
855 
856         /* Load/store elements in the second page */
857         vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
858                           is_load, mmu_index, ldst_tlb, ldst_host, ra);
859     }
860 
861     env->vstart = 0;
862 }
863 
864 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
865 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
866                   uint32_t desc)                                    \
867 {                                                                   \
868     vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
869                     ctzl(sizeof(ETYPE)), GETPC(), true);            \
870 }
871 
872 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
873 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
874 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
875 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
876 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
877 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
878 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
879 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
880 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
881 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
882 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
883 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
884 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
885 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
886 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
887 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
888 
889 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
890 void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
891                   uint32_t desc)                                        \
892 {                                                                       \
893     vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
894                     ctzl(sizeof(ETYPE)), GETPC(), false);               \
895 }
896 
897 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
898 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
899 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
900 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
901 
902 /*
903  * Vector Integer Arithmetic Instructions
904  */
905 
906 /* (TD, T1, T2, TX1, TX2) */
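/*
 * TD is the destination element type, T1/T2 are the source element types as
 * read from the register file, and TX1/TX2 are the (possibly widened) types
 * the sources are converted to before the operation is applied.
 */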
907 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
908 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
909 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
910 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
911 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
912 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
913 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
914 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
915 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
916 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
917 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
918 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
919 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
920 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
921 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
922 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
923 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
924 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
925 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
926 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
927 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
928 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
929 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
930 
931 #define DO_SUB(N, M) (N - M)
932 #define DO_RSUB(N, M) (M - N)
933 
934 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
935 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
936 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
937 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
938 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
939 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
940 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
941 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
942 
943 GEN_VEXT_VV(vadd_vv_b, 1)
944 GEN_VEXT_VV(vadd_vv_h, 2)
945 GEN_VEXT_VV(vadd_vv_w, 4)
946 GEN_VEXT_VV(vadd_vv_d, 8)
947 GEN_VEXT_VV(vsub_vv_b, 1)
948 GEN_VEXT_VV(vsub_vv_h, 2)
949 GEN_VEXT_VV(vsub_vv_w, 4)
950 GEN_VEXT_VV(vsub_vv_d, 8)
951 
952 
953 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
954 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
955 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
956 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
957 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
958 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
959 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
960 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
961 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
962 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
963 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
964 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
965 
966 GEN_VEXT_VX(vadd_vx_b, 1)
967 GEN_VEXT_VX(vadd_vx_h, 2)
968 GEN_VEXT_VX(vadd_vx_w, 4)
969 GEN_VEXT_VX(vadd_vx_d, 8)
970 GEN_VEXT_VX(vsub_vx_b, 1)
971 GEN_VEXT_VX(vsub_vx_h, 2)
972 GEN_VEXT_VX(vsub_vx_w, 4)
973 GEN_VEXT_VX(vsub_vx_d, 8)
974 GEN_VEXT_VX(vrsub_vx_b, 1)
975 GEN_VEXT_VX(vrsub_vx_h, 2)
976 GEN_VEXT_VX(vrsub_vx_w, 4)
977 GEN_VEXT_VX(vrsub_vx_d, 8)
978 
979 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
980 {
981     intptr_t oprsz = simd_oprsz(desc);
982     intptr_t i;
983 
984     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
985         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
986     }
987 }
988 
989 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
990 {
991     intptr_t oprsz = simd_oprsz(desc);
992     intptr_t i;
993 
994     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
995         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
996     }
997 }
998 
999 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
1000 {
1001     intptr_t oprsz = simd_oprsz(desc);
1002     intptr_t i;
1003 
1004     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1005         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
1006     }
1007 }
1008 
1009 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
1010 {
1011     intptr_t oprsz = simd_oprsz(desc);
1012     intptr_t i;
1013 
1014     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1015         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
1016     }
1017 }
1018 
1019 /* Vector Widening Integer Add/Subtract */
1020 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
1021 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
1022 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
1023 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
1024 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
1025 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
1026 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
1027 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
1028 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
1029 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
1030 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
1031 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
1032 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
1033 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
1034 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
1035 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
1036 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
1037 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
1038 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
1039 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
1040 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1041 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1042 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1043 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1044 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1045 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1046 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1047 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1048 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1049 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1050 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1051 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1052 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1053 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1054 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1055 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1056 GEN_VEXT_VV(vwaddu_vv_b, 2)
1057 GEN_VEXT_VV(vwaddu_vv_h, 4)
1058 GEN_VEXT_VV(vwaddu_vv_w, 8)
1059 GEN_VEXT_VV(vwsubu_vv_b, 2)
1060 GEN_VEXT_VV(vwsubu_vv_h, 4)
1061 GEN_VEXT_VV(vwsubu_vv_w, 8)
1062 GEN_VEXT_VV(vwadd_vv_b, 2)
1063 GEN_VEXT_VV(vwadd_vv_h, 4)
1064 GEN_VEXT_VV(vwadd_vv_w, 8)
1065 GEN_VEXT_VV(vwsub_vv_b, 2)
1066 GEN_VEXT_VV(vwsub_vv_h, 4)
1067 GEN_VEXT_VV(vwsub_vv_w, 8)
1068 GEN_VEXT_VV(vwaddu_wv_b, 2)
1069 GEN_VEXT_VV(vwaddu_wv_h, 4)
1070 GEN_VEXT_VV(vwaddu_wv_w, 8)
1071 GEN_VEXT_VV(vwsubu_wv_b, 2)
1072 GEN_VEXT_VV(vwsubu_wv_h, 4)
1073 GEN_VEXT_VV(vwsubu_wv_w, 8)
1074 GEN_VEXT_VV(vwadd_wv_b, 2)
1075 GEN_VEXT_VV(vwadd_wv_h, 4)
1076 GEN_VEXT_VV(vwadd_wv_w, 8)
1077 GEN_VEXT_VV(vwsub_wv_b, 2)
1078 GEN_VEXT_VV(vwsub_wv_h, 4)
1079 GEN_VEXT_VV(vwsub_wv_w, 8)
1080 
1081 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1082 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1083 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1084 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1085 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1086 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1087 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1088 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1089 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1090 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1091 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1092 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1093 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1094 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1095 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1096 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1097 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1098 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1099 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1100 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1101 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1102 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1103 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1104 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1105 GEN_VEXT_VX(vwaddu_vx_b, 2)
1106 GEN_VEXT_VX(vwaddu_vx_h, 4)
1107 GEN_VEXT_VX(vwaddu_vx_w, 8)
1108 GEN_VEXT_VX(vwsubu_vx_b, 2)
1109 GEN_VEXT_VX(vwsubu_vx_h, 4)
1110 GEN_VEXT_VX(vwsubu_vx_w, 8)
1111 GEN_VEXT_VX(vwadd_vx_b, 2)
1112 GEN_VEXT_VX(vwadd_vx_h, 4)
1113 GEN_VEXT_VX(vwadd_vx_w, 8)
1114 GEN_VEXT_VX(vwsub_vx_b, 2)
1115 GEN_VEXT_VX(vwsub_vx_h, 4)
1116 GEN_VEXT_VX(vwsub_vx_w, 8)
1117 GEN_VEXT_VX(vwaddu_wx_b, 2)
1118 GEN_VEXT_VX(vwaddu_wx_h, 4)
1119 GEN_VEXT_VX(vwaddu_wx_w, 8)
1120 GEN_VEXT_VX(vwsubu_wx_b, 2)
1121 GEN_VEXT_VX(vwsubu_wx_h, 4)
1122 GEN_VEXT_VX(vwsubu_wx_w, 8)
1123 GEN_VEXT_VX(vwadd_wx_b, 2)
1124 GEN_VEXT_VX(vwadd_wx_h, 4)
1125 GEN_VEXT_VX(vwadd_wx_w, 8)
1126 GEN_VEXT_VX(vwsub_wx_b, 2)
1127 GEN_VEXT_VX(vwsub_wx_h, 4)
1128 GEN_VEXT_VX(vwsub_wx_w, 8)
1129 
1130 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1131 #define DO_VADC(N, M, C) (N + M + C)
1132 #define DO_VSBC(N, M, C) (N - M - C)
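/* The mask register v0 supplies the per-element carry/borrow-in C. */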
1133 
1134 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1135 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1136                   CPURISCVState *env, uint32_t desc)          \
1137 {                                                             \
1138     uint32_t vl = env->vl;                                    \
1139     uint32_t esz = sizeof(ETYPE);                             \
1140     uint32_t total_elems =                                    \
1141         vext_get_total_elems(env, desc, esz);                 \
1142     uint32_t vta = vext_vta(desc);                            \
1143     uint32_t i;                                               \
1144                                                               \
1145     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1146                                                               \
1147     for (i = env->vstart; i < vl; i++) {                      \
1148         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1149         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1150         ETYPE carry = vext_elem_mask(v0, i);                  \
1151                                                               \
1152         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1153     }                                                         \
1154     env->vstart = 0;                                          \
1155     /* set tail elements to 1s */                             \
1156     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1157 }
1158 
1159 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1160 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1161 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1162 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1163 
1164 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1165 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1166 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1167 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1168 
1169 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1170 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1171                   CPURISCVState *env, uint32_t desc)                     \
1172 {                                                                        \
1173     uint32_t vl = env->vl;                                               \
1174     uint32_t esz = sizeof(ETYPE);                                        \
1175     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1176     uint32_t vta = vext_vta(desc);                                       \
1177     uint32_t i;                                                          \
1178                                                                          \
1179     VSTART_CHECK_EARLY_EXIT(env, vl);                                    \
1180                                                                          \
1181     for (i = env->vstart; i < vl; i++) {                                 \
1182         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1183         ETYPE carry = vext_elem_mask(v0, i);                             \
1184                                                                          \
1185         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1186     }                                                                    \
1187     env->vstart = 0;                                                     \
1188     /* set tail elements to 1s */                                        \
1189     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1190 }
1191 
1192 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1193 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1194 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1195 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1196 
1197 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1198 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1199 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1200 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1201 
1202 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1203                           (__typeof(N))(N + M) < N)
1204 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
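/*
 * DO_MADC/DO_MSBC yield the carry-out of N + M + C and the borrow-out of
 * N - M - C respectively, detected via unsigned wrap-around comparisons.
 */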
1205 
1206 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1207 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1208                   CPURISCVState *env, uint32_t desc)          \
1209 {                                                             \
1210     uint32_t vl = env->vl;                                    \
1211     uint32_t vm = vext_vm(desc);                              \
1212     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1213     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1214     uint32_t i;                                               \
1215                                                               \
1216     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1217                                                               \
1218     for (i = env->vstart; i < vl; i++) {                      \
1219         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1220         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1221         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1222         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1223     }                                                         \
1224     env->vstart = 0;                                          \
1225     /*
1226      * the mask destination register is always tail-agnostic, so
1227      * set tail elements to 1s
1228      */                                                       \
1229     if (vta_all_1s) {                                         \
1230         for (; i < total_elems; i++) {                        \
1231             vext_set_elem_mask(vd, i, 1);                     \
1232         }                                                     \
1233     }                                                         \
1234 }
1235 
1236 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1237 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1238 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1239 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1240 
1241 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1242 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1243 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1244 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1245 
1246 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1247 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1248                   void *vs2, CPURISCVState *env, uint32_t desc) \
1249 {                                                               \
1250     uint32_t vl = env->vl;                                      \
1251     uint32_t vm = vext_vm(desc);                                \
1252     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1253     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1254     uint32_t i;                                                 \
1255                                                                 \
1256     VSTART_CHECK_EARLY_EXIT(env, vl);                           \
1257                                                                 \
1258     for (i = env->vstart; i < vl; i++) {                        \
1259         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1260         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1261         vext_set_elem_mask(vd, i,                               \
1262                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1263     }                                                           \
1264     env->vstart = 0;                                            \
1265     /*
1266      * mask destination register is always tail-agnostic
1267      * set tail elements to 1s
1268      */                                                         \
1269     if (vta_all_1s) {                                           \
1270         for (; i < total_elems; i++) {                          \
1271             vext_set_elem_mask(vd, i, 1);                       \
1272         }                                                       \
1273     }                                                           \
1274 }
1275 
1276 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1277 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1278 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1279 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1280 
1281 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1282 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1283 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1284 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1285 
1286 /* Vector Bitwise Logical Instructions */
1287 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1288 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1289 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1290 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1291 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1292 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1293 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1294 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1295 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1296 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1297 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1298 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1299 GEN_VEXT_VV(vand_vv_b, 1)
1300 GEN_VEXT_VV(vand_vv_h, 2)
1301 GEN_VEXT_VV(vand_vv_w, 4)
1302 GEN_VEXT_VV(vand_vv_d, 8)
1303 GEN_VEXT_VV(vor_vv_b, 1)
1304 GEN_VEXT_VV(vor_vv_h, 2)
1305 GEN_VEXT_VV(vor_vv_w, 4)
1306 GEN_VEXT_VV(vor_vv_d, 8)
1307 GEN_VEXT_VV(vxor_vv_b, 1)
1308 GEN_VEXT_VV(vxor_vv_h, 2)
1309 GEN_VEXT_VV(vxor_vv_w, 4)
1310 GEN_VEXT_VV(vxor_vv_d, 8)
1311 
1312 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1313 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1314 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1315 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1316 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1317 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1318 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1319 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1320 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1321 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1322 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1323 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1324 GEN_VEXT_VX(vand_vx_b, 1)
1325 GEN_VEXT_VX(vand_vx_h, 2)
1326 GEN_VEXT_VX(vand_vx_w, 4)
1327 GEN_VEXT_VX(vand_vx_d, 8)
1328 GEN_VEXT_VX(vor_vx_b, 1)
1329 GEN_VEXT_VX(vor_vx_h, 2)
1330 GEN_VEXT_VX(vor_vx_w, 4)
1331 GEN_VEXT_VX(vor_vx_d, 8)
1332 GEN_VEXT_VX(vxor_vx_b, 1)
1333 GEN_VEXT_VX(vxor_vx_h, 2)
1334 GEN_VEXT_VX(vxor_vx_w, 4)
1335 GEN_VEXT_VX(vxor_vx_d, 8)
1336 
1337 /* Vector Single-Width Bit Shift Instructions */
1338 #define DO_SLL(N, M)  (N << (M))
1339 #define DO_SRL(N, M)  (N >> (M))
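/*
 * The shift helpers below apply the MASK argument (0x7/0xf/0x1f/0x3f)
 * to the shift amount, matching the RVV rule that only the low
 * log2(SEW) bits of the shift operand are used: e.g. for SEW=8 a
 * shift amount of 9 behaves as 9 & 0x7 == 1.
 */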
1340 
1341 /* generate the helpers for shift instructions with two vector operands */
1342 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1343 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1344                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1345 {                                                                         \
1346     uint32_t vm = vext_vm(desc);                                          \
1347     uint32_t vl = env->vl;                                                \
1348     uint32_t esz = sizeof(TS1);                                           \
1349     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1350     uint32_t vta = vext_vta(desc);                                        \
1351     uint32_t vma = vext_vma(desc);                                        \
1352     uint32_t i;                                                           \
1353                                                                           \
1354     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
1355                                                                           \
1356     for (i = env->vstart; i < vl; i++) {                                  \
1357         if (!vm && !vext_elem_mask(v0, i)) {                              \
1358             /* set masked-off elements to 1s */                           \
1359             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1360             continue;                                                     \
1361         }                                                                 \
1362         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1363         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1364         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1365     }                                                                     \
1366     env->vstart = 0;                                                      \
1367     /* set tail elements to 1s */                                         \
1368     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1369 }
1370 
1371 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1372 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1373 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1374 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1375 
1376 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1377 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1378 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1379 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1380 
1381 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1382 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1383 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1384 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
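/*
 * vsra reuses DO_SRL: instantiating with a signed TS2 makes ">>"
 * operate on a signed value, which the compilers supported by QEMU
 * treat as an arithmetic (sign-propagating) shift.  E.g. for vsra on
 * e8, (int8_t)-8 >> 1 gives -4 (0xfc), whereas vsrl on the same bit
 * pattern computes 0xf8 >> 1 == 0x7c.
 */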
1385 
1386 /*
1387  * generate the helpers for shift instructions with one vector and one scalar
1388  */
1389 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1390 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1391                   void *vs2, CPURISCVState *env,            \
1392                   uint32_t desc)                            \
1393 {                                                           \
1394     uint32_t vm = vext_vm(desc);                            \
1395     uint32_t vl = env->vl;                                  \
1396     uint32_t esz = sizeof(TD);                              \
1397     uint32_t total_elems =                                  \
1398         vext_get_total_elems(env, desc, esz);               \
1399     uint32_t vta = vext_vta(desc);                          \
1400     uint32_t vma = vext_vma(desc);                          \
1401     uint32_t i;                                             \
1402                                                             \
1403     VSTART_CHECK_EARLY_EXIT(env, vl);                       \
1404                                                             \
1405     for (i = env->vstart; i < vl; i++) {                    \
1406         if (!vm && !vext_elem_mask(v0, i)) {                \
1407             /* set masked-off elements to 1s */             \
1408             vext_set_elems_1s(vd, vma, i * esz,             \
1409                               (i + 1) * esz);               \
1410             continue;                                       \
1411         }                                                   \
1412         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1413         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1414     }                                                       \
1415     env->vstart = 0;                                        \
1416     /* set tail elements to 1s */                           \
1417     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1418 }
1419 
1420 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1421 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1422 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1423 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1424 
1425 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1426 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1427 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1428 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1429 
1430 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1431 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1432 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1433 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1434 
1435 /* Vector Narrowing Integer Right Shift Instructions */
1436 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1437 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1438 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1439 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1440 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1441 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1442 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1443 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1444 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1445 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1446 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1447 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
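/*
 * The narrowing variants reuse the same macros with a 2*SEW-wide
 * source type for vs2 and a SEW-wide destination: the shift amount is
 * masked with 2*SEW - 1 (0xf/0x1f/0x3f) and the shifted value is
 * truncated to SEW bits by the store.  E.g. vnsrl_wv_b reads uint16_t
 * elements from vs2, shifts by s1 & 0xf, and keeps the low 8 bits.
 */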
1448 
1449 /* Vector Integer Comparison Instructions */
1450 #define DO_MSEQ(N, M) (N == M)
1451 #define DO_MSNE(N, M) (N != M)
1452 #define DO_MSLT(N, M) (N < M)
1453 #define DO_MSLE(N, M) (N <= M)
1454 #define DO_MSGT(N, M) (N > M)
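/*
 * Signedness is selected purely by the ETYPE used at instantiation
 * (vmsltu/vmsleu use uint*_t, vmslt/vmsle use int*_t).  Only the _vx
 * helpers instantiate DO_MSGT: RVV defines no vmsgt.vv/vmsgtu.vv
 * encodings, so vector-vector greater-than is expressed as
 * vmslt/vmsltu with the operands swapped.
 */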
1455 
1456 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1457 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1458                   CPURISCVState *env, uint32_t desc)          \
1459 {                                                             \
1460     uint32_t vm = vext_vm(desc);                              \
1461     uint32_t vl = env->vl;                                    \
1462     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1463     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1464     uint32_t vma = vext_vma(desc);                            \
1465     uint32_t i;                                               \
1466                                                               \
1467     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
1468                                                               \
1469     for (i = env->vstart; i < vl; i++) {                      \
1470         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1471         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1472         if (!vm && !vext_elem_mask(v0, i)) {                  \
1473             /* set masked-off elements to 1s */               \
1474             if (vma) {                                        \
1475                 vext_set_elem_mask(vd, i, 1);                 \
1476             }                                                 \
1477             continue;                                         \
1478         }                                                     \
1479         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1480     }                                                         \
1481     env->vstart = 0;                                          \
1482     /*
1483      * mask destination register is always tail-agnostic
1484      * set tail elements to 1s
1485      */                                                       \
1486     if (vta_all_1s) {                                         \
1487         for (; i < total_elems; i++) {                        \
1488             vext_set_elem_mask(vd, i, 1);                     \
1489         }                                                     \
1490     }                                                         \
1491 }
1492 
1493 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1494 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1495 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1496 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1497 
1498 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1499 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1500 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1501 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1502 
1503 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1504 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1505 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1506 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1507 
1508 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1509 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1510 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1511 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1512 
1513 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1514 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1515 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1516 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1517 
1518 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1519 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1520 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1521 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1522 
1523 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1524 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1525                   CPURISCVState *env, uint32_t desc)                \
1526 {                                                                   \
1527     uint32_t vm = vext_vm(desc);                                    \
1528     uint32_t vl = env->vl;                                          \
1529     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1530     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1531     uint32_t vma = vext_vma(desc);                                  \
1532     uint32_t i;                                                     \
1533                                                                     \
1534     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
1535                                                                     \
1536     for (i = env->vstart; i < vl; i++) {                            \
1537         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1538         if (!vm && !vext_elem_mask(v0, i)) {                        \
1539             /* set masked-off elements to 1s */                     \
1540             if (vma) {                                              \
1541                 vext_set_elem_mask(vd, i, 1);                       \
1542             }                                                       \
1543             continue;                                               \
1544         }                                                           \
1545         vext_set_elem_mask(vd, i,                                   \
1546                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1547     }                                                               \
1548     env->vstart = 0;                                                \
1549     /*
1550      * mask destination register is always tail-agnostic
1551      * set tail elements to 1s
1552      */                                                             \
1553     if (vta_all_1s) {                                               \
1554         for (; i < total_elems; i++) {                              \
1555             vext_set_elem_mask(vd, i, 1);                           \
1556         }                                                           \
1557     }                                                               \
1558 }
1559 
1560 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1561 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1562 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1563 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1564 
1565 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1566 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1567 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1568 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1569 
1570 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1571 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1572 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1573 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1574 
1575 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1576 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1577 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1578 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1579 
1580 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1581 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1582 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1583 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1584 
1585 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1586 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1587 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1588 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1589 
1590 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1591 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1592 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1593 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1594 
1595 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1596 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1597 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1598 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1599 
1600 /* Vector Integer Min/Max Instructions */
1601 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1602 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1603 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1604 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1605 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1606 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1607 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1608 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1609 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1610 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1611 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1612 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1613 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1614 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1615 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1616 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1617 GEN_VEXT_VV(vminu_vv_b, 1)
1618 GEN_VEXT_VV(vminu_vv_h, 2)
1619 GEN_VEXT_VV(vminu_vv_w, 4)
1620 GEN_VEXT_VV(vminu_vv_d, 8)
1621 GEN_VEXT_VV(vmin_vv_b, 1)
1622 GEN_VEXT_VV(vmin_vv_h, 2)
1623 GEN_VEXT_VV(vmin_vv_w, 4)
1624 GEN_VEXT_VV(vmin_vv_d, 8)
1625 GEN_VEXT_VV(vmaxu_vv_b, 1)
1626 GEN_VEXT_VV(vmaxu_vv_h, 2)
1627 GEN_VEXT_VV(vmaxu_vv_w, 4)
1628 GEN_VEXT_VV(vmaxu_vv_d, 8)
1629 GEN_VEXT_VV(vmax_vv_b, 1)
1630 GEN_VEXT_VV(vmax_vv_h, 2)
1631 GEN_VEXT_VV(vmax_vv_w, 4)
1632 GEN_VEXT_VV(vmax_vv_d, 8)
1633 
1634 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1635 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1636 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1637 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1638 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1639 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1640 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1641 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1642 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1643 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1644 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1645 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1646 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1647 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1648 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1649 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1650 GEN_VEXT_VX(vminu_vx_b, 1)
1651 GEN_VEXT_VX(vminu_vx_h, 2)
1652 GEN_VEXT_VX(vminu_vx_w, 4)
1653 GEN_VEXT_VX(vminu_vx_d, 8)
1654 GEN_VEXT_VX(vmin_vx_b, 1)
1655 GEN_VEXT_VX(vmin_vx_h, 2)
1656 GEN_VEXT_VX(vmin_vx_w, 4)
1657 GEN_VEXT_VX(vmin_vx_d, 8)
1658 GEN_VEXT_VX(vmaxu_vx_b, 1)
1659 GEN_VEXT_VX(vmaxu_vx_h, 2)
1660 GEN_VEXT_VX(vmaxu_vx_w, 4)
1661 GEN_VEXT_VX(vmaxu_vx_d, 8)
1662 GEN_VEXT_VX(vmax_vx_b, 1)
1663 GEN_VEXT_VX(vmax_vx_h, 2)
1664 GEN_VEXT_VX(vmax_vx_w, 4)
1665 GEN_VEXT_VX(vmax_vx_d, 8)
1666 
1667 /* Vector Single-Width Integer Multiply Instructions */
1668 #define DO_MUL(N, M) (N * M)
1669 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1670 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1671 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1672 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1673 GEN_VEXT_VV(vmul_vv_b, 1)
1674 GEN_VEXT_VV(vmul_vv_h, 2)
1675 GEN_VEXT_VV(vmul_vv_w, 4)
1676 GEN_VEXT_VV(vmul_vv_d, 8)
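/*
 * A single vmul covers both signed and unsigned multiplication: the
 * low SEW bits of the product are identical either way.  Only the
 * high-half results need the distinct do_mulh (signed), do_mulhu
 * (unsigned) and do_mulhsu (signed * unsigned) helpers below.
 */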
1677 
1678 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1679 {
1680     return (int16_t)s2 * (int16_t)s1 >> 8;
1681 }
1682 
1683 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1684 {
1685     return (int32_t)s2 * (int32_t)s1 >> 16;
1686 }
1687 
1688 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1689 {
1690     return (int64_t)s2 * (int64_t)s1 >> 32;
1691 }
1692 
1693 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1694 {
1695     uint64_t hi_64, lo_64;
1696 
1697     muls64(&lo_64, &hi_64, s1, s2);
1698     return hi_64;
1699 }
1700 
1701 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1702 {
1703     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1704 }
1705 
1706 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1707 {
1708     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1709 }
1710 
1711 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1712 {
1713     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1714 }
1715 
1716 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1717 {
1718     uint64_t hi_64, lo_64;
1719 
1720     mulu64(&lo_64, &hi_64, s2, s1);
1721     return hi_64;
1722 }
1723 
1724 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1725 {
1726     return (int16_t)s2 * (uint16_t)s1 >> 8;
1727 }
1728 
1729 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1730 {
1731     return (int32_t)s2 * (uint32_t)s1 >> 16;
1732 }
1733 
1734 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1735 {
1736     return (int64_t)s2 * (uint64_t)s1 >> 32;
1737 }
1738 
1739 /*
1740  * Let  A = signed operand (s2),
1741  *      B = unsigned operand (s1),
1742  *      P = mulu64(A, B), the unsigned 128-bit product of the
1743  *          unsigned reinterpretation of A with B.
1744  *
1745  * If A >= 0, the unsigned reinterpretation equals A, so the
1746  * signed product SP = A * B = P.
1747  *
1748  * If A < 0, the unsigned reinterpretation of A is A + 2 ** 64, so
1749  *      P  = (A + 2 ** 64) * B = A * B + 2 ** 64 * B
1750  *      SP = A * B = P - 2 ** 64 * B
1751  *
1752  * The low 64 bits of SP and P are therefore identical, and the
1753  * high 64 bits of SP are the high 64 bits of P minus B.
1754  *
1755  * Hence: HI_P -= (A < 0 ? B : 0)
1756  */
1757 
1758 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1759 {
1760     uint64_t hi_64, lo_64;
1761 
1762     mulu64(&lo_64, &hi_64, s2, s1);
1763 
1764     hi_64 -= s2 < 0 ? s1 : 0;
1765     return hi_64;
1766 }
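/*
 * Sanity check of the adjustment above: for s2 = -1, s1 = 2, mulu64
 * sees 0xffffffffffffffff * 2 and returns hi_64 = 1,
 * lo_64 = 0xfffffffffffffffe.  Subtracting s1 gives hi_64 = -1, so the
 * 128-bit result is all-ones:0xfffffffffffffffe, i.e. -2, the correct
 * signed * unsigned product.
 */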
1767 
1768 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1769 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1770 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1771 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1772 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1773 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1774 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1775 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1776 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1777 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1778 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1779 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1780 GEN_VEXT_VV(vmulh_vv_b, 1)
1781 GEN_VEXT_VV(vmulh_vv_h, 2)
1782 GEN_VEXT_VV(vmulh_vv_w, 4)
1783 GEN_VEXT_VV(vmulh_vv_d, 8)
1784 GEN_VEXT_VV(vmulhu_vv_b, 1)
1785 GEN_VEXT_VV(vmulhu_vv_h, 2)
1786 GEN_VEXT_VV(vmulhu_vv_w, 4)
1787 GEN_VEXT_VV(vmulhu_vv_d, 8)
1788 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1789 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1790 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1791 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1792 
1793 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1794 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1795 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1796 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1797 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1798 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1799 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1800 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1801 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1802 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1803 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1804 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1805 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1806 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1807 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1808 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1809 GEN_VEXT_VX(vmul_vx_b, 1)
1810 GEN_VEXT_VX(vmul_vx_h, 2)
1811 GEN_VEXT_VX(vmul_vx_w, 4)
1812 GEN_VEXT_VX(vmul_vx_d, 8)
1813 GEN_VEXT_VX(vmulh_vx_b, 1)
1814 GEN_VEXT_VX(vmulh_vx_h, 2)
1815 GEN_VEXT_VX(vmulh_vx_w, 4)
1816 GEN_VEXT_VX(vmulh_vx_d, 8)
1817 GEN_VEXT_VX(vmulhu_vx_b, 1)
1818 GEN_VEXT_VX(vmulhu_vx_h, 2)
1819 GEN_VEXT_VX(vmulhu_vx_w, 4)
1820 GEN_VEXT_VX(vmulhu_vx_d, 8)
1821 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1822 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1823 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1824 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1825 
1826 /* Vector Integer Divide Instructions */
1827 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1828 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1829 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1830         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1831 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1832         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
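/*
 * These macros follow the RISC-V division conventions: dividing by
 * zero yields all ones for div/divu and leaves the dividend unchanged
 * for rem/remu, while the signed overflow case INT_MIN / -1 yields
 * INT_MIN with remainder 0.  "N == -N" detects INT_MIN without naming
 * a width-specific constant (0 also satisfies it, but 0 / -1 is 0, so
 * returning N is still correct in that case).
 */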
1833 
1834 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1835 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1836 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1837 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1838 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1839 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1840 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1841 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1842 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1843 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1844 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1845 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1846 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1847 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1848 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1849 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1850 GEN_VEXT_VV(vdivu_vv_b, 1)
1851 GEN_VEXT_VV(vdivu_vv_h, 2)
1852 GEN_VEXT_VV(vdivu_vv_w, 4)
1853 GEN_VEXT_VV(vdivu_vv_d, 8)
1854 GEN_VEXT_VV(vdiv_vv_b, 1)
1855 GEN_VEXT_VV(vdiv_vv_h, 2)
1856 GEN_VEXT_VV(vdiv_vv_w, 4)
1857 GEN_VEXT_VV(vdiv_vv_d, 8)
1858 GEN_VEXT_VV(vremu_vv_b, 1)
1859 GEN_VEXT_VV(vremu_vv_h, 2)
1860 GEN_VEXT_VV(vremu_vv_w, 4)
1861 GEN_VEXT_VV(vremu_vv_d, 8)
1862 GEN_VEXT_VV(vrem_vv_b, 1)
1863 GEN_VEXT_VV(vrem_vv_h, 2)
1864 GEN_VEXT_VV(vrem_vv_w, 4)
1865 GEN_VEXT_VV(vrem_vv_d, 8)
1866 
1867 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1868 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1869 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1870 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1871 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1872 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1873 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1874 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1875 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1876 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1877 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1878 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1879 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1880 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1881 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1882 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1883 GEN_VEXT_VX(vdivu_vx_b, 1)
1884 GEN_VEXT_VX(vdivu_vx_h, 2)
1885 GEN_VEXT_VX(vdivu_vx_w, 4)
1886 GEN_VEXT_VX(vdivu_vx_d, 8)
1887 GEN_VEXT_VX(vdiv_vx_b, 1)
1888 GEN_VEXT_VX(vdiv_vx_h, 2)
1889 GEN_VEXT_VX(vdiv_vx_w, 4)
1890 GEN_VEXT_VX(vdiv_vx_d, 8)
1891 GEN_VEXT_VX(vremu_vx_b, 1)
1892 GEN_VEXT_VX(vremu_vx_h, 2)
1893 GEN_VEXT_VX(vremu_vx_w, 4)
1894 GEN_VEXT_VX(vremu_vx_d, 8)
1895 GEN_VEXT_VX(vrem_vx_b, 1)
1896 GEN_VEXT_VX(vrem_vx_h, 2)
1897 GEN_VEXT_VX(vrem_vx_w, 4)
1898 GEN_VEXT_VX(vrem_vx_d, 8)
1899 
1900 /* Vector Widening Integer Multiply Instructions */
1901 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1902 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1903 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1904 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1905 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1906 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1907 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1908 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1909 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1910 GEN_VEXT_VV(vwmul_vv_b, 2)
1911 GEN_VEXT_VV(vwmul_vv_h, 4)
1912 GEN_VEXT_VV(vwmul_vv_w, 8)
1913 GEN_VEXT_VV(vwmulu_vv_b, 2)
1914 GEN_VEXT_VV(vwmulu_vv_h, 4)
1915 GEN_VEXT_VV(vwmulu_vv_w, 8)
1916 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1917 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1918 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1919 
1920 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1921 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1922 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1923 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1924 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1925 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1926 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1927 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1928 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1929 GEN_VEXT_VX(vwmul_vx_b, 2)
1930 GEN_VEXT_VX(vwmul_vx_h, 4)
1931 GEN_VEXT_VX(vwmul_vx_w, 8)
1932 GEN_VEXT_VX(vwmulu_vx_b, 2)
1933 GEN_VEXT_VX(vwmulu_vx_h, 4)
1934 GEN_VEXT_VX(vwmulu_vx_w, 8)
1935 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1936 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1937 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1938 
1939 /* Vector Single-Width Integer Multiply-Add Instructions */
1940 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1941 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1942 {                                                                  \
1943     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1944     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1945     TD d = *((TD *)vd + HD(i));                                    \
1946     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1947 }
1948 
1949 #define DO_MACC(N, M, D) (M * N + D)
1950 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1951 #define DO_MADD(N, M, D) (M * D + N)
1952 #define DO_NMSUB(N, M, D) (-(M * D) + N)
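/*
 * OPIVV3/OPIVX3 pass the arguments as OP(s2, s1, d), i.e. N = vs2[i],
 * M = vs1[i] (or the x register), D = vd[i], so these expand to:
 *   DO_MACC:  vd[i] =  (vs1[i] * vs2[i]) + vd[i]   (vmacc)
 *   DO_NMSAC: vd[i] = -(vs1[i] * vs2[i]) + vd[i]   (vnmsac)
 *   DO_MADD:  vd[i] =  (vs1[i] * vd[i]) + vs2[i]   (vmadd)
 *   DO_NMSUB: vd[i] = -(vs1[i] * vd[i]) + vs2[i]   (vnmsub)
 */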
1953 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1954 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1955 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1956 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1957 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1958 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1959 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1960 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1961 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1962 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1963 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1964 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1965 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1966 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1967 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1968 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1969 GEN_VEXT_VV(vmacc_vv_b, 1)
1970 GEN_VEXT_VV(vmacc_vv_h, 2)
1971 GEN_VEXT_VV(vmacc_vv_w, 4)
1972 GEN_VEXT_VV(vmacc_vv_d, 8)
1973 GEN_VEXT_VV(vnmsac_vv_b, 1)
1974 GEN_VEXT_VV(vnmsac_vv_h, 2)
1975 GEN_VEXT_VV(vnmsac_vv_w, 4)
1976 GEN_VEXT_VV(vnmsac_vv_d, 8)
1977 GEN_VEXT_VV(vmadd_vv_b, 1)
1978 GEN_VEXT_VV(vmadd_vv_h, 2)
1979 GEN_VEXT_VV(vmadd_vv_w, 4)
1980 GEN_VEXT_VV(vmadd_vv_d, 8)
1981 GEN_VEXT_VV(vnmsub_vv_b, 1)
1982 GEN_VEXT_VV(vnmsub_vv_h, 2)
1983 GEN_VEXT_VV(vnmsub_vv_w, 4)
1984 GEN_VEXT_VV(vnmsub_vv_d, 8)
1985 
1986 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1987 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1988 {                                                                   \
1989     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1990     TD d = *((TD *)vd + HD(i));                                     \
1991     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1992 }
1993 
1994 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1995 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1996 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1997 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1998 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1999 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
2000 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
2001 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
2002 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
2003 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
2004 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
2005 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
2006 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
2007 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
2008 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
2009 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
2010 GEN_VEXT_VX(vmacc_vx_b, 1)
2011 GEN_VEXT_VX(vmacc_vx_h, 2)
2012 GEN_VEXT_VX(vmacc_vx_w, 4)
2013 GEN_VEXT_VX(vmacc_vx_d, 8)
2014 GEN_VEXT_VX(vnmsac_vx_b, 1)
2015 GEN_VEXT_VX(vnmsac_vx_h, 2)
2016 GEN_VEXT_VX(vnmsac_vx_w, 4)
2017 GEN_VEXT_VX(vnmsac_vx_d, 8)
2018 GEN_VEXT_VX(vmadd_vx_b, 1)
2019 GEN_VEXT_VX(vmadd_vx_h, 2)
2020 GEN_VEXT_VX(vmadd_vx_w, 4)
2021 GEN_VEXT_VX(vmadd_vx_d, 8)
2022 GEN_VEXT_VX(vnmsub_vx_b, 1)
2023 GEN_VEXT_VX(vnmsub_vx_h, 2)
2024 GEN_VEXT_VX(vnmsub_vx_w, 4)
2025 GEN_VEXT_VX(vnmsub_vx_d, 8)
2026 
2027 /* Vector Widening Integer Multiply-Add Instructions */
2028 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
2029 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
2030 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
2031 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
2032 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
2033 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
2034 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
2035 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
2036 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
2037 GEN_VEXT_VV(vwmaccu_vv_b, 2)
2038 GEN_VEXT_VV(vwmaccu_vv_h, 4)
2039 GEN_VEXT_VV(vwmaccu_vv_w, 8)
2040 GEN_VEXT_VV(vwmacc_vv_b, 2)
2041 GEN_VEXT_VV(vwmacc_vv_h, 4)
2042 GEN_VEXT_VV(vwmacc_vv_w, 8)
2043 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
2044 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
2045 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
2046 
2047 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
2048 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
2049 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2050 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2051 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2052 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2053 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2054 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2055 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2056 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2057 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2058 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2059 GEN_VEXT_VX(vwmaccu_vx_b, 2)
2060 GEN_VEXT_VX(vwmaccu_vx_h, 4)
2061 GEN_VEXT_VX(vwmaccu_vx_w, 8)
2062 GEN_VEXT_VX(vwmacc_vx_b, 2)
2063 GEN_VEXT_VX(vwmacc_vx_h, 4)
2064 GEN_VEXT_VX(vwmacc_vx_w, 8)
2065 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2066 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2067 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2068 GEN_VEXT_VX(vwmaccus_vx_b, 2)
2069 GEN_VEXT_VX(vwmaccus_vx_h, 4)
2070 GEN_VEXT_VX(vwmaccus_vx_w, 8)
2071 
2072 /* Vector Integer Merge and Move Instructions */
2073 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2074 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2075                   uint32_t desc)                                     \
2076 {                                                                    \
2077     uint32_t vl = env->vl;                                           \
2078     uint32_t esz = sizeof(ETYPE);                                    \
2079     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2080     uint32_t vta = vext_vta(desc);                                   \
2081     uint32_t i;                                                      \
2082                                                                      \
2083     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2084                                                                      \
2085     for (i = env->vstart; i < vl; i++) {                             \
2086         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2087         *((ETYPE *)vd + H(i)) = s1;                                  \
2088     }                                                                \
2089     env->vstart = 0;                                                 \
2090     /* set tail elements to 1s */                                    \
2091     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2092 }
2093 
2094 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2095 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2096 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2097 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2098 
2099 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2100 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2101                   uint32_t desc)                                     \
2102 {                                                                    \
2103     uint32_t vl = env->vl;                                           \
2104     uint32_t esz = sizeof(ETYPE);                                    \
2105     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2106     uint32_t vta = vext_vta(desc);                                   \
2107     uint32_t i;                                                      \
2108                                                                      \
2109     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2110                                                                      \
2111     for (i = env->vstart; i < vl; i++) {                             \
2112         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2113     }                                                                \
2114     env->vstart = 0;                                                 \
2115     /* set tail elements to 1s */                                    \
2116     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2117 }
2118 
2119 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2120 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2121 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2122 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2123 
2124 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2125 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2126                   CPURISCVState *env, uint32_t desc)                 \
2127 {                                                                    \
2128     uint32_t vl = env->vl;                                           \
2129     uint32_t esz = sizeof(ETYPE);                                    \
2130     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2131     uint32_t vta = vext_vta(desc);                                   \
2132     uint32_t i;                                                      \
2133                                                                      \
2134     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2135                                                                      \
2136     for (i = env->vstart; i < vl; i++) {                             \
2137         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2138         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2139     }                                                                \
2140     env->vstart = 0;                                                 \
2141     /* set tail elements to 1s */                                    \
2142     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2143 }
2144 
2145 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2146 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2147 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2148 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2149 
2150 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2151 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2152                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2153 {                                                                    \
2154     uint32_t vl = env->vl;                                           \
2155     uint32_t esz = sizeof(ETYPE);                                    \
2156     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2157     uint32_t vta = vext_vta(desc);                                   \
2158     uint32_t i;                                                      \
2159                                                                      \
2160     VSTART_CHECK_EARLY_EXIT(env, vl);                                \
2161                                                                      \
2162     for (i = env->vstart; i < vl; i++) {                             \
2163         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2164         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2165                    (ETYPE)(target_long)s1);                          \
2166         *((ETYPE *)vd + H(i)) = d;                                   \
2167     }                                                                \
2168     env->vstart = 0;                                                 \
2169     /* set tail elements to 1s */                                    \
2170     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2171 }
2172 
2173 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2174 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2175 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2176 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
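/*
 * vmv.v.v / vmv.v.x are unconditional copies (v0 is not consulted),
 * while the merge helpers select per element: a set mask bit picks
 * vs1[i] (or the scalar), a clear bit picks vs2[i].  E.g. with
 * v0 = ...0101 (element 0 in the LSB), vmerge.vvm produces
 * vs1[0], vs2[1], vs1[2], vs2[3], ...
 */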
2177 
2178 /*
2179  * Vector Fixed-Point Arithmetic Instructions
2180  */
2181 
2182 /* Vector Single-Width Saturating Add and Subtract */
2183 
2184 /*
2185  * Fixed-point instructions generally have a rounding mode and saturation,
2186  * so define common macros for fixed point here.
2187  */
2188 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2189                           CPURISCVState *env, int vxrm);
2190 
2191 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2192 static inline void                                                  \
2193 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2194           CPURISCVState *env, int vxrm)                             \
2195 {                                                                   \
2196     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2197     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2198     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2199 }
2200 
2201 static inline void
2202 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2203              CPURISCVState *env,
2204              uint32_t vl, uint32_t vm, int vxrm,
2205              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2206 {
2207     for (uint32_t i = env->vstart; i < vl; i++) {
2208         if (!vm && !vext_elem_mask(v0, i)) {
2209             /* set masked-off elements to 1s */
2210             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2211             continue;
2212         }
2213         fn(vd, vs1, vs2, i, env, vxrm);
2214     }
2215     env->vstart = 0;
2216 }
2217 
2218 static inline void
2219 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2220              CPURISCVState *env,
2221              uint32_t desc,
2222              opivv2_rm_fn *fn, uint32_t esz)
2223 {
2224     uint32_t vm = vext_vm(desc);
2225     uint32_t vl = env->vl;
2226     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2227     uint32_t vta = vext_vta(desc);
2228     uint32_t vma = vext_vma(desc);
2229 
2230     VSTART_CHECK_EARLY_EXIT(env, vl);
2231 
2232     switch (env->vxrm) {
2233     case 0: /* rnu */
2234         vext_vv_rm_1(vd, v0, vs1, vs2,
2235                      env, vl, vm, 0, fn, vma, esz);
2236         break;
2237     case 1: /* rne */
2238         vext_vv_rm_1(vd, v0, vs1, vs2,
2239                      env, vl, vm, 1, fn, vma, esz);
2240         break;
2241     case 2: /* rdn */
2242         vext_vv_rm_1(vd, v0, vs1, vs2,
2243                      env, vl, vm, 2, fn, vma, esz);
2244         break;
2245     default: /* rod */
2246         vext_vv_rm_1(vd, v0, vs1, vs2,
2247                      env, vl, vm, 3, fn, vma, esz);
2248         break;
2249     }
2250     /* set tail elements to 1s */
2251     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2252 }
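/*
 * The switch on env->vxrm looks redundant, since every arm forwards
 * the same arguments plus a literal rounding mode, but it hands
 * vext_vv_rm_1 a compile-time constant vxrm instead of a value
 * re-read from env, which presumably is meant to let the compiler
 * specialise the inlined loop per rounding mode.
 */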
2253 
2254 /* generate helpers for fixed point instructions with OPIVV format */
2255 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2256 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2257                   CPURISCVState *env, uint32_t desc)            \
2258 {                                                               \
2259     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2260                  do_##NAME, ESZ);                               \
2261 }
2262 
2263 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2264                              uint8_t b)
2265 {
2266     uint8_t res = a + b;
2267     if (res < a) {
2268         res = UINT8_MAX;
2269         env->vxsat = 0x1;
2270     }
2271     return res;
2272 }
2273 
2274 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2275                                uint16_t b)
2276 {
2277     uint16_t res = a + b;
2278     if (res < a) {
2279         res = UINT16_MAX;
2280         env->vxsat = 0x1;
2281     }
2282     return res;
2283 }
2284 
2285 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2286                                uint32_t b)
2287 {
2288     uint32_t res = a + b;
2289     if (res < a) {
2290         res = UINT32_MAX;
2291         env->vxsat = 0x1;
2292     }
2293     return res;
2294 }
2295 
2296 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2297                                uint64_t b)
2298 {
2299     uint64_t res = a + b;
2300     if (res < a) {
2301         res = UINT64_MAX;
2302         env->vxsat = 0x1;
2303     }
2304     return res;
2305 }
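/*
 * Unsigned saturating add detects overflow by wrap-around: the
 * truncated sum is smaller than an operand exactly when the addition
 * overflowed.  E.g. saddu8(env, vxrm, 250, 10) wraps to 4, 4 < 250,
 * so the result is clamped to UINT8_MAX and vxsat is set.  vxrm is
 * unused because saturating adds involve no rounding.
 */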
2306 
2307 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2308 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2309 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2310 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2311 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2312 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2313 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2314 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2315 
2316 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2317                           CPURISCVState *env, int vxrm);
2318 
2319 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2320 static inline void                                                  \
2321 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2322           CPURISCVState *env, int vxrm)                             \
2323 {                                                                   \
2324     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2325     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2326 }
2327 
2328 static inline void
2329 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2330              CPURISCVState *env,
2331              uint32_t vl, uint32_t vm, int vxrm,
2332              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2333 {
2334     for (uint32_t i = env->vstart; i < vl; i++) {
2335         if (!vm && !vext_elem_mask(v0, i)) {
2336             /* set masked-off elements to 1s */
2337             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2338             continue;
2339         }
2340         fn(vd, s1, vs2, i, env, vxrm);
2341     }
2342     env->vstart = 0;
2343 }
2344 
2345 static inline void
2346 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2347              CPURISCVState *env,
2348              uint32_t desc,
2349              opivx2_rm_fn *fn, uint32_t esz)
2350 {
2351     uint32_t vm = vext_vm(desc);
2352     uint32_t vl = env->vl;
2353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2354     uint32_t vta = vext_vta(desc);
2355     uint32_t vma = vext_vma(desc);
2356 
2357     VSTART_CHECK_EARLY_EXIT(env, vl);
2358 
2359     switch (env->vxrm) {
2360     case 0: /* rnu */
2361         vext_vx_rm_1(vd, v0, s1, vs2,
2362                      env, vl, vm, 0, fn, vma, esz);
2363         break;
2364     case 1: /* rne */
2365         vext_vx_rm_1(vd, v0, s1, vs2,
2366                      env, vl, vm, 1, fn, vma, esz);
2367         break;
2368     case 2: /* rdn */
2369         vext_vx_rm_1(vd, v0, s1, vs2,
2370                      env, vl, vm, 2, fn, vma, esz);
2371         break;
2372     default: /* rod */
2373         vext_vx_rm_1(vd, v0, s1, vs2,
2374                      env, vl, vm, 3, fn, vma, esz);
2375         break;
2376     }
2377     /* set tail elements to 1s */
2378     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2379 }
2380 
2381 /* generate helpers for fixed point instructions with OPIVX format */
2382 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2383 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2384                   void *vs2, CPURISCVState *env,          \
2385                   uint32_t desc)                          \
2386 {                                                         \
2387     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2388                  do_##NAME, ESZ);                         \
2389 }
2390 
2391 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2392 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2393 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2394 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2395 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2396 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2397 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2398 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2399 
2400 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2401 {
2402     int8_t res = a + b;
2403     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2404         res = a > 0 ? INT8_MAX : INT8_MIN;
2405         env->vxsat = 0x1;
2406     }
2407     return res;
2408 }
2409 
2410 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2411                              int16_t b)
2412 {
2413     int16_t res = a + b;
2414     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2415         res = a > 0 ? INT16_MAX : INT16_MIN;
2416         env->vxsat = 0x1;
2417     }
2418     return res;
2419 }
2420 
2421 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2422                              int32_t b)
2423 {
2424     int32_t res = a + b;
2425     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2426         res = a > 0 ? INT32_MAX : INT32_MIN;
2427         env->vxsat = 0x1;
2428     }
2429     return res;
2430 }
2431 
2432 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2433                              int64_t b)
2434 {
2435     int64_t res = a + b;
2436     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2437         res = a > 0 ? INT64_MAX : INT64_MIN;
2438         env->vxsat = 0x1;
2439     }
2440     return res;
2441 }
2442 
2443 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2444 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2445 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2446 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2447 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2448 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2449 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2450 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2451 
2452 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2453 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2454 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2455 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2456 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2457 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2458 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2459 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2460 
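/*
 * Unsigned saturating subtract: a borrow occurred iff the wrapped
 * result is larger than the minuend (res > a), in which case the
 * result is clamped to 0 and vxsat is set.
 */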
2461 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2462                              uint8_t b)
2463 {
2464     uint8_t res = a - b;
2465     if (res > a) {
2466         res = 0;
2467         env->vxsat = 0x1;
2468     }
2469     return res;
2470 }
2471 
2472 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2473                                uint16_t b)
2474 {
2475     uint16_t res = a - b;
2476     if (res > a) {
2477         res = 0;
2478         env->vxsat = 0x1;
2479     }
2480     return res;
2481 }
2482 
2483 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2484                                uint32_t b)
2485 {
2486     uint32_t res = a - b;
2487     if (res > a) {
2488         res = 0;
2489         env->vxsat = 0x1;
2490     }
2491     return res;
2492 }
2493 
2494 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2495                                uint64_t b)
2496 {
2497     uint64_t res = a - b;
2498     if (res > a) {
2499         res = 0;
2500         env->vxsat = 0x1;
2501     }
2502     return res;
2503 }
2504 
2505 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2506 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2507 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2508 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2509 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2510 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2511 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2512 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2513 
2514 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2515 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2516 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2517 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2518 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2519 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2520 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2521 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2522 
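/*
 * Signed saturating subtract.  Overflow has occurred iff the operands
 * have different signs and the difference's sign differs from a's,
 * i.e. (res ^ a) & (a ^ b) & INT_MIN is non-zero.  The result is then
 * clamped towards a's sign and vxsat is set.
 */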
2523 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2524 {
2525     int8_t res = a - b;
2526     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2527         res = a >= 0 ? INT8_MAX : INT8_MIN;
2528         env->vxsat = 0x1;
2529     }
2530     return res;
2531 }
2532 
2533 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2534                              int16_t b)
2535 {
2536     int16_t res = a - b;
2537     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2538         res = a >= 0 ? INT16_MAX : INT16_MIN;
2539         env->vxsat = 0x1;
2540     }
2541     return res;
2542 }
2543 
2544 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2545                              int32_t b)
2546 {
2547     int32_t res = a - b;
2548     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2549         res = a >= 0 ? INT32_MAX : INT32_MIN;
2550         env->vxsat = 0x1;
2551     }
2552     return res;
2553 }
2554 
2555 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2556                              int64_t b)
2557 {
2558     int64_t res = a - b;
2559     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2560         res = a >= 0 ? INT64_MAX : INT64_MIN;
2561         env->vxsat = 0x1;
2562     }
2563     return res;
2564 }
2565 
2566 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2567 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2568 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2569 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2570 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2571 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2572 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2573 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2574 
2575 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2576 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2577 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2578 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2579 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2580 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2581 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2582 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2583 
2584 /* Vector Single-Width Averaging Add and Subtract */
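/*
 * get_round() returns the rounding increment (0 or 1) to add after
 * discarding the low 'shift' bits of 'v', according to the fixed-point
 * rounding mode 'vxrm':
 *   0 (rnu): add 1 if the most significant discarded bit is set
 *   1 (rne): round to nearest, ties to even
 *   2 (rdn): truncate (never add 1)
 *   3 (rod): add 1 if the kept LSB is clear and any discarded bit is set
 * e.g. v = 0b0110, shift = 2: rnu and rne return 1, rdn and rod return 0.
 */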
2585 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2586 {
2587     uint8_t d = extract64(v, shift, 1);
2588     uint8_t d1;
2589     uint64_t D1, D2;
2590 
2591     if (shift == 0 || shift > 64) {
2592         return 0;
2593     }
2594 
2595     d1 = extract64(v, shift - 1, 1);
2596     D1 = extract64(v, 0, shift);
2597     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2598         return d1;
2599     } else if (vxrm == 1) { /* round-to-nearest-even */
2600         if (shift > 1) {
2601             D2 = extract64(v, 0, shift - 1);
2602             return d1 & ((D2 != 0) | d);
2603         } else {
2604             return d1 & d;
2605         }
2606     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2607         return !d & (D1 != 0);
2608     }
2609     return 0; /* round-down (truncate) */
2610 }
2611 
2612 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2613                              int32_t b)
2614 {
2615     int64_t res = (int64_t)a + b;
2616     uint8_t round = get_round(vxrm, res, 1);
2617 
2618     return (res >> 1) + round;
2619 }
2620 
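/*
 * 64-bit averaging add.  The 64-bit sum may wrap; when it does, bit 64
 * of the exact sum (the sign of the average) is the inverse of res's
 * bit 63, so XOR-ing (res >> 1) with 'over' restores the correct sign
 * before the rounding increment is added.
 */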
2621 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2622                              int64_t b)
2623 {
2624     int64_t res = a + b;
2625     uint8_t round = get_round(vxrm, res, 1);
2626     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2627 
2628     /* With signed overflow, bit 64 is inverse of bit 63. */
2629     return ((res >> 1) ^ over) + round;
2630 }
2631 
2632 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2633 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2634 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2635 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2636 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2637 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2638 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2639 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2640 
2641 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2642 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2643 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2644 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2645 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2646 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2647 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2648 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2649 
2650 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2651                                uint32_t a, uint32_t b)
2652 {
2653     uint64_t res = (uint64_t)a + b;
2654     uint8_t round = get_round(vxrm, res, 1);
2655 
2656     return (res >> 1) + round;
2657 }
2658 
2659 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2660                                uint64_t a, uint64_t b)
2661 {
2662     uint64_t res = a + b;
2663     uint8_t round = get_round(vxrm, res, 1);
2664     uint64_t over = (uint64_t)(res < a) << 63;
2665 
2666     return ((res >> 1) | over) + round;
2667 }
2668 
2669 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2670 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2671 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2672 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2673 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2674 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2675 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2676 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2677 
2678 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2679 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2680 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2681 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2682 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2683 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2684 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2685 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2686 
2687 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2688                              int32_t b)
2689 {
2690     int64_t res = (int64_t)a - b;
2691     uint8_t round = get_round(vxrm, res, 1);
2692 
2693     return (res >> 1) + round;
2694 }
2695 
2696 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2697                              int64_t b)
2698 {
2699     int64_t res = (int64_t)a - b;
2700     uint8_t round = get_round(vxrm, res, 1);
2701     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2702 
2703     /* With signed overflow, bit 64 is inverse of bit 63. */
2704     return ((res >> 1) ^ over) + round;
2705 }
2706 
2707 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2708 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2709 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2710 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2711 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2712 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2713 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2714 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2715 
2716 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2717 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2718 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2719 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2720 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2721 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2722 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2723 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2724 
2725 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2726                                uint32_t a, uint32_t b)
2727 {
2728     int64_t res = (int64_t)a - b;
2729     uint8_t round = get_round(vxrm, res, 1);
2730 
2731     return (res >> 1) + round;
2732 }
2733 
2734 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2735                                uint64_t a, uint64_t b)
2736 {
2737     uint64_t res = (uint64_t)a - b;
2738     uint8_t round = get_round(vxrm, res, 1);
2739     uint64_t over = (uint64_t)(res > a) << 63;
2740 
2741     return ((res >> 1) | over) + round;
2742 }
2743 
2744 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2745 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2746 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2747 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2748 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2749 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2750 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2751 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2752 
2753 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2754 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2755 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2756 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2757 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2758 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2759 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2760 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2761 
2762 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
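/*
 * vsmul: signed fractional multiply.  The double-width product is
 * shifted right by SEW-1 with rounding (get_round), i.e. the operands
 * are treated as fixed-point values with SEW-1 fraction bits.  The only
 * case that can still overflow after the shift is INT_MIN * INT_MIN,
 * which saturates to INT_MAX with vxsat set.
 */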
2763 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2764 {
2765     uint8_t round;
2766     int16_t res;
2767 
2768     res = (int16_t)a * (int16_t)b;
2769     round = get_round(vxrm, res, 7);
2770     res = (res >> 7) + round;
2771 
2772     if (res > INT8_MAX) {
2773         env->vxsat = 0x1;
2774         return INT8_MAX;
2775     } else if (res < INT8_MIN) {
2776         env->vxsat = 0x1;
2777         return INT8_MIN;
2778     } else {
2779         return res;
2780     }
2781 }
2782 
2783 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2784 {
2785     uint8_t round;
2786     int32_t res;
2787 
2788     res = (int32_t)a * (int32_t)b;
2789     round = get_round(vxrm, res, 15);
2790     res = (res >> 15) + round;
2791 
2792     if (res > INT16_MAX) {
2793         env->vxsat = 0x1;
2794         return INT16_MAX;
2795     } else if (res < INT16_MIN) {
2796         env->vxsat = 0x1;
2797         return INT16_MIN;
2798     } else {
2799         return res;
2800     }
2801 }
2802 
2803 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2804 {
2805     uint8_t round;
2806     int64_t res;
2807 
2808     res = (int64_t)a * (int64_t)b;
2809     round = get_round(vxrm, res, 31);
2810     res = (res >> 31) + round;
2811 
2812     if (res > INT32_MAX) {
2813         env->vxsat = 0x1;
2814         return INT32_MAX;
2815     } else if (res < INT32_MIN) {
2816         env->vxsat = 0x1;
2817         return INT32_MIN;
2818     } else {
2819         return res;
2820     }
2821 }
2822 
2823 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2824 {
2825     uint8_t round;
2826     uint64_t hi_64, lo_64;
2827     int64_t res;
2828 
2829     if (a == INT64_MIN && b == INT64_MIN) {
2830         env->vxsat = 1;
2831         return INT64_MAX;
2832     }
2833 
2834     muls64(&lo_64, &hi_64, a, b);
2835     round = get_round(vxrm, lo_64, 63);
2836     /*
2837      * Cannot overflow, as there are always
2838      * 2 sign bits after multiply.
2839      */
2840     res = (hi_64 << 1) | (lo_64 >> 63);
2841     if (round) {
2842         if (res == INT64_MAX) {
2843             env->vxsat = 1;
2844         } else {
2845             res += 1;
2846         }
2847     }
2848     return res;
2849 }
2850 
2851 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2852 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2853 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2854 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2855 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2856 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2857 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2858 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2859 
2860 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2861 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2862 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2863 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2864 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2865 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2866 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2867 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2868 
2869 /* Vector Single-Width Scaling Shift Instructions */
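/*
 * Scaling shifts: the shift amount is taken from the low log2(SEW) bits
 * of the second operand, and the bits shifted out are folded back in as
 * a rounding increment chosen by get_round() according to vxrm.
 */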
2870 static inline uint8_t
2871 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2872 {
2873     uint8_t round, shift = b & 0x7;
2874     uint8_t res;
2875 
2876     round = get_round(vxrm, a, shift);
2877     res = (a >> shift) + round;
2878     return res;
2879 }
2880 static inline uint16_t
2881 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2882 {
2883     uint8_t round, shift = b & 0xf;
2884 
2885     round = get_round(vxrm, a, shift);
2886     return (a >> shift) + round;
2887 }
2888 static inline uint32_t
2889 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2890 {
2891     uint8_t round, shift = b & 0x1f;
2892 
2893     round = get_round(vxrm, a, shift);
2894     return (a >> shift) + round;
2895 }
2896 static inline uint64_t
2897 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2898 {
2899     uint8_t round, shift = b & 0x3f;
2900 
2901     round = get_round(vxrm, a, shift);
2902     return (a >> shift) + round;
2903 }
2904 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2905 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2906 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2907 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2908 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2909 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2910 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2911 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2912 
2913 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2914 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2915 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2916 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2917 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2918 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2919 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2920 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2921 
2922 static inline int8_t
2923 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2924 {
2925     uint8_t round, shift = b & 0x7;
2926 
2927     round = get_round(vxrm, a, shift);
2928     return (a >> shift) + round;
2929 }
2930 static inline int16_t
2931 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2932 {
2933     uint8_t round, shift = b & 0xf;
2934 
2935     round = get_round(vxrm, a, shift);
2936     return (a >> shift) + round;
2937 }
2938 static inline int32_t
2939 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2940 {
2941     uint8_t round, shift = b & 0x1f;
2942 
2943     round = get_round(vxrm, a, shift);
2944     return (a >> shift) + round;
2945 }
2946 static inline int64_t
2947 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2948 {
2949     uint8_t round, shift = b & 0x3f;
2950 
2951     round = get_round(vxrm, a, shift);
2952     return (a >> shift) + round;
2953 }
2954 
2955 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2956 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2957 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2958 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2959 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2960 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2961 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2962 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2963 
2964 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2965 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2966 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2967 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2968 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2969 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2970 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2971 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2972 
2973 /* Vector Narrowing Fixed-Point Clip Instructions */
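/*
 * Narrowing clips: a 2*SEW source element is shifted right by the low
 * log2(2*SEW) bits of the second operand, rounded per vxrm, and then
 * saturated to the destination's SEW range, setting vxsat when the
 * value had to be clipped.
 */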
2974 static inline int8_t
2975 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2976 {
2977     uint8_t round, shift = b & 0xf;
2978     int16_t res;
2979 
2980     round = get_round(vxrm, a, shift);
2981     res = (a >> shift) + round;
2982     if (res > INT8_MAX) {
2983         env->vxsat = 0x1;
2984         return INT8_MAX;
2985     } else if (res < INT8_MIN) {
2986         env->vxsat = 0x1;
2987         return INT8_MIN;
2988     } else {
2989         return res;
2990     }
2991 }
2992 
2993 static inline int16_t
2994 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2995 {
2996     uint8_t round, shift = b & 0x1f;
2997     int32_t res;
2998 
2999     round = get_round(vxrm, a, shift);
3000     res = (a >> shift) + round;
3001     if (res > INT16_MAX) {
3002         env->vxsat = 0x1;
3003         return INT16_MAX;
3004     } else if (res < INT16_MIN) {
3005         env->vxsat = 0x1;
3006         return INT16_MIN;
3007     } else {
3008         return res;
3009     }
3010 }
3011 
3012 static inline int32_t
3013 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
3014 {
3015     uint8_t round, shift = b & 0x3f;
3016     int64_t res;
3017 
3018     round = get_round(vxrm, a, shift);
3019     res = (a >> shift) + round;
3020     if (res > INT32_MAX) {
3021         env->vxsat = 0x1;
3022         return INT32_MAX;
3023     } else if (res < INT32_MIN) {
3024         env->vxsat = 0x1;
3025         return INT32_MIN;
3026     } else {
3027         return res;
3028     }
3029 }
3030 
3031 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
3032 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
3033 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
3034 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
3035 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
3036 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
3037 
3038 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
3039 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
3040 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
3041 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
3042 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
3043 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
3044 
3045 static inline uint8_t
3046 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
3047 {
3048     uint8_t round, shift = b & 0xf;
3049     uint16_t res;
3050 
3051     round = get_round(vxrm, a, shift);
3052     res = (a >> shift) + round;
3053     if (res > UINT8_MAX) {
3054         env->vxsat = 0x1;
3055         return UINT8_MAX;
3056     } else {
3057         return res;
3058     }
3059 }
3060 
3061 static inline uint16_t
3062 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3063 {
3064     uint8_t round, shift = b & 0x1f;
3065     uint32_t res;
3066 
3067     round = get_round(vxrm, a, shift);
3068     res = (a >> shift) + round;
3069     if (res > UINT16_MAX) {
3070         env->vxsat = 0x1;
3071         return UINT16_MAX;
3072     } else {
3073         return res;
3074     }
3075 }
3076 
3077 static inline uint32_t
3078 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3079 {
3080     uint8_t round, shift = b & 0x3f;
3081     uint64_t res;
3082 
3083     round = get_round(vxrm, a, shift);
3084     res = (a >> shift) + round;
3085     if (res > UINT32_MAX) {
3086         env->vxsat = 0x1;
3087         return UINT32_MAX;
3088     } else {
3089         return res;
3090     }
3091 }
3092 
3093 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3094 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3095 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3096 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3097 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3098 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3099 
3100 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3101 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3102 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3103 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3104 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3105 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3106 
3107 /*
3108  * Vector Floating-Point Arithmetic Instructions
3109  */
3110 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3111 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3112 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3113                       CPURISCVState *env)                      \
3114 {                                                              \
3115     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3116     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3117     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3118 }
3119 
3120 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3121 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3122                   void *vs2, CPURISCVState *env,          \
3123                   uint32_t desc)                          \
3124 {                                                         \
3125     uint32_t vm = vext_vm(desc);                          \
3126     uint32_t vl = env->vl;                                \
3127     uint32_t total_elems =                                \
3128         vext_get_total_elems(env, desc, ESZ);             \
3129     uint32_t vta = vext_vta(desc);                        \
3130     uint32_t vma = vext_vma(desc);                        \
3131     uint32_t i;                                           \
3132                                                           \
3133     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3134                                                           \
3135     for (i = env->vstart; i < vl; i++) {                  \
3136         if (!vm && !vext_elem_mask(v0, i)) {              \
3137             /* set masked-off elements to 1s */           \
3138             vext_set_elems_1s(vd, vma, i * ESZ,           \
3139                               (i + 1) * ESZ);             \
3140             continue;                                     \
3141         }                                                 \
3142         do_##NAME(vd, vs1, vs2, i, env);                  \
3143     }                                                     \
3144     env->vstart = 0;                                      \
3145     /* set tail elements to 1s */                         \
3146     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3147                       total_elems * ESZ);                 \
3148 }
3149 
3150 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3151 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3152 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3153 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3154 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3155 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3156 
3157 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3158 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3159                       CPURISCVState *env)                      \
3160 {                                                              \
3161     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3162     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3163 }
3164 
3165 #define GEN_VEXT_VF(NAME, ESZ)                            \
3166 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3167                   void *vs2, CPURISCVState *env,          \
3168                   uint32_t desc)                          \
3169 {                                                         \
3170     uint32_t vm = vext_vm(desc);                          \
3171     uint32_t vl = env->vl;                                \
3172     uint32_t total_elems =                                \
3173         vext_get_total_elems(env, desc, ESZ);             \
3174     uint32_t vta = vext_vta(desc);                        \
3175     uint32_t vma = vext_vma(desc);                        \
3176     uint32_t i;                                           \
3177                                                           \
3178     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
3179                                                           \
3180     for (i = env->vstart; i < vl; i++) {                  \
3181         if (!vm && !vext_elem_mask(v0, i)) {              \
3182             /* set masked-off elements to 1s */           \
3183             vext_set_elems_1s(vd, vma, i * ESZ,           \
3184                               (i + 1) * ESZ);             \
3185             continue;                                     \
3186         }                                                 \
3187         do_##NAME(vd, s1, vs2, i, env);                   \
3188     }                                                     \
3189     env->vstart = 0;                                      \
3190     /* set tail elements to 1s */                         \
3191     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3192                       total_elems * ESZ);                 \
3193 }
3194 
3195 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3196 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3197 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3198 GEN_VEXT_VF(vfadd_vf_h, 2)
3199 GEN_VEXT_VF(vfadd_vf_w, 4)
3200 GEN_VEXT_VF(vfadd_vf_d, 8)
3201 
3202 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3203 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3204 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3205 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3206 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3207 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3208 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3209 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3210 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3211 GEN_VEXT_VF(vfsub_vf_h, 2)
3212 GEN_VEXT_VF(vfsub_vf_w, 4)
3213 GEN_VEXT_VF(vfsub_vf_d, 8)
3214 
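/*
 * Reverse-subtract helpers for vfrsub.vf: the operands are swapped so
 * that the scalar is the minuend, i.e. vd[i] = f[rs1] - vs2[i].
 */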
3215 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3216 {
3217     return float16_sub(b, a, s);
3218 }
3219 
3220 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3221 {
3222     return float32_sub(b, a, s);
3223 }
3224 
3225 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3226 {
3227     return float64_sub(b, a, s);
3228 }
3229 
3230 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3231 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3232 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3233 GEN_VEXT_VF(vfrsub_vf_h, 2)
3234 GEN_VEXT_VF(vfrsub_vf_w, 4)
3235 GEN_VEXT_VF(vfrsub_vf_d, 8)
3236 
3237 /* Vector Widening Floating-Point Add/Subtract Instructions */
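/*
 * Widening add/sub helpers: both SEW operands are first converted to the
 * 2*SEW float format, so the operation and its rounding are performed at
 * the wider precision.
 */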
3238 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3239 {
3240     return float32_add(float16_to_float32(a, true, s),
3241                        float16_to_float32(b, true, s), s);
3242 }
3243 
3244 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3245 {
3246     return float64_add(float32_to_float64(a, s),
3247                        float32_to_float64(b, s), s);
3248 
3249 }
3250 
3251 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3252 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3253 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3254 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3255 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3256 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3257 GEN_VEXT_VF(vfwadd_vf_h, 4)
3258 GEN_VEXT_VF(vfwadd_vf_w, 8)
3259 
3260 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3261 {
3262     return float32_sub(float16_to_float32(a, true, s),
3263                        float16_to_float32(b, true, s), s);
3264 }
3265 
3266 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3267 {
3268     return float64_sub(float32_to_float64(a, s),
3269                        float32_to_float64(b, s), s);
3270 
3271 }
3272 
3273 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3274 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3275 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3276 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3277 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3278 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3279 GEN_VEXT_VF(vfwsub_vf_h, 4)
3280 GEN_VEXT_VF(vfwsub_vf_w, 8)
3281 
3282 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3283 {
3284     return float32_add(a, float16_to_float32(b, true, s), s);
3285 }
3286 
3287 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3288 {
3289     return float64_add(a, float32_to_float64(b, s), s);
3290 }
3291 
3292 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3293 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3294 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3295 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3296 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3297 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3298 GEN_VEXT_VF(vfwadd_wf_h, 4)
3299 GEN_VEXT_VF(vfwadd_wf_w, 8)
3300 
3301 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3302 {
3303     return float32_sub(a, float16_to_float32(b, true, s), s);
3304 }
3305 
3306 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3307 {
3308     return float64_sub(a, float32_to_float64(b, s), s);
3309 }
3310 
3311 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3312 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3313 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3314 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3315 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3316 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3317 GEN_VEXT_VF(vfwsub_wf_h, 4)
3318 GEN_VEXT_VF(vfwsub_wf_w, 8)
3319 
3320 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3321 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3322 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3323 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3324 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3325 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3326 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3327 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3328 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3329 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3330 GEN_VEXT_VF(vfmul_vf_h, 2)
3331 GEN_VEXT_VF(vfmul_vf_w, 4)
3332 GEN_VEXT_VF(vfmul_vf_d, 8)
3333 
3334 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3335 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3336 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3337 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3338 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3339 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3340 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3341 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3342 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3343 GEN_VEXT_VF(vfdiv_vf_h, 2)
3344 GEN_VEXT_VF(vfdiv_vf_w, 4)
3345 GEN_VEXT_VF(vfdiv_vf_d, 8)
3346 
3347 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3348 {
3349     return float16_div(b, a, s);
3350 }
3351 
3352 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3353 {
3354     return float32_div(b, a, s);
3355 }
3356 
3357 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3358 {
3359     return float64_div(b, a, s);
3360 }
3361 
3362 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3363 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3364 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3365 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3366 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3367 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3368 
3369 /* Vector Widening Floating-Point Multiply */
3370 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3371 {
3372     return float32_mul(float16_to_float32(a, true, s),
3373                        float16_to_float32(b, true, s), s);
3374 }
3375 
3376 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3377 {
3378     return float64_mul(float32_to_float64(a, s),
3379                        float32_to_float64(b, s), s);
3380 
3381 }
3382 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3383 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3384 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3385 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3386 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3387 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3388 GEN_VEXT_VF(vfwmul_vf_h, 4)
3389 GEN_VEXT_VF(vfwmul_vf_w, 8)
3390 
3391 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
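/*
 * Fused multiply-add helpers.  do_##NAME passes (vs2, vs1, vd) as
 * (a, b, d), so e.g. fmacc computes vs1 * vs2 + vd while fmadd computes
 * vs1 * vd + vs2; the negate_product / negate_c muladd flags provide
 * the nmacc/msac/nmsac (and madd) variants.
 */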
3392 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3393 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3394                       CPURISCVState *env)                          \
3395 {                                                                  \
3396     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3397     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3398     TD d = *((TD *)vd + HD(i));                                    \
3399     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3400 }
3401 
3402 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3403 {
3404     return float16_muladd(a, b, d, 0, s);
3405 }
3406 
3407 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3408 {
3409     return float32_muladd(a, b, d, 0, s);
3410 }
3411 
3412 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3413 {
3414     return float64_muladd(a, b, d, 0, s);
3415 }
3416 
3417 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3418 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3419 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3420 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3421 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3422 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3423 
3424 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3425 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3426                       CPURISCVState *env)                         \
3427 {                                                                 \
3428     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3429     TD d = *((TD *)vd + HD(i));                                   \
3430     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3431 }
3432 
3433 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3434 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3435 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3436 GEN_VEXT_VF(vfmacc_vf_h, 2)
3437 GEN_VEXT_VF(vfmacc_vf_w, 4)
3438 GEN_VEXT_VF(vfmacc_vf_d, 8)
3439 
3440 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3441 {
3442     return float16_muladd(a, b, d, float_muladd_negate_c |
3443                                    float_muladd_negate_product, s);
3444 }
3445 
3446 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3447 {
3448     return float32_muladd(a, b, d, float_muladd_negate_c |
3449                                    float_muladd_negate_product, s);
3450 }
3451 
3452 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3453 {
3454     return float64_muladd(a, b, d, float_muladd_negate_c |
3455                                    float_muladd_negate_product, s);
3456 }
3457 
3458 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3459 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3460 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3461 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3462 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3463 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3464 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3465 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3466 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3467 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3468 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3469 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3470 
3471 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3472 {
3473     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3474 }
3475 
3476 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3477 {
3478     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3479 }
3480 
3481 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3482 {
3483     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3484 }
3485 
3486 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3487 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3488 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3489 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3490 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3491 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3492 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3493 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3494 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3495 GEN_VEXT_VF(vfmsac_vf_h, 2)
3496 GEN_VEXT_VF(vfmsac_vf_w, 4)
3497 GEN_VEXT_VF(vfmsac_vf_d, 8)
3498 
3499 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3500 {
3501     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3502 }
3503 
3504 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3505 {
3506     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3507 }
3508 
3509 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3510 {
3511     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3512 }
3513 
3514 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3515 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3516 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3517 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3518 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3519 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3520 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3521 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3522 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3523 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3524 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3525 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3526 
3527 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3528 {
3529     return float16_muladd(d, b, a, 0, s);
3530 }
3531 
3532 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3533 {
3534     return float32_muladd(d, b, a, 0, s);
3535 }
3536 
3537 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3538 {
3539     return float64_muladd(d, b, a, 0, s);
3540 }
3541 
3542 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3543 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3544 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3545 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3546 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3547 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3548 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3549 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3550 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3551 GEN_VEXT_VF(vfmadd_vf_h, 2)
3552 GEN_VEXT_VF(vfmadd_vf_w, 4)
3553 GEN_VEXT_VF(vfmadd_vf_d, 8)
3554 
3555 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3556 {
3557     return float16_muladd(d, b, a, float_muladd_negate_c |
3558                                    float_muladd_negate_product, s);
3559 }
3560 
3561 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3562 {
3563     return float32_muladd(d, b, a, float_muladd_negate_c |
3564                                    float_muladd_negate_product, s);
3565 }
3566 
3567 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3568 {
3569     return float64_muladd(d, b, a, float_muladd_negate_c |
3570                                    float_muladd_negate_product, s);
3571 }
3572 
3573 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3574 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3575 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3576 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3577 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3578 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3579 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3580 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3581 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3582 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3583 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3584 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3585 
3586 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3587 {
3588     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3589 }
3590 
3591 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3592 {
3593     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3594 }
3595 
3596 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3597 {
3598     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3599 }
3600 
3601 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3602 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3603 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3604 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3605 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3606 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3607 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3608 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3609 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3610 GEN_VEXT_VF(vfmsub_vf_h, 2)
3611 GEN_VEXT_VF(vfmsub_vf_w, 4)
3612 GEN_VEXT_VF(vfmsub_vf_d, 8)
3613 
3614 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3615 {
3616     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3617 }
3618 
3619 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3620 {
3621     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3622 }
3623 
3624 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3625 {
3626     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3627 }
3628 
3629 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3630 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3631 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3632 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3633 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3634 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3635 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3636 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3637 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3638 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3639 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3640 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3641 
3642 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3643 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3644 {
3645     return float32_muladd(float16_to_float32(a, true, s),
3646                           float16_to_float32(b, true, s), d, 0, s);
3647 }
3648 
3649 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3650 {
3651     return float64_muladd(float32_to_float64(a, s),
3652                           float32_to_float64(b, s), d, 0, s);
3653 }
3654 
3655 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3656 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3657 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3658 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3659 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3660 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3661 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3662 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3663 
3664 static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3665 {
3666     return float32_muladd(bfloat16_to_float32(a, s),
3667                           bfloat16_to_float32(b, s), d, 0, s);
3668 }
3669 
3670 RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3671 GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3672 RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3673 GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3674 
3675 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3676 {
3677     return float32_muladd(float16_to_float32(a, true, s),
3678                           float16_to_float32(b, true, s), d,
3679                           float_muladd_negate_c | float_muladd_negate_product,
3680                           s);
3681 }
3682 
3683 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3684 {
3685     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3686                           d, float_muladd_negate_c |
3687                              float_muladd_negate_product, s);
3688 }
3689 
3690 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3691 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3692 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3693 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3694 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3695 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3696 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3697 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3698 
3699 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3700 {
3701     return float32_muladd(float16_to_float32(a, true, s),
3702                           float16_to_float32(b, true, s), d,
3703                           float_muladd_negate_c, s);
3704 }
3705 
3706 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3707 {
3708     return float64_muladd(float32_to_float64(a, s),
3709                           float32_to_float64(b, s), d,
3710                           float_muladd_negate_c, s);
3711 }
3712 
3713 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3714 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3715 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3716 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3717 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3718 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3719 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3720 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3721 
3722 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3723 {
3724     return float32_muladd(float16_to_float32(a, true, s),
3725                           float16_to_float32(b, true, s), d,
3726                           float_muladd_negate_product, s);
3727 }
3728 
3729 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3730 {
3731     return float64_muladd(float32_to_float64(a, s),
3732                           float32_to_float64(b, s), d,
3733                           float_muladd_negate_product, s);
3734 }
3735 
3736 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3737 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3738 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3739 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3740 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3741 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3742 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3743 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3744 
3745 /* Vector Floating-Point Square-Root Instruction */
3746 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3747 static void do_##NAME(void *vd, void *vs2, int i,      \
3748                       CPURISCVState *env)              \
3749 {                                                      \
3750     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3751     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3752 }
3753 
3754 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3755 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3756                   CPURISCVState *env, uint32_t desc)   \
3757 {                                                      \
3758     uint32_t vm = vext_vm(desc);                       \
3759     uint32_t vl = env->vl;                             \
3760     uint32_t total_elems =                             \
3761         vext_get_total_elems(env, desc, ESZ);          \
3762     uint32_t vta = vext_vta(desc);                     \
3763     uint32_t vma = vext_vma(desc);                     \
3764     uint32_t i;                                        \
3765                                                        \
3766     VSTART_CHECK_EARLY_EXIT(env, vl);                  \
3767                                                        \
3768     if (vl == 0) {                                     \
3769         return;                                        \
3770     }                                                  \
3771     for (i = env->vstart; i < vl; i++) {               \
3772         if (!vm && !vext_elem_mask(v0, i)) {           \
3773             /* set masked-off elements to 1s */        \
3774             vext_set_elems_1s(vd, vma, i * ESZ,        \
3775                               (i + 1) * ESZ);          \
3776             continue;                                  \
3777         }                                              \
3778         do_##NAME(vd, vs2, i, env);                    \
3779     }                                                  \
3780     env->vstart = 0;                                   \
3781     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3782                       total_elems * ESZ);              \
3783 }
3784 
3785 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3786 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3787 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3788 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3789 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3790 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3791 
3792 /*
3793  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3794  *
3795  * Adapted from riscv-v-spec recip.c:
3796  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3797  */
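/*
 * How the estimate works (derived from the code below): the 128-entry table
 * is indexed by the low bit of the (normalized) exponent and the top 6
 * fraction bits, and the result exponent is roughly (3 * bias - 1 - exp) / 2
 * (the unsigned wrap-around of ~exp stands in for -exp - 1).
 * E.g. for float32 1.0 (exp = 127, frac = 0): idx = 64, lookup_table[64] = 127,
 * giving 1.9921875 * 2^-1 = 0.99609375 as the 7-bit estimate of 1/sqrt(1.0).
 */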
3798 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3799 {
3800     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3801     uint64_t exp = extract64(f, frac_size, exp_size);
3802     uint64_t frac = extract64(f, 0, frac_size);
3803 
3804     const uint8_t lookup_table[] = {
3805         52, 51, 50, 48, 47, 46, 44, 43,
3806         42, 41, 40, 39, 38, 36, 35, 34,
3807         33, 32, 31, 30, 30, 29, 28, 27,
3808         26, 25, 24, 23, 23, 22, 21, 20,
3809         19, 19, 18, 17, 16, 16, 15, 14,
3810         14, 13, 12, 12, 11, 10, 10, 9,
3811         9, 8, 7, 7, 6, 6, 5, 4,
3812         4, 3, 3, 2, 2, 1, 1, 0,
3813         127, 125, 123, 121, 119, 118, 116, 114,
3814         113, 111, 109, 108, 106, 105, 103, 102,
3815         100, 99, 97, 96, 95, 93, 92, 91,
3816         90, 88, 87, 86, 85, 84, 83, 82,
3817         80, 79, 78, 77, 76, 75, 74, 73,
3818         72, 71, 70, 70, 69, 68, 67, 66,
3819         65, 64, 63, 63, 62, 61, 60, 59,
3820         59, 58, 57, 56, 56, 55, 54, 53
3821     };
3822     const int precision = 7;
3823 
3824     if (exp == 0 && frac != 0) { /* subnormal */
3825         /* Normalize the subnormal. */
3826         while (extract64(frac, frac_size - 1, 1) == 0) {
3827             exp--;
3828             frac <<= 1;
3829         }
3830 
3831         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3832     }
3833 
3834     int idx = ((exp & 1) << (precision - 1)) |
3835               (frac >> (frac_size - precision + 1));
3836     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3837                         (frac_size - precision);
3838     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3839 
3840     uint64_t val = 0;
3841     val = deposit64(val, 0, frac_size, out_frac);
3842     val = deposit64(val, frac_size, exp_size, out_exp);
3843     val = deposit64(val, frac_size + exp_size, 1, sign);
3844     return val;
3845 }
3846 
3847 static float16 frsqrt7_h(float16 f, float_status *s)
3848 {
3849     int exp_size = 5, frac_size = 10;
3850     bool sign = float16_is_neg(f);
3851 
3852     /*
3853      * frsqrt7(sNaN) = canonical NaN
3854      * frsqrt7(-inf) = canonical NaN
3855      * frsqrt7(-normal) = canonical NaN
3856      * frsqrt7(-subnormal) = canonical NaN
3857      */
3858     if (float16_is_signaling_nan(f, s) ||
3859         (float16_is_infinity(f) && sign) ||
3860         (float16_is_normal(f) && sign) ||
3861         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3862         s->float_exception_flags |= float_flag_invalid;
3863         return float16_default_nan(s);
3864     }
3865 
3866     /* frsqrt7(qNaN) = canonical NaN */
3867     if (float16_is_quiet_nan(f, s)) {
3868         return float16_default_nan(s);
3869     }
3870 
3871     /* frsqrt7(+-0) = +-inf */
3872     if (float16_is_zero(f)) {
3873         s->float_exception_flags |= float_flag_divbyzero;
3874         return float16_set_sign(float16_infinity, sign);
3875     }
3876 
3877     /* frsqrt7(+inf) = +0 */
3878     if (float16_is_infinity(f) && !sign) {
3879         return float16_set_sign(float16_zero, sign);
3880     }
3881 
3882     /* +normal, +subnormal */
3883     uint64_t val = frsqrt7(f, exp_size, frac_size);
3884     return make_float16(val);
3885 }
3886 
3887 static float32 frsqrt7_s(float32 f, float_status *s)
3888 {
3889     int exp_size = 8, frac_size = 23;
3890     bool sign = float32_is_neg(f);
3891 
3892     /*
3893      * frsqrt7(sNaN) = canonical NaN
3894      * frsqrt7(-inf) = canonical NaN
3895      * frsqrt7(-normal) = canonical NaN
3896      * frsqrt7(-subnormal) = canonical NaN
3897      */
3898     if (float32_is_signaling_nan(f, s) ||
3899         (float32_is_infinity(f) && sign) ||
3900         (float32_is_normal(f) && sign) ||
3901         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3902         s->float_exception_flags |= float_flag_invalid;
3903         return float32_default_nan(s);
3904     }
3905 
3906     /* frsqrt7(qNaN) = canonical NaN */
3907     if (float32_is_quiet_nan(f, s)) {
3908         return float32_default_nan(s);
3909     }
3910 
3911     /* frsqrt7(+-0) = +-inf */
3912     if (float32_is_zero(f)) {
3913         s->float_exception_flags |= float_flag_divbyzero;
3914         return float32_set_sign(float32_infinity, sign);
3915     }
3916 
3917     /* frsqrt7(+inf) = +0 */
3918     if (float32_is_infinity(f) && !sign) {
3919         return float32_set_sign(float32_zero, sign);
3920     }
3921 
3922     /* +normal, +subnormal */
3923     uint64_t val = frsqrt7(f, exp_size, frac_size);
3924     return make_float32(val);
3925 }
3926 
3927 static float64 frsqrt7_d(float64 f, float_status *s)
3928 {
3929     int exp_size = 11, frac_size = 52;
3930     bool sign = float64_is_neg(f);
3931 
3932     /*
3933      * frsqrt7(sNaN) = canonical NaN
3934      * frsqrt7(-inf) = canonical NaN
3935      * frsqrt7(-normal) = canonical NaN
3936      * frsqrt7(-subnormal) = canonical NaN
3937      */
3938     if (float64_is_signaling_nan(f, s) ||
3939         (float64_is_infinity(f) && sign) ||
3940         (float64_is_normal(f) && sign) ||
3941         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3942         s->float_exception_flags |= float_flag_invalid;
3943         return float64_default_nan(s);
3944     }
3945 
3946     /* frsqrt7(qNaN) = canonical NaN */
3947     if (float64_is_quiet_nan(f, s)) {
3948         return float64_default_nan(s);
3949     }
3950 
3951     /* frsqrt7(+-0) = +-inf */
3952     if (float64_is_zero(f)) {
3953         s->float_exception_flags |= float_flag_divbyzero;
3954         return float64_set_sign(float64_infinity, sign);
3955     }
3956 
3957     /* frsqrt7(+inf) = +0 */
3958     if (float64_is_infinity(f) && !sign) {
3959         return float64_set_sign(float64_zero, sign);
3960     }
3961 
3962     /* +normal, +subnormal */
3963     uint64_t val = frsqrt7(f, exp_size, frac_size);
3964     return make_float64(val);
3965 }
3966 
3967 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3968 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3969 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3970 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3971 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3972 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3973 
3974 /*
3975  * Vector Floating-Point Reciprocal Estimate Instruction
3976  *
3977  * Adapted from riscv-v-spec recip.c:
3978  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3979  */
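/*
 * How the estimate works (derived from the code below): the 128-entry table
 * is indexed by the top 7 fraction bits, and the result exponent is roughly
 * 2 * bias - 1 - exp (again via the unsigned wrap-around of ~exp).  Inputs
 * that are subnormal by more than one bit overflow to +-inf or the largest
 * finite value depending on the rounding mode, and result exponents of 0 or
 * -1 are folded back into a subnormal encoding near the end.
 */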
3980 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3981                       float_status *s)
3982 {
3983     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3984     uint64_t exp = extract64(f, frac_size, exp_size);
3985     uint64_t frac = extract64(f, 0, frac_size);
3986 
3987     const uint8_t lookup_table[] = {
3988         127, 125, 123, 121, 119, 117, 116, 114,
3989         112, 110, 109, 107, 105, 104, 102, 100,
3990         99, 97, 96, 94, 93, 91, 90, 88,
3991         87, 85, 84, 83, 81, 80, 79, 77,
3992         76, 75, 74, 72, 71, 70, 69, 68,
3993         66, 65, 64, 63, 62, 61, 60, 59,
3994         58, 57, 56, 55, 54, 53, 52, 51,
3995         50, 49, 48, 47, 46, 45, 44, 43,
3996         42, 41, 40, 40, 39, 38, 37, 36,
3997         35, 35, 34, 33, 32, 31, 31, 30,
3998         29, 28, 28, 27, 26, 25, 25, 24,
3999         23, 23, 22, 21, 21, 20, 19, 19,
4000         18, 17, 17, 16, 15, 15, 14, 14,
4001         13, 12, 12, 11, 11, 10, 9, 9,
4002         8, 8, 7, 7, 6, 5, 5, 4,
4003         4, 3, 3, 2, 2, 1, 1, 0
4004     };
4005     const int precision = 7;
4006 
4007     if (exp == 0 && frac != 0) { /* subnormal */
4008         /* Normalize the subnormal. */
4009         while (extract64(frac, frac_size - 1, 1) == 0) {
4010             exp--;
4011             frac <<= 1;
4012         }
4013 
4014         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
4015 
4016         if (exp != 0 && exp != UINT64_MAX) {
4017             /*
4018              * Overflow to inf or max value of same sign,
4019              * depending on sign and rounding mode.
4020              */
4021             s->float_exception_flags |= (float_flag_inexact |
4022                                          float_flag_overflow);
4023 
4024             if ((s->float_rounding_mode == float_round_to_zero) ||
4025                 ((s->float_rounding_mode == float_round_down) && !sign) ||
4026                 ((s->float_rounding_mode == float_round_up) && sign)) {
4027                 /* Return the largest finite value of the same sign. */
4028                 return (sign << (exp_size + frac_size)) |
4029                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
4030             } else {
4031                 /* Return +-inf. */
4032                 return (sign << (exp_size + frac_size)) |
4033                        MAKE_64BIT_MASK(frac_size, exp_size);
4034             }
4035         }
4036     }
4037 
4038     int idx = frac >> (frac_size - precision);
4039     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
4040                         (frac_size - precision);
4041     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
4042 
4043     if (out_exp == 0 || out_exp == UINT64_MAX) {
4044         /*
4045          * The result is subnormal, but don't raise the underflow exception,
4046          * because there's no additional loss of precision.
4047          */
4048         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
4049         if (out_exp == UINT64_MAX) {
4050             out_frac >>= 1;
4051             out_exp = 0;
4052         }
4053     }
4054 
4055     uint64_t val = 0;
4056     val = deposit64(val, 0, frac_size, out_frac);
4057     val = deposit64(val, frac_size, exp_size, out_exp);
4058     val = deposit64(val, frac_size + exp_size, 1, sign);
4059     return val;
4060 }
4061 
4062 static float16 frec7_h(float16 f, float_status *s)
4063 {
4064     int exp_size = 5, frac_size = 10;
4065     bool sign = float16_is_neg(f);
4066 
4067     /* frec7(+-inf) = +-0 */
4068     if (float16_is_infinity(f)) {
4069         return float16_set_sign(float16_zero, sign);
4070     }
4071 
4072     /* frec7(+-0) = +-inf */
4073     if (float16_is_zero(f)) {
4074         s->float_exception_flags |= float_flag_divbyzero;
4075         return float16_set_sign(float16_infinity, sign);
4076     }
4077 
4078     /* frec7(sNaN) = canonical NaN */
4079     if (float16_is_signaling_nan(f, s)) {
4080         s->float_exception_flags |= float_flag_invalid;
4081         return float16_default_nan(s);
4082     }
4083 
4084     /* frec7(qNaN) = canonical NaN */
4085     if (float16_is_quiet_nan(f, s)) {
4086         return float16_default_nan(s);
4087     }
4088 
4089     /* +-normal, +-subnormal */
4090     uint64_t val = frec7(f, exp_size, frac_size, s);
4091     return make_float16(val);
4092 }
4093 
4094 static float32 frec7_s(float32 f, float_status *s)
4095 {
4096     int exp_size = 8, frac_size = 23;
4097     bool sign = float32_is_neg(f);
4098 
4099     /* frec7(+-inf) = +-0 */
4100     if (float32_is_infinity(f)) {
4101         return float32_set_sign(float32_zero, sign);
4102     }
4103 
4104     /* frec7(+-0) = +-inf */
4105     if (float32_is_zero(f)) {
4106         s->float_exception_flags |= float_flag_divbyzero;
4107         return float32_set_sign(float32_infinity, sign);
4108     }
4109 
4110     /* frec7(sNaN) = canonical NaN */
4111     if (float32_is_signaling_nan(f, s)) {
4112         s->float_exception_flags |= float_flag_invalid;
4113         return float32_default_nan(s);
4114     }
4115 
4116     /* frec7(qNaN) = canonical NaN */
4117     if (float32_is_quiet_nan(f, s)) {
4118         return float32_default_nan(s);
4119     }
4120 
4121     /* +-normal, +-subnormal */
4122     uint64_t val = frec7(f, exp_size, frac_size, s);
4123     return make_float32(val);
4124 }
4125 
4126 static float64 frec7_d(float64 f, float_status *s)
4127 {
4128     int exp_size = 11, frac_size = 52;
4129     bool sign = float64_is_neg(f);
4130 
4131     /* frec7(+-inf) = +-0 */
4132     if (float64_is_infinity(f)) {
4133         return float64_set_sign(float64_zero, sign);
4134     }
4135 
4136     /* frec7(+-0) = +-inf */
4137     if (float64_is_zero(f)) {
4138         s->float_exception_flags |= float_flag_divbyzero;
4139         return float64_set_sign(float64_infinity, sign);
4140     }
4141 
4142     /* frec7(sNaN) = canonical NaN */
4143     if (float64_is_signaling_nan(f, s)) {
4144         s->float_exception_flags |= float_flag_invalid;
4145         return float64_default_nan(s);
4146     }
4147 
4148     /* frec7(qNaN) = canonical NaN */
4149     if (float64_is_quiet_nan(f, s)) {
4150         return float64_default_nan(s);
4151     }
4152 
4153     /* +-normal, +-subnormal */
4154     uint64_t val = frec7(f, exp_size, frac_size, s);
4155     return make_float64(val);
4156 }
4157 
4158 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4159 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4160 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4161 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4162 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4163 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4164 
4165 /* Vector Floating-Point MIN/MAX Instructions */
4166 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4167 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4168 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4169 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4170 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4171 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4172 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4173 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4174 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4175 GEN_VEXT_VF(vfmin_vf_h, 2)
4176 GEN_VEXT_VF(vfmin_vf_w, 4)
4177 GEN_VEXT_VF(vfmin_vf_d, 8)
4178 
4179 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4180 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4181 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4182 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4183 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4184 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4185 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4186 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4187 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4188 GEN_VEXT_VF(vfmax_vf_h, 2)
4189 GEN_VEXT_VF(vfmax_vf_w, 4)
4190 GEN_VEXT_VF(vfmax_vf_d, 8)
4191 
4192 /* Vector Floating-Point Sign-Injection Instructions */
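/*
 * These helpers operate on raw bit patterns and never touch float_status:
 * the result takes its magnitude bits from a and its sign bit from b
 * (fsgnj), from the complement of b (fsgnjn), or from sign(a) XOR sign(b)
 * (fsgnjx).
 */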
4193 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4194 {
4195     return deposit64(b, 0, 15, a);
4196 }
4197 
4198 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4199 {
4200     return deposit64(b, 0, 31, a);
4201 }
4202 
4203 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4204 {
4205     return deposit64(b, 0, 63, a);
4206 }
4207 
4208 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4209 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4210 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4211 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4212 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4213 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4214 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4215 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4216 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4217 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4218 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4219 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4220 
4221 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4222 {
4223     return deposit64(~b, 0, 15, a);
4224 }
4225 
4226 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4227 {
4228     return deposit64(~b, 0, 31, a);
4229 }
4230 
4231 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4232 {
4233     return deposit64(~b, 0, 63, a);
4234 }
4235 
4236 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4237 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4238 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4239 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4240 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4241 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4242 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4243 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4244 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4245 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4246 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4247 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4248 
4249 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4250 {
4251     return deposit64(b ^ a, 0, 15, a);
4252 }
4253 
4254 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4255 {
4256     return deposit64(b ^ a, 0, 31, a);
4257 }
4258 
4259 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4260 {
4261     return deposit64(b ^ a, 0, 63, a);
4262 }
4263 
4264 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4265 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4266 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4267 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4268 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4269 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4270 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4271 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4272 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4273 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4274 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4275 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4276 
4277 /* Vector Floating-Point Compare Instructions */
4278 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4279 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4280                   CPURISCVState *env, uint32_t desc)          \
4281 {                                                             \
4282     uint32_t vm = vext_vm(desc);                              \
4283     uint32_t vl = env->vl;                                    \
4284     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4285     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4286     uint32_t vma = vext_vma(desc);                            \
4287     uint32_t i;                                               \
4288                                                               \
4289     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4290                                                               \
4291     for (i = env->vstart; i < vl; i++) {                      \
4292         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4293         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4294         if (!vm && !vext_elem_mask(v0, i)) {                  \
4295             /* set masked-off elements to 1s */               \
4296             if (vma) {                                        \
4297                 vext_set_elem_mask(vd, i, 1);                 \
4298             }                                                 \
4299             continue;                                         \
4300         }                                                     \
4301         vext_set_elem_mask(vd, i,                             \
4302                            DO_OP(s2, s1, &env->fp_status));   \
4303     }                                                         \
4304     env->vstart = 0;                                          \
4305     /*
4306      * mask destination register is always tail-agnostic
4307      * set tail elements to 1s
4308      */                                                       \
4309     if (vta_all_1s) {                                         \
4310         for (; i < total_elems; i++) {                        \
4311             vext_set_elem_mask(vd, i, 1);                     \
4312         }                                                     \
4313     }                                                         \
4314 }
4315 
4316 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4317 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4318 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4319 
4320 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4321 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4322                   CPURISCVState *env, uint32_t desc)                \
4323 {                                                                   \
4324     uint32_t vm = vext_vm(desc);                                    \
4325     uint32_t vl = env->vl;                                          \
4326     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4327     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4328     uint32_t vma = vext_vma(desc);                                  \
4329     uint32_t i;                                                     \
4330                                                                     \
4331     VSTART_CHECK_EARLY_EXIT(env, vl);                               \
4332                                                                     \
4333     for (i = env->vstart; i < vl; i++) {                            \
4334         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4335         if (!vm && !vext_elem_mask(v0, i)) {                        \
4336             /* set masked-off elements to 1s */                     \
4337             if (vma) {                                              \
4338                 vext_set_elem_mask(vd, i, 1);                       \
4339             }                                                       \
4340             continue;                                               \
4341         }                                                           \
4342         vext_set_elem_mask(vd, i,                                   \
4343                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4344     }                                                               \
4345     env->vstart = 0;                                                \
4346     /*
4347      * mask destination register is always tail-agnostic
4348      * set tail elements to 1s
4349      */                                                             \
4350     if (vta_all_1s) {                                               \
4351         for (; i < total_elems; i++) {                              \
4352             vext_set_elem_mask(vd, i, 1);                           \
4353         }                                                           \
4354     }                                                               \
4355 }
4356 
4357 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4358 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4359 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4360 
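/*
 * Like vmfeq above, vmfne uses the quiet compare: NaN operands compare as
 * unordered (so "not equal" is true) and only signaling NaNs raise the
 * invalid flag.
 */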
4361 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4362 {
4363     FloatRelation compare = float16_compare_quiet(a, b, s);
4364     return compare != float_relation_equal;
4365 }
4366 
4367 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4368 {
4369     FloatRelation compare = float32_compare_quiet(a, b, s);
4370     return compare != float_relation_equal;
4371 }
4372 
4373 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4374 {
4375     FloatRelation compare = float64_compare_quiet(a, b, s);
4376     return compare != float_relation_equal;
4377 }
4378 
4379 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4380 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4381 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4382 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4383 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4384 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4385 
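/*
 * The ordered compares (vmflt, vmfle, vmfgt, vmfge) use the signaling
 * compare: any NaN operand raises the invalid flag and the comparison
 * yields 0.
 */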
4386 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4387 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4388 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4389 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4390 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4391 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4392 
4393 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4394 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4395 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4396 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4397 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4398 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4399 
4400 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4401 {
4402     FloatRelation compare = float16_compare(a, b, s);
4403     return compare == float_relation_greater;
4404 }
4405 
4406 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4407 {
4408     FloatRelation compare = float32_compare(a, b, s);
4409     return compare == float_relation_greater;
4410 }
4411 
4412 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4413 {
4414     FloatRelation compare = float64_compare(a, b, s);
4415     return compare == float_relation_greater;
4416 }
4417 
4418 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4419 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4420 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4421 
4422 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4423 {
4424     FloatRelation compare = float16_compare(a, b, s);
4425     return compare == float_relation_greater ||
4426            compare == float_relation_equal;
4427 }
4428 
4429 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4430 {
4431     FloatRelation compare = float32_compare(a, b, s);
4432     return compare == float_relation_greater ||
4433            compare == float_relation_equal;
4434 }
4435 
4436 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4437 {
4438     FloatRelation compare = float64_compare(a, b, s);
4439     return compare == float_relation_greater ||
4440            compare == float_relation_equal;
4441 }
4442 
4443 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4444 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4445 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4446 
4447 /* Vector Floating-Point Classify Instruction */
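/*
 * fclass result bits (one-hot): 0 = -inf, 1 = negative normal,
 * 2 = negative subnormal, 3 = -0, 4 = +0, 5 = positive subnormal,
 * 6 = positive normal, 7 = +inf, 8 = signaling NaN, 9 = quiet NaN.
 */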
4448 target_ulong fclass_h(uint64_t frs1)
4449 {
4450     float16 f = frs1;
4451     bool sign = float16_is_neg(f);
4452 
4453     if (float16_is_infinity(f)) {
4454         return sign ? 1 << 0 : 1 << 7;
4455     } else if (float16_is_zero(f)) {
4456         return sign ? 1 << 3 : 1 << 4;
4457     } else if (float16_is_zero_or_denormal(f)) {
4458         return sign ? 1 << 2 : 1 << 5;
4459     } else if (float16_is_any_nan(f)) {
4460         float_status s = { }; /* for snan_bit_is_one */
4461         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4462     } else {
4463         return sign ? 1 << 1 : 1 << 6;
4464     }
4465 }
4466 
4467 target_ulong fclass_s(uint64_t frs1)
4468 {
4469     float32 f = frs1;
4470     bool sign = float32_is_neg(f);
4471 
4472     if (float32_is_infinity(f)) {
4473         return sign ? 1 << 0 : 1 << 7;
4474     } else if (float32_is_zero(f)) {
4475         return sign ? 1 << 3 : 1 << 4;
4476     } else if (float32_is_zero_or_denormal(f)) {
4477         return sign ? 1 << 2 : 1 << 5;
4478     } else if (float32_is_any_nan(f)) {
4479         float_status s = { }; /* for snan_bit_is_one */
4480         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4481     } else {
4482         return sign ? 1 << 1 : 1 << 6;
4483     }
4484 }
4485 
4486 target_ulong fclass_d(uint64_t frs1)
4487 {
4488     float64 f = frs1;
4489     bool sign = float64_is_neg(f);
4490 
4491     if (float64_is_infinity(f)) {
4492         return sign ? 1 << 0 : 1 << 7;
4493     } else if (float64_is_zero(f)) {
4494         return sign ? 1 << 3 : 1 << 4;
4495     } else if (float64_is_zero_or_denormal(f)) {
4496         return sign ? 1 << 2 : 1 << 5;
4497     } else if (float64_is_any_nan(f)) {
4498         float_status s = { }; /* for snan_bit_is_one */
4499         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4500     } else {
4501         return sign ? 1 << 1 : 1 << 6;
4502     }
4503 }
4504 
4505 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4506 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4507 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4508 GEN_VEXT_V(vfclass_v_h, 2)
4509 GEN_VEXT_V(vfclass_v_w, 4)
4510 GEN_VEXT_V(vfclass_v_d, 8)
4511 
4512 /* Vector Floating-Point Merge Instruction */
4513 
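/*
 * vfmerge.vfm: vd[i] = v0.mask[i] ? f[rs1] : vs2[i].  With vm set the helper
 * degenerates to vfmv.v.f, splatting the scalar into every body element.
 */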
4514 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4515 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4516                   CPURISCVState *env, uint32_t desc)          \
4517 {                                                             \
4518     uint32_t vm = vext_vm(desc);                              \
4519     uint32_t vl = env->vl;                                    \
4520     uint32_t esz = sizeof(ETYPE);                             \
4521     uint32_t total_elems =                                    \
4522         vext_get_total_elems(env, desc, esz);                 \
4523     uint32_t vta = vext_vta(desc);                            \
4524     uint32_t i;                                               \
4525                                                               \
4526     VSTART_CHECK_EARLY_EXIT(env, vl);                         \
4527                                                               \
4528     for (i = env->vstart; i < vl; i++) {                      \
4529         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4530         *((ETYPE *)vd + H(i)) =                               \
4531             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4532     }                                                         \
4533     env->vstart = 0;                                          \
4534     /* set tail elements to 1s */                             \
4535     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4536 }
4537 
4538 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4539 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4540 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4541 
4542 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4543 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4544 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4545 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4546 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4547 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4548 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4549 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4550 
4551 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4552 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4553 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4554 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4555 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4556 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4557 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4558 
4559 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4560 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4561 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4562 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4563 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4564 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4565 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4566 
4567 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4568 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4569 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4570 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4571 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4572 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4573 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4574 
4575 /* Widening Floating-Point/Integer Type-Convert Instructions */
4576 /* (TD, T2, TX2) */
4577 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4578 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4579 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4580 /*
4581  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4582  */
4583 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4584 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4585 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4586 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4587 
4588 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4589 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4590 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4591 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4592 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4593 
4594 /*
4595  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4596  */
4597 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4598 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4599 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4600 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4601 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4602 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4603 
4604 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4605 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4606 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4607 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4608 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4609 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4610 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4611 
4612 /*
4613  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4614  */
4615 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4616 {
4617     return float16_to_float32(a, true, s);
4618 }
4619 
4620 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4621 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4622 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4623 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4624 
4625 RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4626 GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4627 
4628 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4629 /* (TD, T2, TX2) */
4630 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4631 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4632 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4633 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4634 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4635 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4636 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4637 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4638 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4639 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4640 
4641 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4642 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4643 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4644 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4645 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4646 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4647 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4648 
4649 /*
4650  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4651  */
4652 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4653 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4654 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4655 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4656 
4657 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4658 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4659 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4660 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4661 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4662 
4663 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4664 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4665 {
4666     return float32_to_float16(a, true, s);
4667 }
4668 
4669 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4670 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4671 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4672 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4673 
4674 RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4675 GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4676 
4677 /*
4678  * Vector Reduction Operations
4679  */
4680 /* Vector Single-Width Integer Reduction Instructions */
4681 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4682 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4683                   void *vs2, CPURISCVState *env,          \
4684                   uint32_t desc)                          \
4685 {                                                         \
4686     uint32_t vm = vext_vm(desc);                          \
4687     uint32_t vl = env->vl;                                \
4688     uint32_t esz = sizeof(TD);                            \
4689     uint32_t vlenb = simd_maxsz(desc);                    \
4690     uint32_t vta = vext_vta(desc);                        \
4691     uint32_t i;                                           \
4692     TD s1 =  *((TD *)vs1 + HD(0));                        \
4693                                                           \
4694     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4695                                                           \
4696     for (i = env->vstart; i < vl; i++) {                  \
4697         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4698         if (!vm && !vext_elem_mask(v0, i)) {              \
4699             continue;                                     \
4700         }                                                 \
4701         s1 = OP(s1, (TD)s2);                              \
4702     }                                                     \
4703     if (vl > 0) {                                         \
4704         *((TD *)vd + HD(0)) = s1;                         \
4705     }                                                     \
4706     env->vstart = 0;                                      \
4707     /* set tail elements to 1s */                         \
4708     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4709 }
4710 
4711 /* vd[0] = sum(vs1[0], vs2[*]) */
4712 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4713 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4714 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4715 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4716 
4717 /* vd[0] = maxu(vs1[0], vs2[*]) */
4718 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4719 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4720 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4721 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4722 
4723 /* vd[0] = max(vs1[0], vs2[*]) */
4724 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4725 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4726 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4727 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4728 
4729 /* vd[0] = minu(vs1[0], vs2[*]) */
4730 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4731 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4732 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4733 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4734 
4735 /* vd[0] = min(vs1[0], vs2[*]) */
4736 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4737 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4738 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4739 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4740 
4741 /* vd[0] = and(vs1[0], vs2[*]) */
4742 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4743 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4744 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4745 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4746 
4747 /* vd[0] = or(vs1[0], vs2[*]) */
4748 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4749 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4750 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4751 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4752 
4753 /* vd[0] = xor(vs1[0], vs2[*]) */
4754 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4755 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4756 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4757 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4758 
4759 /* Vector Widening Integer Reduction Instructions */
4760 /* signed sum reduction into double-width accumulator */
4761 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4762 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4763 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4764 
4765 /* Unsigned sum reduction into double-width accumulator */
4766 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4767 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4768 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4769 
4770 /* Vector Single-Width Floating-Point Reduction Instructions */
4771 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4772 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4773                   void *vs2, CPURISCVState *env,           \
4774                   uint32_t desc)                           \
4775 {                                                          \
4776     uint32_t vm = vext_vm(desc);                           \
4777     uint32_t vl = env->vl;                                 \
4778     uint32_t esz = sizeof(TD);                             \
4779     uint32_t vlenb = simd_maxsz(desc);                     \
4780     uint32_t vta = vext_vta(desc);                         \
4781     uint32_t i;                                            \
4782     TD s1 =  *((TD *)vs1 + HD(0));                         \
4783                                                            \
4784     VSTART_CHECK_EARLY_EXIT(env, vl);                      \
4785                                                            \
4786     for (i = env->vstart; i < vl; i++) {                   \
4787         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4788         if (!vm && !vext_elem_mask(v0, i)) {               \
4789             continue;                                      \
4790         }                                                  \
4791         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4792     }                                                      \
4793     if (vl > 0) {                                          \
4794         *((TD *)vd + HD(0)) = s1;                          \
4795     }                                                      \
4796     env->vstart = 0;                                       \
4797     /* set tail elements to 1s */                          \
4798     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4799 }
4800 
4801 /* Unordered sum */
4802 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4803 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4804 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4805 
4806 /* Ordered sum */
4807 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4808 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4809 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4810 
4811 /* Maximum value */
4812 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4813               float16_maximum_number)
4814 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4815               float32_maximum_number)
4816 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4817               float64_maximum_number)
4818 
4819 /* Minimum value */
4820 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4821               float16_minimum_number)
4822 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4823               float32_minimum_number)
4824 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4825               float64_minimum_number)
4826 
4827 /* Vector Widening Floating-Point Add Instructions */
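/*
 * fwadd16/fwadd32 promote the SEW-wide operand b to 2*SEW before adding it
 * to the 2*SEW accumulator a; they back the widening reductions below.
 */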
4828 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4829 {
4830     return float32_add(a, float16_to_float32(b, true, s), s);
4831 }
4832 
4833 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4834 {
4835     return float64_add(a, float32_to_float64(b, s), s);
4836 }
4837 
4838 /* Vector Widening Floating-Point Reduction Instructions */
4839 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4840 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4841 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4842 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4843 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4844 
4845 /*
4846  * Vector Mask Operations
4847  */
4848 /* Vector Mask-Register Logical Instructions */
4849 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4850 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4851                   void *vs2, CPURISCVState *env,          \
4852                   uint32_t desc)                          \
4853 {                                                         \
4854     uint32_t vl = env->vl;                                \
4855     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4856     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4857     uint32_t i;                                           \
4858     int a, b;                                             \
4859                                                           \
4860     VSTART_CHECK_EARLY_EXIT(env, vl);                     \
4861                                                           \
4862     for (i = env->vstart; i < vl; i++) {                  \
4863         a = vext_elem_mask(vs1, i);                       \
4864         b = vext_elem_mask(vs2, i);                       \
4865         vext_set_elem_mask(vd, i, OP(b, a));              \
4866     }                                                     \
4867     env->vstart = 0;                                      \
4868     /*
4869      * mask destination register is always tail-agnostic
4870      * set tail elements to 1s
4871      */                                                   \
4872     if (vta_all_1s) {                                     \
4873         for (; i < total_elems; i++) {                    \
4874             vext_set_elem_mask(vd, i, 1);                 \
4875         }                                                 \
4876     }                                                     \
4877 }
4878 
4879 #define DO_NAND(N, M)  (!(N & M))
4880 #define DO_ANDNOT(N, M)  (N & !M)
4881 #define DO_NOR(N, M)  (!(N | M))
4882 #define DO_ORNOT(N, M)  (N | !M)
4883 #define DO_XNOR(N, M)  (!(N ^ M))
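/*
 * DO_ANDNOT/DO_ORNOT rely on the mask operands being single bits (0 or 1),
 * so logical negation (!) stands in for bit complement here.
 */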
4884 
4885 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4886 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4887 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4888 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4889 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4890 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4891 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4892 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4893 
4894 /* Vector count population in mask vcpop */
4895 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4896                              uint32_t desc)
4897 {
4898     target_ulong cnt = 0;
4899     uint32_t vm = vext_vm(desc);
4900     uint32_t vl = env->vl;
4901     int i;
4902 
4903     for (i = env->vstart; i < vl; i++) {
4904         if (vm || vext_elem_mask(v0, i)) {
4905             if (vext_elem_mask(vs2, i)) {
4906                 cnt++;
4907             }
4908         }
4909     }
4910     env->vstart = 0;
4911     return cnt;
4912 }
4913 
4914 /* vfirst find-first-set mask bit */
4915 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4916                               uint32_t desc)
4917 {
4918     uint32_t vm = vext_vm(desc);
4919     uint32_t vl = env->vl;
4920     int i;
4921 
4922     for (i = env->vstart; i < vl; i++) {
4923         if (vm || vext_elem_mask(v0, i)) {
4924             if (vext_elem_mask(vs2, i)) {
4925                 return i;
4926             }
4927         }
4928     }
4929     env->vstart = 0;
4930     return -1LL;
4931 }
4932 
4933 enum set_mask_type {
4934     ONLY_FIRST = 1,
4935     INCLUDE_FIRST,
4936     BEFORE_FIRST,
4937 };
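/*
 * BEFORE_FIRST  -> vmsbf.m: 1s strictly before the first set bit, then 0s.
 * INCLUDE_FIRST -> vmsif.m: 1s up to and including the first set bit.
 * ONLY_FIRST    -> vmsof.m: a single 1 at the first set bit, 0s elsewhere.
 * Only active (unmasked) elements of vs2 participate in the search.
 */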
4938 
4939 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4940                    uint32_t desc, enum set_mask_type type)
4941 {
4942     uint32_t vm = vext_vm(desc);
4943     uint32_t vl = env->vl;
4944     uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4945     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4946     uint32_t vma = vext_vma(desc);
4947     int i;
4948     bool first_mask_bit = false;
4949 
4950     VSTART_CHECK_EARLY_EXIT(env, vl);
4951 
4952     for (i = env->vstart; i < vl; i++) {
4953         if (!vm && !vext_elem_mask(v0, i)) {
4954             /* set masked-off elements to 1s */
4955             if (vma) {
4956                 vext_set_elem_mask(vd, i, 1);
4957             }
4958             continue;
4959         }
4960         /* write a zero to all following active elements */
4961         if (first_mask_bit) {
4962             vext_set_elem_mask(vd, i, 0);
4963             continue;
4964         }
4965         if (vext_elem_mask(vs2, i)) {
4966             first_mask_bit = true;
4967             if (type == BEFORE_FIRST) {
4968                 vext_set_elem_mask(vd, i, 0);
4969             } else {
4970                 vext_set_elem_mask(vd, i, 1);
4971             }
4972         } else {
4973             if (type == ONLY_FIRST) {
4974                 vext_set_elem_mask(vd, i, 0);
4975             } else {
4976                 vext_set_elem_mask(vd, i, 1);
4977             }
4978         }
4979     }
4980     env->vstart = 0;
4981     /*
4982      * mask destination register is always tail-agnostic
4983      * set tail elements to 1s
4984      */
4985     if (vta_all_1s) {
4986         for (; i < total_elems; i++) {
4987             vext_set_elem_mask(vd, i, 1);
4988         }
4989     }
4990 }
4991 
4992 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4993                      uint32_t desc)
4994 {
4995     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4996 }
4997 
4998 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4999                      uint32_t desc)
5000 {
5001     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
5002 }
5003 
5004 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
5005                      uint32_t desc)
5006 {
5007     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
5008 }
5009 
5010 /* Vector Iota Instruction */
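/*
 * viota.m: each active element vd[i] receives the number of set bits of vs2
 * among the active elements with index < i (an exclusive prefix sum).
 */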
5011 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
5012 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
5013                   uint32_t desc)                                          \
5014 {                                                                         \
5015     uint32_t vm = vext_vm(desc);                                          \
5016     uint32_t vl = env->vl;                                                \
5017     uint32_t esz = sizeof(ETYPE);                                         \
5018     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5019     uint32_t vta = vext_vta(desc);                                        \
5020     uint32_t vma = vext_vma(desc);                                        \
5021     uint32_t sum = 0;                                                     \
5022     int i;                                                                \
5023                                                                           \
5024     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5025                                                                           \
5026     for (i = env->vstart; i < vl; i++) {                                  \
5027         if (!vm && !vext_elem_mask(v0, i)) {                              \
5028             /* set masked-off elements to 1s */                           \
5029             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5030             continue;                                                     \
5031         }                                                                 \
5032         *((ETYPE *)vd + H(i)) = sum;                                      \
5033         if (vext_elem_mask(vs2, i)) {                                     \
5034             sum++;                                                        \
5035         }                                                                 \
5036     }                                                                     \
5037     env->vstart = 0;                                                      \
5038     /* set tail elements to 1s */                                         \
5039     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5040 }
5041 
5042 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
5043 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
5044 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
5045 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
5046 
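/*
 * Illustrative sketch, not part of the upstream helpers: viota.m writes,
 * at each element, the number of mask bits set in vs2 strictly below that
 * position (an exclusive prefix sum of the mask).  The ref_viota name and
 * the plain-array operands are hypothetical.  For vs2 = {1,0,1,1,0} and
 * vl = 5 the unmasked result is {0,1,1,2,3}.
 */
static void G_GNUC_UNUSED ref_viota(uint32_t *vd, const bool *vs2, int vl)
{
    uint32_t sum = 0;               /* set bits seen so far */

    for (int i = 0; i < vl; i++) {
        vd[i] = sum;                /* prefix count excludes element i itself */
        if (vs2[i]) {
            sum++;
        }
    }
}
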
5047 /* Vector Element Index Instruction */
5048 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
5049 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
5050 {                                                                         \
5051     uint32_t vm = vext_vm(desc);                                          \
5052     uint32_t vl = env->vl;                                                \
5053     uint32_t esz = sizeof(ETYPE);                                         \
5054     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5055     uint32_t vta = vext_vta(desc);                                        \
5056     uint32_t vma = vext_vma(desc);                                        \
5057     int i;                                                                \
5058                                                                           \
5059     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5060                                                                           \
5061     for (i = env->vstart; i < vl; i++) {                                  \
5062         if (!vm && !vext_elem_mask(v0, i)) {                              \
5063             /* set masked-off elements to 1s */                           \
5064             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5065             continue;                                                     \
5066         }                                                                 \
5067         *((ETYPE *)vd + H(i)) = i;                                        \
5068     }                                                                     \
5069     env->vstart = 0;                                                      \
5070     /* set tail elements to 1s */                                         \
5071     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5072 }
5073 
5074 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5075 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5076 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5077 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
5078 
5079 /*
5080  * Vector Permutation Instructions
5081  */
5082 
5083 /* Vector Slide Instructions */
5084 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5085 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5086                   CPURISCVState *env, uint32_t desc)                      \
5087 {                                                                         \
5088     uint32_t vm = vext_vm(desc);                                          \
5089     uint32_t vl = env->vl;                                                \
5090     uint32_t esz = sizeof(ETYPE);                                         \
5091     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5092     uint32_t vta = vext_vta(desc);                                        \
5093     uint32_t vma = vext_vma(desc);                                        \
5094     target_ulong offset = s1, i_min, i;                                   \
5095                                                                           \
5096     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5097                                                                           \
5098     i_min = MAX(env->vstart, offset);                                     \
5099     for (i = i_min; i < vl; i++) {                                        \
5100         if (!vm && !vext_elem_mask(v0, i)) {                              \
5101             /* set masked-off elements to 1s */                           \
5102             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5103             continue;                                                     \
5104         }                                                                 \
5105         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5106     }                                                                     \
5107     env->vstart = 0;                                                      \
5108     /* set tail elements to 1s */                                         \
5109     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5110 }
5111 
5112 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5113 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5114 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5115 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5116 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5117 
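/*
 * Illustrative sketch, not part of the upstream helpers: vslideup.vx keeps
 * vd[0..OFFSET-1] unchanged and copies vs2[i - OFFSET] into every body
 * element i >= OFFSET.  The ref_vslideup name and plain-array operands are
 * hypothetical.  With vl = 6, OFFSET = 2 and vs2 = {a,b,c,d,e,f}, the
 * unmasked result is {vd[0], vd[1], a, b, c, d}.
 */
static void G_GNUC_UNUSED ref_vslideup(uint32_t *vd, const uint32_t *vs2,
                                       uint32_t offset, uint32_t vl)
{
    for (uint32_t i = offset; i < vl; i++) {
        vd[i] = vs2[i - offset];    /* elements below 'offset' keep old values */
    }
}
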
5118 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5119 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5120                   CPURISCVState *env, uint32_t desc)                      \
5121 {                                                                         \
5122     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5123     uint32_t vm = vext_vm(desc);                                          \
5124     uint32_t vl = env->vl;                                                \
5125     uint32_t esz = sizeof(ETYPE);                                         \
5126     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5127     uint32_t vta = vext_vta(desc);                                        \
5128     uint32_t vma = vext_vma(desc);                                        \
5129     target_ulong i_max, i_min, i;                                         \
5130                                                                           \
5131     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5132                                                                           \
5133     i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5134     i_max = MAX(i_min, env->vstart);                                      \
5135     for (i = env->vstart; i < i_max; ++i) {                               \
5136         if (!vm && !vext_elem_mask(v0, i)) {                              \
5137             /* set masked-off elements to 1s */                           \
5138             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5139             continue;                                                     \
5140         }                                                                 \
5141         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5142     }                                                                     \
5143                                                                           \
5144     for (i = i_max; i < vl; ++i) {                                        \
5145         if (!vm && !vext_elem_mask(v0, i)) {                              \
5146             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5147             continue;                                                     \
5148         }                                                                 \
5149         *((ETYPE *)vd + H(i)) = 0;                                        \
5150     }                                                                     \
5151                                                                           \
5152     env->vstart = 0;                                                      \
5153     /* set tail elements to 1s */                                         \
5154     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5155 }
5156 
5157 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5158 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5159 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5160 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5161 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5162 
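/*
 * Illustrative sketch, not part of the upstream helpers: vslidedown.vx
 * reads vs2[i + OFFSET] while the source index stays inside the register
 * group and writes zero once it runs past VLMAX, mirroring the two loops
 * above.  The ref_vslidedown name and plain-array operands are
 * hypothetical.  With vlmax = 8, vl = 6, OFFSET = 3 and
 * vs2 = {a,b,c,d,e,f,g,h}, the unmasked result is {d, e, f, g, h, 0}.
 */
static void G_GNUC_UNUSED ref_vslidedown(uint32_t *vd, const uint32_t *vs2,
                                         uint32_t offset, uint32_t vl,
                                         uint32_t vlmax)
{
    /* number of leading destination elements with an in-range source */
    uint32_t in_range = offset < vlmax ? MIN(vlmax - offset, vl) : 0;

    for (uint32_t i = 0; i < in_range; i++) {
        vd[i] = vs2[i + offset];
    }
    for (uint32_t i = in_range; i < vl; i++) {
        vd[i] = 0;                  /* source index past the register group */
    }
}
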
5163 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
5164 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5165                                  void *vs2, CPURISCVState *env,             \
5166                                  uint32_t desc)                             \
5167 {                                                                           \
5168     typedef uint##BITWIDTH##_t ETYPE;                                       \
5169     uint32_t vm = vext_vm(desc);                                            \
5170     uint32_t vl = env->vl;                                                  \
5171     uint32_t esz = sizeof(ETYPE);                                           \
5172     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5173     uint32_t vta = vext_vta(desc);                                          \
5174     uint32_t vma = vext_vma(desc);                                          \
5175     uint32_t i;                                                             \
5176                                                                             \
5177     VSTART_CHECK_EARLY_EXIT(env, vl);                                       \
5178                                                                             \
5179     for (i = env->vstart; i < vl; i++) {                                    \
5180         if (!vm && !vext_elem_mask(v0, i)) {                                \
5181             /* set masked-off elements to 1s */                             \
5182             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5183             continue;                                                       \
5184         }                                                                   \
5185         if (i == 0) {                                                       \
5186             *((ETYPE *)vd + H(i)) = s1;                                     \
5187         } else {                                                            \
5188             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5189         }                                                                   \
5190     }                                                                       \
5191     env->vstart = 0;                                                        \
5192     /* set tail elements to 1s */                                           \
5193     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5194 }
5195 
5196 GEN_VEXT_VSLIE1UP(8,  H1)
5197 GEN_VEXT_VSLIE1UP(16, H2)
5198 GEN_VEXT_VSLIE1UP(32, H4)
5199 GEN_VEXT_VSLIE1UP(64, H8)
5200 
5201 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5202 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5203                   CPURISCVState *env, uint32_t desc)              \
5204 {                                                                 \
5205     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5206 }
5207 
5208 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5209 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5210 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5211 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5212 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5213 
5214 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5215 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5216                                    void *vs2, CPURISCVState *env,             \
5217                                    uint32_t desc)                             \
5218 {                                                                             \
5219     typedef uint##BITWIDTH##_t ETYPE;                                         \
5220     uint32_t vm = vext_vm(desc);                                              \
5221     uint32_t vl = env->vl;                                                    \
5222     uint32_t esz = sizeof(ETYPE);                                             \
5223     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5224     uint32_t vta = vext_vta(desc);                                            \
5225     uint32_t vma = vext_vma(desc);                                            \
5226     uint32_t i;                                                               \
5227                                                                               \
5228     VSTART_CHECK_EARLY_EXIT(env, vl);                                         \
5229                                                                               \
5230     for (i = env->vstart; i < vl; i++) {                                      \
5231         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5232             /* set masked-off elements to 1s */                               \
5233             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5234             continue;                                                         \
5235         }                                                                     \
5236         if (i == vl - 1) {                                                    \
5237             *((ETYPE *)vd + H(i)) = s1;                                       \
5238         } else {                                                              \
5239             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5240         }                                                                     \
5241     }                                                                         \
5242     env->vstart = 0;                                                          \
5243     /* set tail elements to 1s */                                             \
5244     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5245 }
5246 
5247 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5248 GEN_VEXT_VSLIDE1DOWN(16, H2)
5249 GEN_VEXT_VSLIDE1DOWN(32, H4)
5250 GEN_VEXT_VSLIDE1DOWN(64, H8)
5251 
5252 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5253 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5254                   CPURISCVState *env, uint32_t desc)              \
5255 {                                                                 \
5256     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5257 }
5258 
5259 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5260 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5261 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5262 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5263 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5264 
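/*
 * Illustrative sketch, not part of the upstream helpers: vslide1up shifts
 * the source up by one element and puts the scalar in element 0, while
 * vslide1down shifts it down by one and puts the scalar in element vl - 1;
 * the vfslide1up/vfslide1down helpers below reuse the same data movement
 * with an FP scalar.  The ref_vslide1down name and plain-array operands
 * are hypothetical.  With vl = 4, s1 = X and vs2 = {a,b,c,d}:
 *   vslide1up   -> {X, a, b, c}
 *   vslide1down -> {b, c, d, X}
 */
static void G_GNUC_UNUSED ref_vslide1down(uint32_t *vd, const uint32_t *vs2,
                                          uint32_t s1, uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = (i == vl - 1) ? s1 : vs2[i + 1];
    }
}
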
5265 /* Vector Floating-Point Slide Instructions */
5266 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5267 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5268                   CPURISCVState *env, uint32_t desc)          \
5269 {                                                             \
5270     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5271 }
5272 
5273 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5274 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5275 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5276 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5277 
5278 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5279 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5280                   CPURISCVState *env, uint32_t desc)          \
5281 {                                                             \
5282     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5283 }
5284 
5285 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5286 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5287 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5288 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5289 
5290 /* Vector Register Gather Instruction */
5291 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5292 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5293                   CPURISCVState *env, uint32_t desc)                      \
5294 {                                                                         \
5295     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5296     uint32_t vm = vext_vm(desc);                                          \
5297     uint32_t vl = env->vl;                                                \
5298     uint32_t esz = sizeof(TS2);                                           \
5299     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5300     uint32_t vta = vext_vta(desc);                                        \
5301     uint32_t vma = vext_vma(desc);                                        \
5302     uint64_t index;                                                       \
5303     uint32_t i;                                                           \
5304                                                                           \
5305     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5306                                                                           \
5307     for (i = env->vstart; i < vl; i++) {                                  \
5308         if (!vm && !vext_elem_mask(v0, i)) {                              \
5309             /* set masked-off elements to 1s */                           \
5310             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5311             continue;                                                     \
5312         }                                                                 \
5313         index = *((TS1 *)vs1 + HS1(i));                                   \
5314         if (index >= vlmax) {                                             \
5315             *((TS2 *)vd + HS2(i)) = 0;                                    \
5316         } else {                                                          \
5317             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5318         }                                                                 \
5319     }                                                                     \
5320     env->vstart = 0;                                                      \
5321     /* set tail elements to 1s */                                         \
5322     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5323 }
5324 
5325 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5326 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5327 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5328 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5329 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5330 
5331 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5332 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5333 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5334 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5335 
5336 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5337 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5338                   CPURISCVState *env, uint32_t desc)                      \
5339 {                                                                         \
5340     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5341     uint32_t vm = vext_vm(desc);                                          \
5342     uint32_t vl = env->vl;                                                \
5343     uint32_t esz = sizeof(ETYPE);                                         \
5344     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5345     uint32_t vta = vext_vta(desc);                                        \
5346     uint32_t vma = vext_vma(desc);                                        \
5347     uint64_t index = s1;                                                  \
5348     uint32_t i;                                                           \
5349                                                                           \
5350     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5351                                                                           \
5352     for (i = env->vstart; i < vl; i++) {                                  \
5353         if (!vm && !vext_elem_mask(v0, i)) {                              \
5354             /* set masked-off elements to 1s */                           \
5355             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5356             continue;                                                     \
5357         }                                                                 \
5358         if (index >= vlmax) {                                             \
5359             *((ETYPE *)vd + H(i)) = 0;                                    \
5360         } else {                                                          \
5361             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5362         }                                                                 \
5363     }                                                                     \
5364     env->vstart = 0;                                                      \
5365     /* set tail elements to 1s */                                         \
5366     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5367 }
5368 
5369 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5370 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5371 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5372 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5373 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5374 
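/*
 * Illustrative sketch, not part of the upstream helpers: vrgather reads an
 * index per destination element (from vs1 for .vv, the scalar rs1 for .vx)
 * and writes vs2[index], or 0 when the index is out of range.  The
 * ref_vrgather name and plain-array operands are hypothetical.  With
 * vlmax = 4, vs2 = {a,b,c,d} and indices {3,0,7,1}, the unmasked result is
 * {d, a, 0, b}.
 */
static void G_GNUC_UNUSED ref_vrgather(uint32_t *vd, const uint64_t *idx,
                                       const uint32_t *vs2, uint32_t vl,
                                       uint32_t vlmax)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = (idx[i] >= vlmax) ? 0 : vs2[idx[i]];
    }
}
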
5375 /* Vector Compress Instruction */
5376 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5377 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5378                   CPURISCVState *env, uint32_t desc)                      \
5379 {                                                                         \
5380     uint32_t vl = env->vl;                                                \
5381     uint32_t esz = sizeof(ETYPE);                                         \
5382     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5383     uint32_t vta = vext_vta(desc);                                        \
5384     uint32_t num = 0, i;                                                  \
5385                                                                           \
5386     VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
5387                                                                           \
5388     for (i = env->vstart; i < vl; i++) {                                  \
5389         if (!vext_elem_mask(vs1, i)) {                                    \
5390             continue;                                                     \
5391         }                                                                 \
5392         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5393         num++;                                                            \
5394     }                                                                     \
5395     env->vstart = 0;                                                      \
5396     /* set tail elements to 1s */                                         \
5397     vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5398 }
5399 
5400 /* Compress into vd elements of vs2 where vs1 is enabled */
5401 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5402 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5403 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5404 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5405 
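/*
 * Illustrative sketch, not part of the upstream helpers: vcompress.vm
 * packs the vs2 elements whose vs1 mask bit is set into the low end of
 * vd, preserving order; the elements at and above the packed count are
 * left for tail handling.  The ref_vcompress name and plain-array
 * operands are hypothetical.  With vl = 5, vs2 = {a,b,c,d,e} and
 * vs1 = {1,0,1,1,0}, the packed result begins with {a, c, d}.
 */
static uint32_t G_GNUC_UNUSED ref_vcompress(uint32_t *vd, const bool *vs1,
                                            const uint32_t *vs2, uint32_t vl)
{
    uint32_t num = 0;               /* next free slot in vd */

    for (uint32_t i = 0; i < vl; i++) {
        if (vs1[i]) {
            vd[num++] = vs2[i];
        }
    }
    return num;                     /* number of elements packed */
}
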
5406 /* Vector Whole Register Move */
5407 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5408 {
5409     /* EEW = SEW */
5410     uint32_t maxsz = simd_maxsz(desc);
5411     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5412     uint32_t startb = env->vstart * sewb;
5413     uint32_t i = startb;
5414 
5415     if (startb >= maxsz) {
5416         env->vstart = 0;
5417         return;
5418     }
5419 
5420     if (HOST_BIG_ENDIAN && i % 8 != 0) {
5421         uint32_t j = ROUND_UP(i, 8);
5422         memcpy((uint8_t *)vd + H1(j - 1),
5423                (uint8_t *)vs2 + H1(j - 1),
5424                j - i);
5425         i = j;
5426     }
5427 
5428     memcpy((uint8_t *)vd + H1(i),
5429            (uint8_t *)vs2 + H1(i),
5430            maxsz - i);
5431 
5432     env->vstart = 0;
5433 }
5434 
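/*
 * Illustrative sketch, not part of the upstream helper: on a little-endian
 * host the whole-register move above reduces to one byte copy from the
 * resume point (vstart scaled to bytes) up to the end of the register
 * group; the H1() adjustment only matters on big-endian hosts, where
 * element bytes within an 8-byte lane are stored in reversed host order,
 * which is why a partially completed lane gets the separate fixup copy.
 * The ref_vmvr name and operands are hypothetical.
 */
static void G_GNUC_UNUSED ref_vmvr(uint8_t *vd, const uint8_t *vs2,
                                   uint32_t startb, uint32_t maxsz)
{
    if (startb < maxsz) {
        memcpy(vd + startb, vs2 + startb, maxsz - startb);
    }
}
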
5435 /* Vector Integer Extension */
5436 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5437 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5438                   CPURISCVState *env, uint32_t desc)             \
5439 {                                                                \
5440     uint32_t vl = env->vl;                                       \
5441     uint32_t vm = vext_vm(desc);                                 \
5442     uint32_t esz = sizeof(ETYPE);                                \
5443     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5444     uint32_t vta = vext_vta(desc);                               \
5445     uint32_t vma = vext_vma(desc);                               \
5446     uint32_t i;                                                  \
5447                                                                  \
5448     VSTART_CHECK_EARLY_EXIT(env, vl);                            \
5449                                                                  \
5450     for (i = env->vstart; i < vl; i++) {                         \
5451         if (!vm && !vext_elem_mask(v0, i)) {                     \
5452             /* set masked-off elements to 1s */                  \
5453             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5454             continue;                                            \
5455         }                                                        \
5456         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5457     }                                                            \
5458     env->vstart = 0;                                             \
5459     /* set tail elements to 1s */                                \
5460     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5461 }
5462 
5463 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5464 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5465 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5466 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5467 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5468 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5469 
5470 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5471 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5472 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5473 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5474 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5475 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
5476
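/*
 * Illustrative sketch, not part of the upstream helpers: the vzext/vsext
 * variants widen each narrow source element to the destination SEW, with
 * or without sign extension.  The ref_vsext_vf2 name and plain-array
 * operands are hypothetical.  For source bytes {0xff, 0x02}:
 *   vsext.vf2 -> { -1, 2 }   (sign-extended)
 *   vzext.vf2 -> { 255, 2 }  (zero-extended)
 */
static void G_GNUC_UNUSED ref_vsext_vf2(int16_t *vd, const int8_t *vs2,
                                        uint32_t vl)
{
    for (uint32_t i = 0; i < vl; i++) {
        vd[i] = vs2[i];             /* implicit sign extension to 16 bits */
    }
}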