xref: /openbmc/qemu/target/riscv/vector_helper.c (revision feef1866d1366d651e6a3cb8c9cf1a9aabb81395)
1  /*
2   * RISC-V Vector Extension Helpers for QEMU.
3   *
4   * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5   *
6   * This program is free software; you can redistribute it and/or modify it
7   * under the terms and conditions of the GNU General Public License,
8   * version 2 or later, as published by the Free Software Foundation.
9   *
10   * This program is distributed in the hope it will be useful, but WITHOUT
11   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13   * more details.
14   *
15   * You should have received a copy of the GNU General Public License along with
16   * this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  
19  #include "qemu/osdep.h"
20  #include "qemu/host-utils.h"
21  #include "qemu/bitops.h"
22  #include "cpu.h"
23  #include "exec/memop.h"
24  #include "exec/exec-all.h"
25  #include "exec/cpu_ldst.h"
26  #include "exec/page-protection.h"
27  #include "exec/helper-proto.h"
28  #include "fpu/softfloat.h"
29  #include "tcg/tcg-gvec-desc.h"
30  #include "internals.h"
31  #include "vector_internals.h"
32  #include <math.h>
33  
34  target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
35                              target_ulong s2)
36  {
37      int vlmax, vl;
38      RISCVCPU *cpu = env_archcpu(env);
39      uint64_t vlmul = FIELD_EX64(s2, VTYPE, VLMUL);
40      uint8_t vsew = FIELD_EX64(s2, VTYPE, VSEW);
41      uint16_t sew = 8 << vsew;
42      uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
43      int xlen = riscv_cpu_xlen(env);
44      bool vill = (s2 >> (xlen - 1)) & 0x1;
45      target_ulong reserved = s2 &
46                              MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
47                                              xlen - 1 - R_VTYPE_RESERVED_SHIFT);
48      uint16_t vlen = cpu->cfg.vlenb << 3;
49      int8_t lmul;
50  
51      if (vlmul & 4) {
52          /*
53           * Fractional LMUL, check:
54           *
55           * VLEN * LMUL >= SEW
56           * VLEN >> (8 - lmul) >= sew
57           * (vlenb << 3) >> (8 - lmul) >= sew
58           */
59          if (vlmul == 4 || (vlen >> (8 - vlmul)) < sew) {
60              vill = true;
61          }
62      }
63  
64      if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
65          /* only set vill bit. */
66          env->vill = 1;
67          env->vtype = 0;
68          env->vl = 0;
69          env->vstart = 0;
70          return 0;
71      }
72  
73      /* lmul encoded as in DisasContext::lmul */
74      lmul = sextract32(FIELD_EX64(s2, VTYPE, VLMUL), 0, 3);
75      vlmax = vext_get_vlmax(cpu->cfg.vlenb, vsew, lmul);
76      if (s1 <= vlmax) {
77          vl = s1;
78      } else if (s1 < 2 * vlmax && cpu->cfg.rvv_vl_half_avl) {
79          vl = (s1 + 1) >> 1;
80      } else {
81          vl = vlmax;
82      }
83      env->vl = vl;
84      env->vtype = s2;
85      env->vstart = 0;
86      env->vill = 0;
87      return vl;
88  }
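
/*
 * Worked example (one possible configuration, not exhaustive): with
 * VLEN = 128 (vlenb = 16), vsew = 2 (SEW = 32) and LMUL = 1, vlmax is
 * VLEN / SEW * LMUL = 4.  A request of s1 = 3 yields vl = 3 and s1 = 10
 * yields vl = vlmax = 4; with rvv_vl_half_avl enabled, s1 = 6 (between
 * vlmax and 2 * vlmax) yields vl = (6 + 1) >> 1 = 3.  For the fractional
 * LMUL check above, SEW = 64 with vlmul = 6 (LMUL = 1/4) fails because
 * 128 >> (8 - 6) = 32 < 64, so vill is set.
 */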
89  
90  /*
91   * Get the maximum number of elements that can be operated on.
92   *
93   * log2_esz: log2 of element size in bytes.
94   */
95  static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
96  {
97      /*
98       * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
99       * so vlen in bytes (vlenb) is encoded as maxsz.
100       */
101      uint32_t vlenb = simd_maxsz(desc);
102  
103      /* Return VLMAX */
104      int scale = vext_lmul(desc) - log2_esz;
105      return scale < 0 ? vlenb >> -scale : vlenb << scale;
106  }
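
/*
 * Example: with vlenb = 16 (VLEN = 128), a register group of LMUL = 2
 * (vext_lmul(desc) = 1) and 32-bit elements (log2_esz = 2), scale is
 * 1 - 2 = -1, so VLMAX = 16 >> 1 = 8, i.e. VLEN * LMUL / SEW elements.
 */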
107  
108  static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
109  {
110      return (addr & ~env->cur_pmmask) | env->cur_pmbase;
111  }
112  
113  /*
114   * This function checks watchpoints before the real load operation.
115   *
116   * In system mode, the TLB API probe_access is enough for the watchpoint check.
117   * In user mode, there is no watchpoint support for now.
118   *
119   * It will trigger an exception if there is no mapping in the TLB
120   * and the page table walk can't fill the TLB entry. Then the guest
121   * software can return here after processing the exception, or never return.
122   */
123  static void probe_pages(CPURISCVState *env, target_ulong addr,
124                          target_ulong len, uintptr_t ra,
125                          MMUAccessType access_type)
126  {
127      target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
128      target_ulong curlen = MIN(pagelen, len);
129      int mmu_index = riscv_env_mmu_index(env, false);
130  
131      probe_access(env, adjust_addr(env, addr), curlen, access_type,
132                   mmu_index, ra);
133      if (len > curlen) {
134          addr += curlen;
135          curlen = len - curlen;
136          probe_access(env, adjust_addr(env, addr), curlen, access_type,
137                       mmu_index, ra);
138      }
139  }
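
/*
 * Example, assuming 4 KiB target pages: for an addr ending in 0xff8 and
 * len = 16, pagelen = -(addr | TARGET_PAGE_MASK) = 8, so the first
 * probe_access() covers the 8 bytes left in the first page and the
 * second covers the remaining 8 bytes at the start of the next page.
 */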
140  
141  static inline void vext_set_elem_mask(void *v0, int index,
142                                        uint8_t value)
143  {
144      int idx = index / 64;
145      int pos = index % 64;
146      uint64_t old = ((uint64_t *)v0)[idx];
147      ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
148  }
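
/*
 * Example: index 70 updates bit 70 % 64 = 6 of the second (idx = 1)
 * 64-bit word of the mask register; deposit64 writes only that single
 * bit and leaves all other bits untouched.
 */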
149  
150  /* element operations for load and store */
151  typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
152                                     uint32_t idx, void *vd, uintptr_t retaddr);
153  typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
154  
155  #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)             \
156  static inline QEMU_ALWAYS_INLINE                            \
157  void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
158                  uint32_t idx, void *vd, uintptr_t retaddr)  \
159  {                                                           \
160      ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
161      *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);       \
162  }                                                           \
163                                                              \
164  static inline QEMU_ALWAYS_INLINE                            \
165  void NAME##_host(void *vd, uint32_t idx, void *host)        \
166  {                                                           \
167      ETYPE *cur = ((ETYPE *)vd + H(idx));                    \
168      *cur = (ETYPE)LDSUF##_p(host);                          \
169  }
170  
171  GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
172  GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
173  GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
174  GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
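
/*
 * For reference, GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw) above expands
 * roughly to:
 *
 *   void lde_h_tlb(CPURISCVState *env, abi_ptr addr,
 *                  uint32_t idx, void *vd, uintptr_t retaddr)
 *   {
 *       uint16_t *cur = ((uint16_t *)vd + H2(idx));
 *       *cur = cpu_lduw_data_ra(env, addr, retaddr);
 *   }
 *
 *   void lde_h_host(void *vd, uint32_t idx, void *host)
 *   {
 *       uint16_t *cur = ((uint16_t *)vd + H2(idx));
 *       *cur = (uint16_t)lduw_p(host);
 *   }
 *
 * i.e. the _tlb variant goes through the softmmu load path (and can fault,
 * reporting retaddr), while the _host variant reads directly from an
 * already-probed host pointer.
 */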
175  
176  #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)             \
177  static inline QEMU_ALWAYS_INLINE                            \
178  void NAME##_tlb(CPURISCVState *env, abi_ptr addr,           \
179                  uint32_t idx, void *vd, uintptr_t retaddr)  \
180  {                                                           \
181      ETYPE data = *((ETYPE *)vd + H(idx));                   \
182      cpu_##STSUF##_data_ra(env, addr, data, retaddr);        \
183  }                                                           \
184                                                              \
185  static inline QEMU_ALWAYS_INLINE                            \
186  void NAME##_host(void *vd, uint32_t idx, void *host)        \
187  {                                                           \
188      ETYPE data = *((ETYPE *)vd + H(idx));                   \
189      STSUF##_p(host, data);                                  \
190  }
191  
192  GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
193  GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
194  GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
195  GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
196  
197  static inline QEMU_ALWAYS_INLINE void
198  vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
199                         void *vd, uint32_t evl, target_ulong addr,
200                         uint32_t reg_start, uintptr_t ra, uint32_t esz,
201                         bool is_load)
202  {
203      uint32_t i;
204      for (i = env->vstart; i < evl; env->vstart = ++i, addr += esz) {
205          ldst_tlb(env, adjust_addr(env, addr), i, vd, ra);
206      }
207  }
208  
209  static inline QEMU_ALWAYS_INLINE void
210  vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
211                          void *vd, uint32_t evl, uint32_t reg_start, void *host,
212                          uint32_t esz, bool is_load)
213  {
214  #if HOST_BIG_ENDIAN
215      for (; reg_start < evl; reg_start++, host += esz) {
216          ldst_host(vd, reg_start, host);
217      }
218  #else
219      if (esz == 1) {
220          uint32_t byte_offset = reg_start * esz;
221          uint32_t size = (evl - reg_start) * esz;
222  
223          if (is_load) {
224              memcpy(vd + byte_offset, host, size);
225          } else {
226              memcpy(host, vd + byte_offset, size);
227          }
228      } else {
229          for (; reg_start < evl; reg_start++, host += esz) {
230              ldst_host(vd, reg_start, host);
231          }
232      }
233  #endif
234  }
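
/*
 * On little-endian hosts the byte (esz == 1) case is satisfied with a
 * single memcpy, since the in-register byte order then matches guest
 * memory; wider elements fall back to the per-element ldst_host
 * callbacks, and big-endian hosts always use the callbacks so that the
 * H() index adjustment is applied.
 */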
235  
236  static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
237                                     uint32_t desc, uint32_t nf,
238                                     uint32_t esz, uint32_t max_elems)
239  {
240      uint32_t vta = vext_vta(desc);
241      int k;
242  
243      if (vta == 0) {
244          return;
245      }
246  
247      for (k = 0; k < nf; ++k) {
248          vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
249                            (k * max_elems + max_elems) * esz);
250      }
251  }
252  
253  /*
254   * stride: access vector elements from strided memory
255   */
256  static void
257  vext_ldst_stride(void *vd, void *v0, target_ulong base, target_ulong stride,
258                   CPURISCVState *env, uint32_t desc, uint32_t vm,
259                   vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
260                   uintptr_t ra)
261  {
262      uint32_t i, k;
263      uint32_t nf = vext_nf(desc);
264      uint32_t max_elems = vext_max_elems(desc, log2_esz);
265      uint32_t esz = 1 << log2_esz;
266      uint32_t vma = vext_vma(desc);
267  
268      VSTART_CHECK_EARLY_EXIT(env);
269  
270      for (i = env->vstart; i < env->vl; env->vstart = ++i) {
271          k = 0;
272          while (k < nf) {
273              if (!vm && !vext_elem_mask(v0, i)) {
274                  /* set masked-off elements to 1s */
275                  vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
276                                    (i + k * max_elems + 1) * esz);
277                  k++;
278                  continue;
279              }
280              target_ulong addr = base + stride * i + (k << log2_esz);
281              ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
282              k++;
283          }
284      }
285      env->vstart = 0;
286  
287      vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
288  }
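
/*
 * Access pattern example: vlse32_v with stride = 12 and nf = 1 loads
 * element i from base + 12 * i.  For a segment load with nf = 2 and
 * esz = 4, field k of element i comes from base + stride * i + 4 * k
 * and is written to vd at element index i + k * max_elems, i.e. each
 * field lands in its own destination register group.
 */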
289  
290  #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
291  void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
292                    target_ulong stride, CPURISCVState *env,              \
293                    uint32_t desc)                                        \
294  {                                                                       \
295      uint32_t vm = vext_vm(desc);                                        \
296      vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
297                       ctzl(sizeof(ETYPE)), GETPC());                     \
298  }
299  
300  GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
301  GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
302  GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
303  GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
304  
305  #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
306  void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
307                    target_ulong stride, CPURISCVState *env,              \
308                    uint32_t desc)                                        \
309  {                                                                       \
310      uint32_t vm = vext_vm(desc);                                        \
311      vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
312                       ctzl(sizeof(ETYPE)), GETPC());                     \
313  }
314  
315  GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
316  GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
317  GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
318  GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
319  
320  /*
321   * unit-stride: access elements stored contiguously in memory
322   */
323  
324  /* unmasked unit-stride load and store operation */
325  static inline QEMU_ALWAYS_INLINE void
326  vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
327                    uint32_t elems, uint32_t nf, uint32_t max_elems,
328                    uint32_t log2_esz, bool is_load, int mmu_index,
329                    vext_ldst_elem_fn_tlb *ldst_tlb,
330                    vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
331  {
332      void *host;
333      int i, k, flags;
334      uint32_t esz = 1 << log2_esz;
335      uint32_t size = (elems * nf) << log2_esz;
336      uint32_t evl = env->vstart + elems;
337      MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
338  
339      /* Check page permission/pmp/watchpoint/etc. */
340      flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
341                                 mmu_index, true, &host, ra);
342  
343      if (flags == 0) {
344          if (nf == 1) {
345              vext_continus_ldst_host(env, ldst_host, vd, evl, env->vstart, host,
346                                      esz, is_load);
347          } else {
348              for (i = env->vstart; i < evl; ++i) {
349                  k = 0;
350                  while (k < nf) {
351                      ldst_host(vd, i + k * max_elems, host);
352                      host += esz;
353                      k++;
354                  }
355              }
356          }
357          env->vstart += elems;
358      } else {
359          if (nf == 1) {
360              vext_continus_ldst_tlb(env, ldst_tlb, vd, evl, addr, env->vstart,
361                                     ra, esz, is_load);
362          } else {
363              /* load bytes from guest memory */
364              for (i = env->vstart; i < evl; env->vstart = ++i) {
365                  k = 0;
366                  while (k < nf) {
367                      ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
368                               vd, ra);
369                      addr += esz;
370                      k++;
371                  }
372              }
373          }
374      }
375  }
376  
377  static inline QEMU_ALWAYS_INLINE void
378  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
379               vext_ldst_elem_fn_tlb *ldst_tlb,
380               vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
381               uint32_t evl, uintptr_t ra, bool is_load)
382  {
383      uint32_t k;
384      target_ulong page_split, elems, addr;
385      uint32_t nf = vext_nf(desc);
386      uint32_t max_elems = vext_max_elems(desc, log2_esz);
387      uint32_t esz = 1 << log2_esz;
388      uint32_t msize = nf * esz;
389      int mmu_index = riscv_env_mmu_index(env, false);
390  
391      if (env->vstart >= evl) {
392          env->vstart = 0;
393          return;
394      }
395  
396      /* Calculate the page range of first page */
397      addr = base + ((env->vstart * nf) << log2_esz);
398      page_split = -(addr | TARGET_PAGE_MASK);
399      /* Get number of elements */
400      elems = page_split / msize;
401      if (unlikely(env->vstart + elems >= evl)) {
402          elems = evl - env->vstart;
403      }
404  
405      /* Load/store elements in the first page */
406      if (likely(elems)) {
407          vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
408                            is_load, mmu_index, ldst_tlb, ldst_host, ra);
409      }
410  
411      /* Load/store elements in the second page */
412      if (unlikely(env->vstart < evl)) {
413          /* Cross page element */
414          if (unlikely(page_split % msize)) {
415              for (k = 0; k < nf; k++) {
416                  addr = base + ((env->vstart * nf + k) << log2_esz);
417                  ldst_tlb(env, adjust_addr(env, addr),
418                          env->vstart + k * max_elems, vd, ra);
419              }
420              env->vstart++;
421          }
422  
423          addr = base + ((env->vstart * nf) << log2_esz);
424          /* Get number of elements of second page */
425          elems = evl - env->vstart;
426  
427          /* Load/store elements in the second page */
428          vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
429                            is_load, mmu_index, ldst_tlb, ldst_host, ra);
430      }
431  
432      env->vstart = 0;
433      vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
434  }
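
/*
 * Example, assuming 4 KiB pages: for vle32_v (nf = 1, esz = 4) with a
 * base address that leaves page_split = 10 bytes in the first page,
 * elems = 10 / 4 = 2 elements are handled by the first
 * vext_page_ldst_us() call, the element straddling the page boundary
 * (page_split % msize != 0) is loaded element-wise via ldst_tlb, and
 * the rest of the vector is handled by the second call.
 */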
435  
436  /*
437   * A masked unit-stride load or store operation is a special case of a
438   * strided operation, with stride = NF * sizeof(ETYPE)
439   */
440  
441  #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
442  void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
443                           CPURISCVState *env, uint32_t desc)         \
444  {                                                                   \
445      uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
446      vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
447                       LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
448  }                                                                   \
449                                                                      \
450  void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
451                    CPURISCVState *env, uint32_t desc)                \
452  {                                                                   \
453      vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
454                   ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
455  }
456  
457  GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
458  GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
459  GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
460  GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
461  
462  #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
463  void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
464                           CPURISCVState *env, uint32_t desc)              \
465  {                                                                        \
466      uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
467      vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
468                       STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
469  }                                                                        \
470                                                                           \
471  void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
472                    CPURISCVState *env, uint32_t desc)                     \
473  {                                                                        \
474      vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
475                   ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
476  }
477  
478  GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
479  GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
480  GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
481  GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
482  
483  /*
484   * unit-stride mask load and store, EEW = 1
485   */
486  void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
487                      CPURISCVState *env, uint32_t desc)
488  {
489      /* evl = ceil(vl/8) */
490      uint8_t evl = (env->vl + 7) >> 3;
491      vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
492                   0, evl, GETPC(), true);
493  }
494  
495  void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
496                      CPURISCVState *env, uint32_t desc)
497  {
498      /* evl = ceil(vl/8) */
499      uint8_t evl = (env->vl + 7) >> 3;
500      vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
501                   0, evl, GETPC(), false);
502  }
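
/*
 * Example: with vl = 17, evl = (17 + 7) >> 3 = 3, so vlm.v/vsm.v
 * transfer exactly three mask bytes (EEW = 8), effectively independent
 * of the current SEW/LMUL setting.
 */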
503  
504  /*
505   * index: access vector elements from indexed memory
506   */
507  typedef target_ulong vext_get_index_addr(target_ulong base,
508          uint32_t idx, void *vs2);
509  
510  #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
511  static target_ulong NAME(target_ulong base,            \
512                           uint32_t idx, void *vs2)      \
513  {                                                      \
514      return (base + *((ETYPE *)vs2 + H(idx)));          \
515  }
516  
517  GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
518  GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
519  GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
520  GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
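
/*
 * Example: for the vlxei16 variants, vs2 holds 16-bit offsets; idx_h
 * returns base plus the H2-adjusted i-th uint16_t, so with base = 0x1000
 * and vs2 element 3 equal to 0x20, element 3 is accessed at 0x1020.
 */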
521  
522  static inline void
523  vext_ldst_index(void *vd, void *v0, target_ulong base,
524                  void *vs2, CPURISCVState *env, uint32_t desc,
525                  vext_get_index_addr get_index_addr,
526                  vext_ldst_elem_fn_tlb *ldst_elem,
527                  uint32_t log2_esz, uintptr_t ra)
528  {
529      uint32_t i, k;
530      uint32_t nf = vext_nf(desc);
531      uint32_t vm = vext_vm(desc);
532      uint32_t max_elems = vext_max_elems(desc, log2_esz);
533      uint32_t esz = 1 << log2_esz;
534      uint32_t vma = vext_vma(desc);
535  
536      VSTART_CHECK_EARLY_EXIT(env);
537  
538      /* load bytes from guest memory */
539      for (i = env->vstart; i < env->vl; env->vstart = ++i) {
540          k = 0;
541          while (k < nf) {
542              if (!vm && !vext_elem_mask(v0, i)) {
543                  /* set masked-off elements to 1s */
544                  vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
545                                    (i + k * max_elems + 1) * esz);
546                  k++;
547                  continue;
548              }
549              abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
550              ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
551              k++;
552          }
553      }
554      env->vstart = 0;
555  
556      vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
557  }
558  
559  #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
560  void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
561                    void *vs2, CPURISCVState *env, uint32_t desc)            \
562  {                                                                          \
563      vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
564                      LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
565  }
566  
567  GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
568  GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
569  GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
570  GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
571  GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
572  GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
573  GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
574  GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
575  GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
576  GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
577  GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
578  GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
579  GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
580  GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
581  GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
582  GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
583  
584  #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
585  void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
586                    void *vs2, CPURISCVState *env, uint32_t desc)  \
587  {                                                                \
588      vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
589                      STORE_FN, ctzl(sizeof(ETYPE)),               \
590                      GETPC());                                    \
591  }
592  
593  GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
594  GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
595  GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
596  GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
597  GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
598  GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
599  GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
600  GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
601  GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
602  GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
603  GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
604  GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
605  GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
606  GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
607  GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
608  GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
609  
610  /*
611   * unit-stride fault-only-first load instructions
612   */
613  static inline void
614  vext_ldff(void *vd, void *v0, target_ulong base, CPURISCVState *env,
615            uint32_t desc, vext_ldst_elem_fn_tlb *ldst_tlb,
616            vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz, uintptr_t ra)
617  {
618      uint32_t i, k, vl = 0;
619      uint32_t nf = vext_nf(desc);
620      uint32_t vm = vext_vm(desc);
621      uint32_t max_elems = vext_max_elems(desc, log2_esz);
622      uint32_t esz = 1 << log2_esz;
623      uint32_t msize = nf * esz;
624      uint32_t vma = vext_vma(desc);
625      target_ulong addr, offset, remain, page_split, elems;
626      int mmu_index = riscv_env_mmu_index(env, false);
627  
628      VSTART_CHECK_EARLY_EXIT(env);
629  
630      /* probe every access */
631      for (i = env->vstart; i < env->vl; i++) {
632          if (!vm && !vext_elem_mask(v0, i)) {
633              continue;
634          }
635          addr = adjust_addr(env, base + i * (nf << log2_esz));
636          if (i == 0) {
637              /* Allow fault on first element. */
638              probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
639          } else {
640              remain = nf << log2_esz;
641              while (remain > 0) {
642                  void *host;
643                  int flags;
644  
645                  offset = -(addr | TARGET_PAGE_MASK);
646  
647                  /* Probe nonfault on subsequent elements. */
648                  flags = probe_access_flags(env, addr, offset, MMU_DATA_LOAD,
649                                             mmu_index, true, &host, 0);
650  
651                  /*
652                   * Stop if invalid (unmapped) or mmio (transaction may fail).
653                   * Do not stop if watchpoint, as the spec says that
654                   * first-fault should continue to access the same
655                   * elements regardless of any watchpoint.
656                   */
657                  if (flags & ~TLB_WATCHPOINT) {
658                      vl = i;
659                      goto ProbeSuccess;
660                  }
661                  if (remain <= offset) {
662                      break;
663                  }
664                  remain -= offset;
665                  addr = adjust_addr(env, addr + offset);
666              }
667          }
668      }
669  ProbeSuccess:
670      /* load bytes from guest memory */
671      if (vl != 0) {
672          env->vl = vl;
673      }
674  
675      if (env->vstart < env->vl) {
676          if (vm) {
677              /* Calculate the page range of first page */
678              addr = base + ((env->vstart * nf) << log2_esz);
679              page_split = -(addr | TARGET_PAGE_MASK);
680              /* Get number of elements */
681              elems = page_split / msize;
682              if (unlikely(env->vstart + elems >= env->vl)) {
683                  elems = env->vl - env->vstart;
684              }
685  
686              /* Load/store elements in the first page */
687              if (likely(elems)) {
688                  vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
689                                    log2_esz, true, mmu_index, ldst_tlb,
690                                    ldst_host, ra);
691              }
692  
693              /* Load/store elements in the second page */
694              if (unlikely(env->vstart < env->vl)) {
695                  /* Cross page element */
696                  if (unlikely(page_split % msize)) {
697                      for (k = 0; k < nf; k++) {
698                          addr = base + ((env->vstart * nf + k) << log2_esz);
699                          ldst_tlb(env, adjust_addr(env, addr),
700                                   env->vstart + k * max_elems, vd, ra);
701                      }
702                      env->vstart++;
703                  }
704  
705                  addr = base + ((env->vstart * nf) << log2_esz);
706                  /* Get number of elements of second page */
707                  elems = env->vl - env->vstart;
708  
709                  /* Load/store elements in the second page */
710                  vext_page_ldst_us(env, vd, addr, elems, nf, max_elems,
711                                    log2_esz, true, mmu_index, ldst_tlb,
712                                    ldst_host, ra);
713              }
714          } else {
715              for (i = env->vstart; i < env->vl; i++) {
716                  k = 0;
717                  while (k < nf) {
718                      if (!vext_elem_mask(v0, i)) {
719                          /* set masked-off elements to 1s */
720                          vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
721                                            (i + k * max_elems + 1) * esz);
722                          k++;
723                          continue;
724                      }
725                      addr = base + ((i * nf + k) << log2_esz);
726                      ldst_tlb(env, adjust_addr(env, addr), i + k * max_elems,
727                               vd, ra);
728                      k++;
729                  }
730              }
731          }
732      }
733      env->vstart = 0;
734  
735      vext_set_tail_elems_1s(env->vl, vd, desc, nf, esz, max_elems);
736  }
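
/*
 * Fault-only-first behaviour illustrated: the first element (i == 0) is
 * probed with a real return address, so a fault there traps as usual.
 * If the nonfault probe for some later element i finds an unmapped page
 * or MMIO, vl is trimmed to i and only elements [vstart, i) are loaded;
 * a watchpoint flag alone does not trim vl.
 */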
737  
738  #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
739  void HELPER(NAME)(void *vd, void *v0, target_ulong base,        \
740                    CPURISCVState *env, uint32_t desc)            \
741  {                                                               \
742      vext_ldff(vd, v0, base, env, desc, LOAD_FN_TLB,             \
743                LOAD_FN_HOST, ctzl(sizeof(ETYPE)), GETPC());      \
744  }
745  
746  GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb, lde_b_host)
747  GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb, lde_h_host)
748  GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb, lde_w_host)
749  GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb, lde_d_host)
750  
751  #define DO_SWAP(N, M) (M)
752  #define DO_AND(N, M)  (N & M)
753  #define DO_XOR(N, M)  (N ^ M)
754  #define DO_OR(N, M)   (N | M)
755  #define DO_ADD(N, M)  (N + M)
756  
757  /* Signed min/max */
758  #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
759  #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
760  
761  /*
762   * load and store whole register instructions
763   */
764  static inline QEMU_ALWAYS_INLINE void
765  vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
766                  vext_ldst_elem_fn_tlb *ldst_tlb,
767                  vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
768                  uintptr_t ra, bool is_load)
769  {
770      target_ulong page_split, elems, addr;
771      uint32_t nf = vext_nf(desc);
772      uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
773      uint32_t max_elems = vlenb >> log2_esz;
774      uint32_t evl = nf * max_elems;
775      uint32_t esz = 1 << log2_esz;
776      int mmu_index = riscv_env_mmu_index(env, false);
777  
778      /* Calculate the page range of first page */
779      addr = base + (env->vstart << log2_esz);
780      page_split = -(addr | TARGET_PAGE_MASK);
781      /* Get number of elements */
782      elems = page_split / esz;
783      if (unlikely(env->vstart + elems >= evl)) {
784          elems = evl - env->vstart;
785      }
786  
787      /* Load/store elements in the first page */
788      if (likely(elems)) {
789          vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
790                            is_load, mmu_index, ldst_tlb, ldst_host, ra);
791      }
792  
793      /* Load/store elements in the second page */
794      if (unlikely(env->vstart < evl)) {
795          /* Cross page element */
796          if (unlikely(page_split % esz)) {
797              addr = base + (env->vstart << log2_esz);
798              ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
799              env->vstart++;
800          }
801  
802          addr = base + (env->vstart << log2_esz);
803          /* Get number of elements of second page */
804          elems = evl - env->vstart;
805  
806          /* Load/store elements in the second page */
807          vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
808                            is_load, mmu_index, ldst_tlb, ldst_host, ra);
809      }
810  
811      env->vstart = 0;
812  }
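
/*
 * Example: vl2re32_v with VLEN = 128 (vlenb = 16) has max_elems =
 * 16 >> 2 = 4 and evl = nf * max_elems = 8, i.e. it always transfers
 * two whole vector registers' worth of 32-bit elements, independent
 * of vl.
 */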
813  
814  #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
815  void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
816                    uint32_t desc)                                    \
817  {                                                                   \
818      vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
819                      ctzl(sizeof(ETYPE)), GETPC(), true);            \
820  }
821  
822  GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
823  GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
824  GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
825  GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
826  GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
827  GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
828  GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
829  GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
830  GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
831  GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
832  GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
833  GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
834  GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
835  GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
836  GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
837  GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
838  
839  #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
840  void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
841                    uint32_t desc)                                        \
842  {                                                                       \
843      vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
844                      ctzl(sizeof(ETYPE)), GETPC(), false);               \
845  }
846  
847  GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
848  GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
849  GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
850  GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
851  
852  /*
853   * Vector Integer Arithmetic Instructions
854   */
855  
856  /* (TD, T1, T2, TX1, TX2) */
857  #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
858  #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
859  #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
860  #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
861  #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
862  #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
863  #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
864  #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
865  #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
866  #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
867  #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
868  #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
869  #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
870  #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
871  #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
872  #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
873  #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
874  #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
875  #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
876  #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
877  #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
878  #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
879  #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
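
/*
 * In the (TD, T1, T2, TX1, TX2) tuples above, TD is the destination
 * element type, T1/T2 are the types the two source operands are read
 * as, and TX1/TX2 are the types they are converted to before the
 * operation is applied.  For example, WOP_SSU_B (int16_t, int8_t,
 * uint8_t, int16_t, uint16_t) describes a widening op where one source
 * is sign-extended from 8 to 16 bits, the other is zero-extended, and
 * the result is a 16-bit signed element.
 */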
880  
881  #define DO_SUB(N, M) (N - M)
882  #define DO_RSUB(N, M) (M - N)
883  
884  RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
885  RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
886  RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
887  RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
888  RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
889  RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
890  RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
891  RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
892  
893  GEN_VEXT_VV(vadd_vv_b, 1)
894  GEN_VEXT_VV(vadd_vv_h, 2)
895  GEN_VEXT_VV(vadd_vv_w, 4)
896  GEN_VEXT_VV(vadd_vv_d, 8)
897  GEN_VEXT_VV(vsub_vv_b, 1)
898  GEN_VEXT_VV(vsub_vv_h, 2)
899  GEN_VEXT_VV(vsub_vv_w, 4)
900  GEN_VEXT_VV(vsub_vv_d, 8)
901  
902  
903  RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
904  RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
905  RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
906  RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
907  RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
908  RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
909  RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
910  RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
911  RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
912  RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
913  RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
914  RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
915  
916  GEN_VEXT_VX(vadd_vx_b, 1)
917  GEN_VEXT_VX(vadd_vx_h, 2)
918  GEN_VEXT_VX(vadd_vx_w, 4)
919  GEN_VEXT_VX(vadd_vx_d, 8)
920  GEN_VEXT_VX(vsub_vx_b, 1)
921  GEN_VEXT_VX(vsub_vx_h, 2)
922  GEN_VEXT_VX(vsub_vx_w, 4)
923  GEN_VEXT_VX(vsub_vx_d, 8)
924  GEN_VEXT_VX(vrsub_vx_b, 1)
925  GEN_VEXT_VX(vrsub_vx_h, 2)
926  GEN_VEXT_VX(vrsub_vx_w, 4)
927  GEN_VEXT_VX(vrsub_vx_d, 8)
928  
929  void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
930  {
931      intptr_t oprsz = simd_oprsz(desc);
932      intptr_t i;
933  
934      for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
935          *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
936      }
937  }
938  
939  void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
940  {
941      intptr_t oprsz = simd_oprsz(desc);
942      intptr_t i;
943  
944      for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
945          *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
946      }
947  }
948  
949  void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
950  {
951      intptr_t oprsz = simd_oprsz(desc);
952      intptr_t i;
953  
954      for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
955          *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
956      }
957  }
958  
959  void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
960  {
961      intptr_t oprsz = simd_oprsz(desc);
962      intptr_t i;
963  
964      for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
965          *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
966      }
967  }
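
/*
 * These gvec helpers implement the scalar reverse-subtract splat: each
 * element of d becomes the scalar b, truncated to the element width,
 * minus the corresponding element of a; e.g. vec_rsubs8 with b = 0x105
 * uses only the low byte 0x05.
 */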
968  
969  /* Vector Widening Integer Add/Subtract */
970  #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
971  #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
972  #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
973  #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
974  #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
975  #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
976  #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
977  #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
978  #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
979  #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
980  #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
981  #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
982  RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
983  RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
984  RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
985  RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
986  RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
987  RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
988  RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
989  RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
990  RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
991  RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
992  RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
993  RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
994  RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
995  RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
996  RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
997  RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
998  RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
999  RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1000  RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1001  RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1002  RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1003  RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1004  RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1005  RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1006  GEN_VEXT_VV(vwaddu_vv_b, 2)
1007  GEN_VEXT_VV(vwaddu_vv_h, 4)
1008  GEN_VEXT_VV(vwaddu_vv_w, 8)
1009  GEN_VEXT_VV(vwsubu_vv_b, 2)
1010  GEN_VEXT_VV(vwsubu_vv_h, 4)
1011  GEN_VEXT_VV(vwsubu_vv_w, 8)
1012  GEN_VEXT_VV(vwadd_vv_b, 2)
1013  GEN_VEXT_VV(vwadd_vv_h, 4)
1014  GEN_VEXT_VV(vwadd_vv_w, 8)
1015  GEN_VEXT_VV(vwsub_vv_b, 2)
1016  GEN_VEXT_VV(vwsub_vv_h, 4)
1017  GEN_VEXT_VV(vwsub_vv_w, 8)
1018  GEN_VEXT_VV(vwaddu_wv_b, 2)
1019  GEN_VEXT_VV(vwaddu_wv_h, 4)
1020  GEN_VEXT_VV(vwaddu_wv_w, 8)
1021  GEN_VEXT_VV(vwsubu_wv_b, 2)
1022  GEN_VEXT_VV(vwsubu_wv_h, 4)
1023  GEN_VEXT_VV(vwsubu_wv_w, 8)
1024  GEN_VEXT_VV(vwadd_wv_b, 2)
1025  GEN_VEXT_VV(vwadd_wv_h, 4)
1026  GEN_VEXT_VV(vwadd_wv_w, 8)
1027  GEN_VEXT_VV(vwsub_wv_b, 2)
1028  GEN_VEXT_VV(vwsub_wv_h, 4)
1029  GEN_VEXT_VV(vwsub_wv_w, 8)
1030  
1031  RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1032  RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1033  RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1034  RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1035  RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1036  RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1037  RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1038  RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1039  RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1040  RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1041  RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1042  RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1043  RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1044  RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1045  RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1046  RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1047  RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1048  RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1049  RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1050  RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1051  RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1052  RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1053  RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1054  RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1055  GEN_VEXT_VX(vwaddu_vx_b, 2)
1056  GEN_VEXT_VX(vwaddu_vx_h, 4)
1057  GEN_VEXT_VX(vwaddu_vx_w, 8)
1058  GEN_VEXT_VX(vwsubu_vx_b, 2)
1059  GEN_VEXT_VX(vwsubu_vx_h, 4)
1060  GEN_VEXT_VX(vwsubu_vx_w, 8)
1061  GEN_VEXT_VX(vwadd_vx_b, 2)
1062  GEN_VEXT_VX(vwadd_vx_h, 4)
1063  GEN_VEXT_VX(vwadd_vx_w, 8)
1064  GEN_VEXT_VX(vwsub_vx_b, 2)
1065  GEN_VEXT_VX(vwsub_vx_h, 4)
1066  GEN_VEXT_VX(vwsub_vx_w, 8)
1067  GEN_VEXT_VX(vwaddu_wx_b, 2)
1068  GEN_VEXT_VX(vwaddu_wx_h, 4)
1069  GEN_VEXT_VX(vwaddu_wx_w, 8)
1070  GEN_VEXT_VX(vwsubu_wx_b, 2)
1071  GEN_VEXT_VX(vwsubu_wx_h, 4)
1072  GEN_VEXT_VX(vwsubu_wx_w, 8)
1073  GEN_VEXT_VX(vwadd_wx_b, 2)
1074  GEN_VEXT_VX(vwadd_wx_h, 4)
1075  GEN_VEXT_VX(vwadd_wx_w, 8)
1076  GEN_VEXT_VX(vwsub_wx_b, 2)
1077  GEN_VEXT_VX(vwsub_wx_h, 4)
1078  GEN_VEXT_VX(vwsub_wx_w, 8)
1079  
1080  /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1081  #define DO_VADC(N, M, C) (N + M + C)
1082  #define DO_VSBC(N, M, C) (N - M - C)
1083  
1084  #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1085  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1086                    CPURISCVState *env, uint32_t desc)          \
1087  {                                                             \
1088      uint32_t vl = env->vl;                                    \
1089      uint32_t esz = sizeof(ETYPE);                             \
1090      uint32_t total_elems =                                    \
1091          vext_get_total_elems(env, desc, esz);                 \
1092      uint32_t vta = vext_vta(desc);                            \
1093      uint32_t i;                                               \
1094                                                                \
1095      VSTART_CHECK_EARLY_EXIT(env);                             \
1096                                                                \
1097      for (i = env->vstart; i < vl; i++) {                      \
1098          ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1099          ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1100          ETYPE carry = vext_elem_mask(v0, i);                  \
1101                                                                \
1102          *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1103      }                                                         \
1104      env->vstart = 0;                                          \
1105      /* set tail elements to 1s */                             \
1106      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1107  }
1108  
1109  GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1110  GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1111  GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1112  GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1113  
1114  GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1115  GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1116  GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1117  GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
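
/*
 * Example: vadc.vvm with uint8_t elements computes vd[i] =
 * vs2[i] + vs1[i] + v0.mask[i] modulo 256, so 250 + 10 + 1 wraps to 5;
 * the corresponding carry-out is produced separately by the vmadc
 * helpers below.
 */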
1118  
1119  #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1120  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1121                    CPURISCVState *env, uint32_t desc)                     \
1122  {                                                                        \
1123      uint32_t vl = env->vl;                                               \
1124      uint32_t esz = sizeof(ETYPE);                                        \
1125      uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1126      uint32_t vta = vext_vta(desc);                                       \
1127      uint32_t i;                                                          \
1128                                                                           \
1129      VSTART_CHECK_EARLY_EXIT(env);                                        \
1130                                                                           \
1131      for (i = env->vstart; i < vl; i++) {                                 \
1132          ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1133          ETYPE carry = vext_elem_mask(v0, i);                             \
1134                                                                           \
1135          *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1136      }                                                                    \
1137      env->vstart = 0;                                                     \
1138      /* set tail elements to 1s */                                        \
1139      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1140  }
1141  
1142  GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1143  GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1144  GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1145  GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1146  
1147  GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1148  GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1149  GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1150  GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1151  
1152  #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1153                            (__typeof(N))(N + M) < N)
1154  #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
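
/*
 * DO_MADC computes the carry out of N + M + C via wrap-around
 * comparison: e.g. for uint8_t, 200 + 100 wraps to 44 < 200, giving
 * carry 1, and 255 + 0 + 1 wraps to 0 <= 255, also carry 1.  DO_MSBC
 * is the analogous borrow out of N - M - C.
 */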
1155  
1156  #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1157  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1158                    CPURISCVState *env, uint32_t desc)          \
1159  {                                                             \
1160      uint32_t vl = env->vl;                                    \
1161      uint32_t vm = vext_vm(desc);                              \
1162      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1163      uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1164      uint32_t i;                                               \
1165                                                                \
1166      VSTART_CHECK_EARLY_EXIT(env);                             \
1167                                                                \
1168      for (i = env->vstart; i < vl; i++) {                      \
1169          ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1170          ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1171          ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1172          vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1173      }                                                         \
1174      env->vstart = 0;                                          \
1175      /*
1176       * mask destination registers are always tail-agnostic
1177       * set tail elements to 1s
1178       */                                                       \
1179      if (vta_all_1s) {                                         \
1180          for (; i < total_elems; i++) {                        \
1181              vext_set_elem_mask(vd, i, 1);                     \
1182          }                                                     \
1183      }                                                         \
1184  }
1185  
1186  GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1187  GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1188  GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1189  GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1190  
1191  GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1192  GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1193  GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1194  GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1195  
1196  #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1197  void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1198                    void *vs2, CPURISCVState *env, uint32_t desc) \
1199  {                                                               \
1200      uint32_t vl = env->vl;                                      \
1201      uint32_t vm = vext_vm(desc);                                \
1202      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;      \
1203      uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1204      uint32_t i;                                                 \
1205                                                                  \
1206      VSTART_CHECK_EARLY_EXIT(env);                               \
1207                                                                  \
1208      for (i = env->vstart; i < vl; i++) {                        \
1209          ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1210          ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1211          vext_set_elem_mask(vd, i,                               \
1212                  DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1213      }                                                           \
1214      env->vstart = 0;                                            \
1215      /*
1216       * mask destination register is always tail-agnostic
1217       * set tail elements to 1s
1218       */                                                         \
1219      if (vta_all_1s) {                                           \
1220          for (; i < total_elems; i++) {                          \
1221              vext_set_elem_mask(vd, i, 1);                       \
1222          }                                                       \
1223      }                                                           \
1224  }
1225  
1226  GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1227  GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1228  GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1229  GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1230  
1231  GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1232  GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1233  GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1234  GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1235  
1236  /* Vector Bitwise Logical Instructions */
1237  RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1238  RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1239  RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1240  RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1241  RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1242  RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1243  RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1244  RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1245  RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1246  RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1247  RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1248  RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1249  GEN_VEXT_VV(vand_vv_b, 1)
1250  GEN_VEXT_VV(vand_vv_h, 2)
1251  GEN_VEXT_VV(vand_vv_w, 4)
1252  GEN_VEXT_VV(vand_vv_d, 8)
1253  GEN_VEXT_VV(vor_vv_b, 1)
1254  GEN_VEXT_VV(vor_vv_h, 2)
1255  GEN_VEXT_VV(vor_vv_w, 4)
1256  GEN_VEXT_VV(vor_vv_d, 8)
1257  GEN_VEXT_VV(vxor_vv_b, 1)
1258  GEN_VEXT_VV(vxor_vv_h, 2)
1259  GEN_VEXT_VV(vxor_vv_w, 4)
1260  GEN_VEXT_VV(vxor_vv_d, 8)
1261  
1262  RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1263  RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1264  RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1265  RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1266  RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1267  RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1268  RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1269  RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1270  RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1271  RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1272  RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1273  RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1274  GEN_VEXT_VX(vand_vx_b, 1)
1275  GEN_VEXT_VX(vand_vx_h, 2)
1276  GEN_VEXT_VX(vand_vx_w, 4)
1277  GEN_VEXT_VX(vand_vx_d, 8)
1278  GEN_VEXT_VX(vor_vx_b, 1)
1279  GEN_VEXT_VX(vor_vx_h, 2)
1280  GEN_VEXT_VX(vor_vx_w, 4)
1281  GEN_VEXT_VX(vor_vx_d, 8)
1282  GEN_VEXT_VX(vxor_vx_b, 1)
1283  GEN_VEXT_VX(vxor_vx_h, 2)
1284  GEN_VEXT_VX(vxor_vx_w, 4)
1285  GEN_VEXT_VX(vxor_vx_d, 8)
1286  
1287  /* Vector Single-Width Bit Shift Instructions */
1288  #define DO_SLL(N, M)  (N << (M))
1289  #define DO_SRL(N, M)  (N >> (M))
1290  
1291  /* generate the helpers for shift instructions with two vector operands */
1292  #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1293  void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1294                    void *vs2, CPURISCVState *env, uint32_t desc)           \
1295  {                                                                         \
1296      uint32_t vm = vext_vm(desc);                                          \
1297      uint32_t vl = env->vl;                                                \
1298      uint32_t esz = sizeof(TS1);                                           \
1299      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1300      uint32_t vta = vext_vta(desc);                                        \
1301      uint32_t vma = vext_vma(desc);                                        \
1302      uint32_t i;                                                           \
1303                                                                            \
1304      VSTART_CHECK_EARLY_EXIT(env);                                         \
1305                                                                            \
1306      for (i = env->vstart; i < vl; i++) {                                  \
1307          if (!vm && !vext_elem_mask(v0, i)) {                              \
1308              /* set masked-off elements to 1s */                           \
1309              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1310              continue;                                                     \
1311          }                                                                 \
1312          TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1313          TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1314          *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1315      }                                                                     \
1316      env->vstart = 0;                                                      \
1317      /* set tail elements to 1s */                                         \
1318      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1319  }
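/*
 * Note on the MASK arguments below: per the RVV spec only the low
 * log2(SEW) bits of the shift-amount operand are used, so every
 * instantiation passes one less than the bit width of the value being
 * shifted (0x7 for 8-bit sources up to 0x3f for 64-bit sources).  The
 * masking also keeps the host C shift well defined.  A small sketch with
 * illustrative 8-bit values:
 *
 *   s2 = 0x80, s1 = 9:  DO_SRL(0x80, 9 & 0x7) = 0x80 >> 1 = 0x40
 */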
1320  
1321  GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1322  GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1323  GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1324  GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1325  
1326  GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1327  GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1328  GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1329  GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1330  
1331  GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1332  GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1333  GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1334  GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1335  
1336  /*
1337   * generate the helpers for shift instructions with one vector and one scalar
1338   */
1339  #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1340  void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1341                    void *vs2, CPURISCVState *env,            \
1342                    uint32_t desc)                            \
1343  {                                                           \
1344      uint32_t vm = vext_vm(desc);                            \
1345      uint32_t vl = env->vl;                                  \
1346      uint32_t esz = sizeof(TD);                              \
1347      uint32_t total_elems =                                  \
1348          vext_get_total_elems(env, desc, esz);               \
1349      uint32_t vta = vext_vta(desc);                          \
1350      uint32_t vma = vext_vma(desc);                          \
1351      uint32_t i;                                             \
1352                                                              \
1353      VSTART_CHECK_EARLY_EXIT(env);                           \
1354                                                              \
1355      for (i = env->vstart; i < vl; i++) {                    \
1356          if (!vm && !vext_elem_mask(v0, i)) {                \
1357              /* set masked-off elements to 1s */             \
1358              vext_set_elems_1s(vd, vma, i * esz,             \
1359                                (i + 1) * esz);               \
1360              continue;                                       \
1361          }                                                   \
1362          TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1363          *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1364      }                                                       \
1365      env->vstart = 0;                                        \
1366      /* set tail elements to 1s */                           \
1367      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1368  }
1369  
1370  GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1371  GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1372  GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1373  GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1374  
1375  GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1376  GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1377  GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1378  GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1379  
1380  GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1381  GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1382  GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1383  GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1384  
1385  /* Vector Narrowing Integer Right Shift Instructions */
1386  GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1387  GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1388  GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1389  GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1390  GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1391  GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1392  GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1393  GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1394  GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1395  GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1396  GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1397  GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1398  
1399  /* Vector Integer Comparison Instructions */
1400  #define DO_MSEQ(N, M) (N == M)
1401  #define DO_MSNE(N, M) (N != M)
1402  #define DO_MSLT(N, M) (N < M)
1403  #define DO_MSLE(N, M) (N <= M)
1404  #define DO_MSGT(N, M) (N > M)
1405  
1406  #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1407  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1408                    CPURISCVState *env, uint32_t desc)          \
1409  {                                                             \
1410      uint32_t vm = vext_vm(desc);                              \
1411      uint32_t vl = env->vl;                                    \
1412      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
1413      uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1414      uint32_t vma = vext_vma(desc);                            \
1415      uint32_t i;                                               \
1416                                                                \
1417      VSTART_CHECK_EARLY_EXIT(env);                             \
1418                                                                \
1419      for (i = env->vstart; i < vl; i++) {                      \
1420          ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1421          ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1422          if (!vm && !vext_elem_mask(v0, i)) {                  \
1423              /* set masked-off elements to 1s */               \
1424              if (vma) {                                        \
1425                  vext_set_elem_mask(vd, i, 1);                 \
1426              }                                                 \
1427              continue;                                         \
1428          }                                                     \
1429          vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1430      }                                                         \
1431      env->vstart = 0;                                          \
1432      /*
1433       * mask destination register is always tail-agnostic
1434       * set tail elements to 1s
1435       */                                                       \
1436      if (vta_all_1s) {                                         \
1437          for (; i < total_elems; i++) {                        \
1438              vext_set_elem_mask(vd, i, 1);                     \
1439          }                                                     \
1440      }                                                         \
1441  }
1442  
1443  GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1444  GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1445  GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1446  GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1447  
1448  GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1449  GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1450  GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1451  GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1452  
1453  GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1454  GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1455  GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1456  GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1457  
1458  GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1459  GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1460  GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1461  GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1462  
1463  GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1464  GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1465  GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1466  GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1467  
1468  GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1469  GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1470  GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1471  GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1472  
1473  #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1474  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1475                    CPURISCVState *env, uint32_t desc)                \
1476  {                                                                   \
1477      uint32_t vm = vext_vm(desc);                                    \
1478      uint32_t vl = env->vl;                                          \
1479      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
1480      uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1481      uint32_t vma = vext_vma(desc);                                  \
1482      uint32_t i;                                                     \
1483                                                                      \
1484      VSTART_CHECK_EARLY_EXIT(env);                                   \
1485                                                                      \
1486      for (i = env->vstart; i < vl; i++) {                            \
1487          ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1488          if (!vm && !vext_elem_mask(v0, i)) {                        \
1489              /* set masked-off elements to 1s */                     \
1490              if (vma) {                                              \
1491                  vext_set_elem_mask(vd, i, 1);                       \
1492              }                                                       \
1493              continue;                                               \
1494          }                                                           \
1495          vext_set_elem_mask(vd, i,                                   \
1496                  DO_OP(s2, (ETYPE)(target_long)s1));                 \
1497      }                                                               \
1498      env->vstart = 0;                                                \
1499      /*
1500       * mask destination register is always tail-agnostic
1501       * set tail elements to 1s
1502       */                                                             \
1503      if (vta_all_1s) {                                               \
1504          for (; i < total_elems; i++) {                              \
1505              vext_set_elem_mask(vd, i, 1);                           \
1506          }                                                           \
1507      }                                                               \
1508  }
1509  
1510  GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1511  GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1512  GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1513  GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1514  
1515  GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1516  GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1517  GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1518  GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1519  
1520  GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1521  GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1522  GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1523  GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1524  
1525  GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1526  GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1527  GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1528  GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1529  
1530  GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1531  GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1532  GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1533  GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1534  
1535  GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1536  GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1537  GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1538  GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1539  
1540  GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1541  GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1542  GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1543  GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1544  
1545  GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1546  GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1547  GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1548  GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1549  
1550  /* Vector Integer Min/Max Instructions */
1551  RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1552  RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1553  RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1554  RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1555  RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1556  RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1557  RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1558  RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1559  RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1560  RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1561  RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1562  RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1563  RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1564  RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1565  RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1566  RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1567  GEN_VEXT_VV(vminu_vv_b, 1)
1568  GEN_VEXT_VV(vminu_vv_h, 2)
1569  GEN_VEXT_VV(vminu_vv_w, 4)
1570  GEN_VEXT_VV(vminu_vv_d, 8)
1571  GEN_VEXT_VV(vmin_vv_b, 1)
1572  GEN_VEXT_VV(vmin_vv_h, 2)
1573  GEN_VEXT_VV(vmin_vv_w, 4)
1574  GEN_VEXT_VV(vmin_vv_d, 8)
1575  GEN_VEXT_VV(vmaxu_vv_b, 1)
1576  GEN_VEXT_VV(vmaxu_vv_h, 2)
1577  GEN_VEXT_VV(vmaxu_vv_w, 4)
1578  GEN_VEXT_VV(vmaxu_vv_d, 8)
1579  GEN_VEXT_VV(vmax_vv_b, 1)
1580  GEN_VEXT_VV(vmax_vv_h, 2)
1581  GEN_VEXT_VV(vmax_vv_w, 4)
1582  GEN_VEXT_VV(vmax_vv_d, 8)
1583  
1584  RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1585  RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1586  RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1587  RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1588  RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1589  RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1590  RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1591  RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1592  RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1593  RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1594  RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1595  RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1596  RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1597  RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1598  RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1599  RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1600  GEN_VEXT_VX(vminu_vx_b, 1)
1601  GEN_VEXT_VX(vminu_vx_h, 2)
1602  GEN_VEXT_VX(vminu_vx_w, 4)
1603  GEN_VEXT_VX(vminu_vx_d, 8)
1604  GEN_VEXT_VX(vmin_vx_b, 1)
1605  GEN_VEXT_VX(vmin_vx_h, 2)
1606  GEN_VEXT_VX(vmin_vx_w, 4)
1607  GEN_VEXT_VX(vmin_vx_d, 8)
1608  GEN_VEXT_VX(vmaxu_vx_b, 1)
1609  GEN_VEXT_VX(vmaxu_vx_h, 2)
1610  GEN_VEXT_VX(vmaxu_vx_w, 4)
1611  GEN_VEXT_VX(vmaxu_vx_d, 8)
1612  GEN_VEXT_VX(vmax_vx_b, 1)
1613  GEN_VEXT_VX(vmax_vx_h, 2)
1614  GEN_VEXT_VX(vmax_vx_w, 4)
1615  GEN_VEXT_VX(vmax_vx_d, 8)
1616  
1617  /* Vector Single-Width Integer Multiply Instructions */
1618  #define DO_MUL(N, M) (N * M)
1619  RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1620  RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1621  RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1622  RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1623  GEN_VEXT_VV(vmul_vv_b, 1)
1624  GEN_VEXT_VV(vmul_vv_h, 2)
1625  GEN_VEXT_VV(vmul_vv_w, 4)
1626  GEN_VEXT_VV(vmul_vv_d, 8)
1627  
1628  static int8_t do_mulh_b(int8_t s2, int8_t s1)
1629  {
1630      return (int16_t)s2 * (int16_t)s1 >> 8;
1631  }
1632  
1633  static int16_t do_mulh_h(int16_t s2, int16_t s1)
1634  {
1635      return (int32_t)s2 * (int32_t)s1 >> 16;
1636  }
1637  
1638  static int32_t do_mulh_w(int32_t s2, int32_t s1)
1639  {
1640      return (int64_t)s2 * (int64_t)s1 >> 32;
1641  }
1642  
1643  static int64_t do_mulh_d(int64_t s2, int64_t s1)
1644  {
1645      uint64_t hi_64, lo_64;
1646  
1647      muls64(&lo_64, &hi_64, s1, s2);
1648      return hi_64;
1649  }
1650  
1651  static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1652  {
1653      return (uint16_t)s2 * (uint16_t)s1 >> 8;
1654  }
1655  
1656  static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1657  {
1658      return (uint32_t)s2 * (uint32_t)s1 >> 16;
1659  }
1660  
1661  static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1662  {
1663      return (uint64_t)s2 * (uint64_t)s1 >> 32;
1664  }
1665  
1666  static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1667  {
1668      uint64_t hi_64, lo_64;
1669  
1670      mulu64(&lo_64, &hi_64, s2, s1);
1671      return hi_64;
1672  }
1673  
1674  static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1675  {
1676      return (int16_t)s2 * (uint16_t)s1 >> 8;
1677  }
1678  
1679  static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1680  {
1681      return (int32_t)s2 * (uint32_t)s1 >> 16;
1682  }
1683  
1684  static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1685  {
1686      return (int64_t)s2 * (uint64_t)s1 >> 32;
1687  }
1688  
1689  /*
1690   * Let  A = signed operand,
1691   *      B = unsigned operand
1692   *      P = mulu64(A, B), unsigned product
1693   *
1694   * LET  X = 2 ** 64  - A, 2's complement of A
1695   *      SP = signed product
1696   * THEN
1697   *      IF A < 0
1698   *          SP = -X * B
1699   *             = -(2 ** 64 - A) * B
1700   *             = A * B - 2 ** 64 * B
1701   *             = P - 2 ** 64 * B
1702   *      ELSE
1703   *          SP = P
1704   * THEN
1705   *      HI_P -= (A < 0 ? B : 0)
1706   */
1707  
1708  static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1709  {
1710      uint64_t hi_64, lo_64;
1711  
1712      mulu64(&lo_64, &hi_64, s2, s1);
1713  
1714      hi_64 -= s2 < 0 ? s1 : 0;
1715      return hi_64;
1716  }
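/*
 * A worked example of the correction above (illustrative values only):
 * s2 = -1 (bit pattern 0xffff_ffff_ffff_ffff) and s1 = 2 make mulu64()
 * return the unsigned product 2 * (2^64 - 1) = 2^65 - 2, i.e. hi_64 = 1.
 * Since s2 < 0 we subtract s1, giving hi_64 = -1, which is the correct
 * high half of the signed-by-unsigned product -2.
 */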
1717  
1718  RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1719  RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1720  RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1721  RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1722  RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1723  RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1724  RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1725  RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1726  RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1727  RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1728  RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1729  RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1730  GEN_VEXT_VV(vmulh_vv_b, 1)
1731  GEN_VEXT_VV(vmulh_vv_h, 2)
1732  GEN_VEXT_VV(vmulh_vv_w, 4)
1733  GEN_VEXT_VV(vmulh_vv_d, 8)
1734  GEN_VEXT_VV(vmulhu_vv_b, 1)
1735  GEN_VEXT_VV(vmulhu_vv_h, 2)
1736  GEN_VEXT_VV(vmulhu_vv_w, 4)
1737  GEN_VEXT_VV(vmulhu_vv_d, 8)
1738  GEN_VEXT_VV(vmulhsu_vv_b, 1)
1739  GEN_VEXT_VV(vmulhsu_vv_h, 2)
1740  GEN_VEXT_VV(vmulhsu_vv_w, 4)
1741  GEN_VEXT_VV(vmulhsu_vv_d, 8)
1742  
1743  RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1744  RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1745  RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1746  RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1747  RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1748  RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1749  RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1750  RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1751  RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1752  RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1753  RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1754  RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1755  RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1756  RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1757  RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1758  RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1759  GEN_VEXT_VX(vmul_vx_b, 1)
1760  GEN_VEXT_VX(vmul_vx_h, 2)
1761  GEN_VEXT_VX(vmul_vx_w, 4)
1762  GEN_VEXT_VX(vmul_vx_d, 8)
1763  GEN_VEXT_VX(vmulh_vx_b, 1)
1764  GEN_VEXT_VX(vmulh_vx_h, 2)
1765  GEN_VEXT_VX(vmulh_vx_w, 4)
1766  GEN_VEXT_VX(vmulh_vx_d, 8)
1767  GEN_VEXT_VX(vmulhu_vx_b, 1)
1768  GEN_VEXT_VX(vmulhu_vx_h, 2)
1769  GEN_VEXT_VX(vmulhu_vx_w, 4)
1770  GEN_VEXT_VX(vmulhu_vx_d, 8)
1771  GEN_VEXT_VX(vmulhsu_vx_b, 1)
1772  GEN_VEXT_VX(vmulhsu_vx_h, 2)
1773  GEN_VEXT_VX(vmulhsu_vx_w, 4)
1774  GEN_VEXT_VX(vmulhsu_vx_d, 8)
1775  
1776  /* Vector Integer Divide Instructions */
1777  #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1778  #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1779  #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1780          unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1781  #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1782          unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
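/*
 * These macros implement the RISC-V division semantics: dividing by zero
 * yields an all-ones quotient (-1) and leaves the dividend as remainder,
 * while signed overflow (most-negative value divided by -1, checked via
 * N == -N above) yields the dividend as quotient with remainder 0.
 * A brief sketch of the resulting e8 element values (examples only):
 *
 *   vdiv:  -128 / -1  ->  quotient -128, remainder 0
 *   vdivu:   42 /  0  ->  quotient 0xff, remainder 42
 */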
1783  
1784  RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1785  RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1786  RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1787  RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1788  RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1789  RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1790  RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1791  RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1792  RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1793  RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1794  RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1795  RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1796  RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1797  RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1798  RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1799  RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1800  GEN_VEXT_VV(vdivu_vv_b, 1)
1801  GEN_VEXT_VV(vdivu_vv_h, 2)
1802  GEN_VEXT_VV(vdivu_vv_w, 4)
1803  GEN_VEXT_VV(vdivu_vv_d, 8)
1804  GEN_VEXT_VV(vdiv_vv_b, 1)
1805  GEN_VEXT_VV(vdiv_vv_h, 2)
1806  GEN_VEXT_VV(vdiv_vv_w, 4)
1807  GEN_VEXT_VV(vdiv_vv_d, 8)
1808  GEN_VEXT_VV(vremu_vv_b, 1)
1809  GEN_VEXT_VV(vremu_vv_h, 2)
1810  GEN_VEXT_VV(vremu_vv_w, 4)
1811  GEN_VEXT_VV(vremu_vv_d, 8)
1812  GEN_VEXT_VV(vrem_vv_b, 1)
1813  GEN_VEXT_VV(vrem_vv_h, 2)
1814  GEN_VEXT_VV(vrem_vv_w, 4)
1815  GEN_VEXT_VV(vrem_vv_d, 8)
1816  
1817  RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1818  RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1819  RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1820  RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1821  RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1822  RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1823  RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1824  RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1825  RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1826  RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1827  RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1828  RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1829  RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1830  RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1831  RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1832  RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1833  GEN_VEXT_VX(vdivu_vx_b, 1)
1834  GEN_VEXT_VX(vdivu_vx_h, 2)
1835  GEN_VEXT_VX(vdivu_vx_w, 4)
1836  GEN_VEXT_VX(vdivu_vx_d, 8)
1837  GEN_VEXT_VX(vdiv_vx_b, 1)
1838  GEN_VEXT_VX(vdiv_vx_h, 2)
1839  GEN_VEXT_VX(vdiv_vx_w, 4)
1840  GEN_VEXT_VX(vdiv_vx_d, 8)
1841  GEN_VEXT_VX(vremu_vx_b, 1)
1842  GEN_VEXT_VX(vremu_vx_h, 2)
1843  GEN_VEXT_VX(vremu_vx_w, 4)
1844  GEN_VEXT_VX(vremu_vx_d, 8)
1845  GEN_VEXT_VX(vrem_vx_b, 1)
1846  GEN_VEXT_VX(vrem_vx_h, 2)
1847  GEN_VEXT_VX(vrem_vx_w, 4)
1848  GEN_VEXT_VX(vrem_vx_d, 8)
1849  
1850  /* Vector Widening Integer Multiply Instructions */
1851  RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1852  RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1853  RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1854  RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1855  RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1856  RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1857  RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1858  RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1859  RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1860  GEN_VEXT_VV(vwmul_vv_b, 2)
1861  GEN_VEXT_VV(vwmul_vv_h, 4)
1862  GEN_VEXT_VV(vwmul_vv_w, 8)
1863  GEN_VEXT_VV(vwmulu_vv_b, 2)
1864  GEN_VEXT_VV(vwmulu_vv_h, 4)
1865  GEN_VEXT_VV(vwmulu_vv_w, 8)
1866  GEN_VEXT_VV(vwmulsu_vv_b, 2)
1867  GEN_VEXT_VV(vwmulsu_vv_h, 4)
1868  GEN_VEXT_VV(vwmulsu_vv_w, 8)
1869  
1870  RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1871  RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1872  RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1873  RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1874  RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1875  RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1876  RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1877  RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1878  RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1879  GEN_VEXT_VX(vwmul_vx_b, 2)
1880  GEN_VEXT_VX(vwmul_vx_h, 4)
1881  GEN_VEXT_VX(vwmul_vx_w, 8)
1882  GEN_VEXT_VX(vwmulu_vx_b, 2)
1883  GEN_VEXT_VX(vwmulu_vx_h, 4)
1884  GEN_VEXT_VX(vwmulu_vx_w, 8)
1885  GEN_VEXT_VX(vwmulsu_vx_b, 2)
1886  GEN_VEXT_VX(vwmulsu_vx_h, 4)
1887  GEN_VEXT_VX(vwmulsu_vx_w, 8)
1888  
1889  /* Vector Single-Width Integer Multiply-Add Instructions */
1890  #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1891  static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1892  {                                                                  \
1893      TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1894      TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1895      TD d = *((TD *)vd + HD(i));                                    \
1896      *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1897  }
1898  
1899  #define DO_MACC(N, M, D) (M * N + D)
1900  #define DO_NMSAC(N, M, D) (-(M * N) + D)
1901  #define DO_MADD(N, M, D) (M * D + N)
1902  #define DO_NMSUB(N, M, D) (-(M * D) + N)
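/*
 * Operand roles as the macros are invoked with OP(s2, s1, d):
 * vmacc/vnmsac accumulate +/-(vs1 * vs2) into the old vd, while
 * vmadd/vnmsub multiply the old vd by vs1 and add/subtract vs2.
 * With example values d = 3, s1 = 4, s2 = 5:
 *
 *   DO_MACC(s2, s1, d) = 4 * 5 + 3 = 23    (vd[i] = vs1 * vs2 + vd[i])
 *   DO_MADD(s2, s1, d) = 4 * 3 + 5 = 17    (vd[i] = vs1 * vd[i] + vs2)
 */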
1903  RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1904  RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1905  RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1906  RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1907  RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1908  RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1909  RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1910  RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1911  RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1912  RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1913  RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1914  RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1915  RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1916  RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1917  RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1918  RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1919  GEN_VEXT_VV(vmacc_vv_b, 1)
1920  GEN_VEXT_VV(vmacc_vv_h, 2)
1921  GEN_VEXT_VV(vmacc_vv_w, 4)
1922  GEN_VEXT_VV(vmacc_vv_d, 8)
1923  GEN_VEXT_VV(vnmsac_vv_b, 1)
1924  GEN_VEXT_VV(vnmsac_vv_h, 2)
1925  GEN_VEXT_VV(vnmsac_vv_w, 4)
1926  GEN_VEXT_VV(vnmsac_vv_d, 8)
1927  GEN_VEXT_VV(vmadd_vv_b, 1)
1928  GEN_VEXT_VV(vmadd_vv_h, 2)
1929  GEN_VEXT_VV(vmadd_vv_w, 4)
1930  GEN_VEXT_VV(vmadd_vv_d, 8)
1931  GEN_VEXT_VV(vnmsub_vv_b, 1)
1932  GEN_VEXT_VV(vnmsub_vv_h, 2)
1933  GEN_VEXT_VV(vnmsub_vv_w, 4)
1934  GEN_VEXT_VV(vnmsub_vv_d, 8)
1935  
1936  #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1937  static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1938  {                                                                   \
1939      TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1940      TD d = *((TD *)vd + HD(i));                                     \
1941      *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1942  }
1943  
1944  RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1945  RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1946  RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1947  RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1948  RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1949  RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1950  RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1951  RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1952  RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1953  RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1954  RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1955  RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1956  RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1957  RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1958  RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1959  RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1960  GEN_VEXT_VX(vmacc_vx_b, 1)
1961  GEN_VEXT_VX(vmacc_vx_h, 2)
1962  GEN_VEXT_VX(vmacc_vx_w, 4)
1963  GEN_VEXT_VX(vmacc_vx_d, 8)
1964  GEN_VEXT_VX(vnmsac_vx_b, 1)
1965  GEN_VEXT_VX(vnmsac_vx_h, 2)
1966  GEN_VEXT_VX(vnmsac_vx_w, 4)
1967  GEN_VEXT_VX(vnmsac_vx_d, 8)
1968  GEN_VEXT_VX(vmadd_vx_b, 1)
1969  GEN_VEXT_VX(vmadd_vx_h, 2)
1970  GEN_VEXT_VX(vmadd_vx_w, 4)
1971  GEN_VEXT_VX(vmadd_vx_d, 8)
1972  GEN_VEXT_VX(vnmsub_vx_b, 1)
1973  GEN_VEXT_VX(vnmsub_vx_h, 2)
1974  GEN_VEXT_VX(vnmsub_vx_w, 4)
1975  GEN_VEXT_VX(vnmsub_vx_d, 8)
1976  
1977  /* Vector Widening Integer Multiply-Add Instructions */
1978  RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1979  RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1980  RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1981  RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1982  RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1983  RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1984  RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1985  RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1986  RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1987  GEN_VEXT_VV(vwmaccu_vv_b, 2)
1988  GEN_VEXT_VV(vwmaccu_vv_h, 4)
1989  GEN_VEXT_VV(vwmaccu_vv_w, 8)
1990  GEN_VEXT_VV(vwmacc_vv_b, 2)
1991  GEN_VEXT_VV(vwmacc_vv_h, 4)
1992  GEN_VEXT_VV(vwmacc_vv_w, 8)
1993  GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1994  GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1995  GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1996  
1997  RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1998  RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1999  RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
2000  RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
2001  RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
2002  RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
2003  RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
2004  RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
2005  RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
2006  RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
2007  RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
2008  RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
2009  GEN_VEXT_VX(vwmaccu_vx_b, 2)
2010  GEN_VEXT_VX(vwmaccu_vx_h, 4)
2011  GEN_VEXT_VX(vwmaccu_vx_w, 8)
2012  GEN_VEXT_VX(vwmacc_vx_b, 2)
2013  GEN_VEXT_VX(vwmacc_vx_h, 4)
2014  GEN_VEXT_VX(vwmacc_vx_w, 8)
2015  GEN_VEXT_VX(vwmaccsu_vx_b, 2)
2016  GEN_VEXT_VX(vwmaccsu_vx_h, 4)
2017  GEN_VEXT_VX(vwmaccsu_vx_w, 8)
2018  GEN_VEXT_VX(vwmaccus_vx_b, 2)
2019  GEN_VEXT_VX(vwmaccus_vx_h, 4)
2020  GEN_VEXT_VX(vwmaccus_vx_w, 8)
2021  
2022  /* Vector Integer Merge and Move Instructions */
2023  #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
2024  void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
2025                    uint32_t desc)                                     \
2026  {                                                                    \
2027      uint32_t vl = env->vl;                                           \
2028      uint32_t esz = sizeof(ETYPE);                                    \
2029      uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2030      uint32_t vta = vext_vta(desc);                                   \
2031      uint32_t i;                                                      \
2032                                                                       \
2033      VSTART_CHECK_EARLY_EXIT(env);                                    \
2034                                                                       \
2035      for (i = env->vstart; i < vl; i++) {                             \
2036          ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2037          *((ETYPE *)vd + H(i)) = s1;                                  \
2038      }                                                                \
2039      env->vstart = 0;                                                 \
2040      /* set tail elements to 1s */                                    \
2041      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2042  }
2043  
2044  GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2045  GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2046  GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2047  GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2048  
2049  #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2050  void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2051                    uint32_t desc)                                     \
2052  {                                                                    \
2053      uint32_t vl = env->vl;                                           \
2054      uint32_t esz = sizeof(ETYPE);                                    \
2055      uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2056      uint32_t vta = vext_vta(desc);                                   \
2057      uint32_t i;                                                      \
2058                                                                       \
2059      VSTART_CHECK_EARLY_EXIT(env);                                    \
2060                                                                       \
2061      for (i = env->vstart; i < vl; i++) {                             \
2062          *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2063      }                                                                \
2064      env->vstart = 0;                                                 \
2065      /* set tail elements to 1s */                                    \
2066      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2067  }
2068  
2069  GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2070  GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2071  GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2072  GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2073  
2074  #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2075  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2076                    CPURISCVState *env, uint32_t desc)                 \
2077  {                                                                    \
2078      uint32_t vl = env->vl;                                           \
2079      uint32_t esz = sizeof(ETYPE);                                    \
2080      uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2081      uint32_t vta = vext_vta(desc);                                   \
2082      uint32_t i;                                                      \
2083                                                                       \
2084      VSTART_CHECK_EARLY_EXIT(env);                                    \
2085                                                                       \
2086      for (i = env->vstart; i < vl; i++) {                             \
2087          ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2088          *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2089      }                                                                \
2090      env->vstart = 0;                                                 \
2091      /* set tail elements to 1s */                                    \
2092      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2093  }
2094  
2095  GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2096  GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2097  GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2098  GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2099  
2100  #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2101  void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2102                    void *vs2, CPURISCVState *env, uint32_t desc)      \
2103  {                                                                    \
2104      uint32_t vl = env->vl;                                           \
2105      uint32_t esz = sizeof(ETYPE);                                    \
2106      uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2107      uint32_t vta = vext_vta(desc);                                   \
2108      uint32_t i;                                                      \
2109                                                                       \
2110      VSTART_CHECK_EARLY_EXIT(env);                                    \
2111                                                                       \
2112      for (i = env->vstart; i < vl; i++) {                             \
2113          ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2114          ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2115                     (ETYPE)(target_long)s1);                          \
2116          *((ETYPE *)vd + H(i)) = d;                                   \
2117      }                                                                \
2118      env->vstart = 0;                                                 \
2119      /* set tail elements to 1s */                                    \
2120      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2121  }
2122  
2123  GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2124  GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2125  GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2126  GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2127  
2128  /*
2129   * Vector Fixed-Point Arithmetic Instructions
2130   */
2131  
2132  /* Vector Single-Width Saturating Add and Subtract */
2133  
2134  /*
2135   * As fixed-point instructions generally have a rounding mode and saturation,
2136   * define common macros for fixed point here.
2137   */
2138  typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2139                            CPURISCVState *env, int vxrm);
2140  
2141  #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2142  static inline void                                                  \
2143  do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2144            CPURISCVState *env, int vxrm)                             \
2145  {                                                                   \
2146      TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2147      TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2148      *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2149  }
2150  
2151  static inline void
2152  vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2153               CPURISCVState *env,
2154               uint32_t vl, uint32_t vm, int vxrm,
2155               opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2156  {
2157      VSTART_CHECK_EARLY_EXIT(env);
2158  
2159      for (uint32_t i = env->vstart; i < vl; i++) {
2160          if (!vm && !vext_elem_mask(v0, i)) {
2161              /* set masked-off elements to 1s */
2162              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2163              continue;
2164          }
2165          fn(vd, vs1, vs2, i, env, vxrm);
2166      }
2167      env->vstart = 0;
2168  }
2169  
2170  static inline void
2171  vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2172               CPURISCVState *env,
2173               uint32_t desc,
2174               opivv2_rm_fn *fn, uint32_t esz)
2175  {
2176      uint32_t vm = vext_vm(desc);
2177      uint32_t vl = env->vl;
2178      uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2179      uint32_t vta = vext_vta(desc);
2180      uint32_t vma = vext_vma(desc);
2181  
2182      switch (env->vxrm) {
2183      case 0: /* rnu */
2184          vext_vv_rm_1(vd, v0, vs1, vs2,
2185                       env, vl, vm, 0, fn, vma, esz);
2186          break;
2187      case 1: /* rne */
2188          vext_vv_rm_1(vd, v0, vs1, vs2,
2189                       env, vl, vm, 1, fn, vma, esz);
2190          break;
2191      case 2: /* rdn */
2192          vext_vv_rm_1(vd, v0, vs1, vs2,
2193                       env, vl, vm, 2, fn, vma, esz);
2194          break;
2195      default: /* rod */
2196          vext_vv_rm_1(vd, v0, vs1, vs2,
2197                       env, vl, vm, 3, fn, vma, esz);
2198          break;
2199      }
2200      /* set tail elements to 1s */
2201      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2202  }
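/*
 * The vxrm values dispatched above are the RVV fixed-point rounding modes:
 * 0 = rnu (round-to-nearest-up), 1 = rne (round-to-nearest-even),
 * 2 = rdn (round-down, i.e. truncate), 3 = rod (round-to-odd).
 * As an illustration, discarding one low bit of the value 5 (0b101)
 * rounds to 3 (rnu), 2 (rne), 2 (rdn) and 3 (rod).
 */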
2203  
2204  /* generate helpers for fixed point instructions with OPIVV format */
2205  #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2206  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2207                    CPURISCVState *env, uint32_t desc)            \
2208  {                                                               \
2209      vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2210                   do_##NAME, ESZ);                               \
2211  }
2212  
2213  static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2214                               uint8_t b)
2215  {
2216      uint8_t res = a + b;
2217      if (res < a) {
2218          res = UINT8_MAX;
2219          env->vxsat = 0x1;
2220      }
2221      return res;
2222  }
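/*
 * The `res < a` test relies on modulo-2^N unsigned arithmetic: the sum
 * wraps around exactly when it overflows.  For example (8-bit,
 * illustrative values), a = 250 and b = 10 give res = 4 < 250, so the
 * result saturates to UINT8_MAX and vxsat is set.
 */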
2223  
2224  static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2225                                 uint16_t b)
2226  {
2227      uint16_t res = a + b;
2228      if (res < a) {
2229          res = UINT16_MAX;
2230          env->vxsat = 0x1;
2231      }
2232      return res;
2233  }
2234  
2235  static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2236                                 uint32_t b)
2237  {
2238      uint32_t res = a + b;
2239      if (res < a) {
2240          res = UINT32_MAX;
2241          env->vxsat = 0x1;
2242      }
2243      return res;
2244  }
2245  
2246  static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2247                                 uint64_t b)
2248  {
2249      uint64_t res = a + b;
2250      if (res < a) {
2251          res = UINT64_MAX;
2252          env->vxsat = 0x1;
2253      }
2254      return res;
2255  }
2256  
2257  RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2258  RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2259  RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2260  RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2261  GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2262  GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2263  GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2264  GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2265  
2266  typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2267                            CPURISCVState *env, int vxrm);
2268  
2269  #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2270  static inline void                                                  \
2271  do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2272            CPURISCVState *env, int vxrm)                             \
2273  {                                                                   \
2274      TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2275      *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2276  }
2277  
2278  static inline void
2279  vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2280               CPURISCVState *env,
2281               uint32_t vl, uint32_t vm, int vxrm,
2282               opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2283  {
2284      VSTART_CHECK_EARLY_EXIT(env);
2285  
2286      for (uint32_t i = env->vstart; i < vl; i++) {
2287          if (!vm && !vext_elem_mask(v0, i)) {
2288              /* set masked-off elements to 1s */
2289              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2290              continue;
2291          }
2292          fn(vd, s1, vs2, i, env, vxrm);
2293      }
2294      env->vstart = 0;
2295  }
2296  
2297  static inline void
2298  vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2299               CPURISCVState *env,
2300               uint32_t desc,
2301               opivx2_rm_fn *fn, uint32_t esz)
2302  {
2303      uint32_t vm = vext_vm(desc);
2304      uint32_t vl = env->vl;
2305      uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2306      uint32_t vta = vext_vta(desc);
2307      uint32_t vma = vext_vma(desc);
2308  
2309      switch (env->vxrm) {
2310      case 0: /* rnu */
2311          vext_vx_rm_1(vd, v0, s1, vs2,
2312                       env, vl, vm, 0, fn, vma, esz);
2313          break;
2314      case 1: /* rne */
2315          vext_vx_rm_1(vd, v0, s1, vs2,
2316                       env, vl, vm, 1, fn, vma, esz);
2317          break;
2318      case 2: /* rdn */
2319          vext_vx_rm_1(vd, v0, s1, vs2,
2320                       env, vl, vm, 2, fn, vma, esz);
2321          break;
2322      default: /* rod */
2323          vext_vx_rm_1(vd, v0, s1, vs2,
2324                       env, vl, vm, 3, fn, vma, esz);
2325          break;
2326      }
2327      /* set tail elements to 1s */
2328      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2329  }
2330  
2331  /* generate helpers for fixed point instructions with OPIVX format */
2332  #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2333  void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2334                    void *vs2, CPURISCVState *env,          \
2335                    uint32_t desc)                          \
2336  {                                                         \
2337      vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2338                   do_##NAME, ESZ);                         \
2339  }
2340  
2341  RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2342  RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2343  RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2344  RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2345  GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2346  GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2347  GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2348  GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2349  
2350  static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2351  {
2352      int8_t res = a + b;
2353      if ((res ^ a) & (res ^ b) & INT8_MIN) {
2354          res = a > 0 ? INT8_MAX : INT8_MIN;
2355          env->vxsat = 0x1;
2356      }
2357      return res;
2358  }
2359  
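/*
 * Illustrative cross-check for the sign-bit overflow test used by sadd8()
 * above (a sketch added for exposition; the name sadd8_ref_example is not
 * part of the original helpers and nothing calls it): widening to 16 bits
 * makes the exact sum representable, so saturation becomes a plain range
 * check.  For example, a = 100, b = 50 gives res = (int8_t)150 = -106;
 * both (res ^ a) and (res ^ b) have the sign bit set, so the result
 * saturates to INT8_MAX and vxsat is raised.
 */
static inline int8_t sadd8_ref_example(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t sum = (int16_t)a + b;   /* exact, no wrap-around */

    if (sum > INT8_MAX) {
        env->vxsat = 0x1;
        return INT8_MAX;
    }
    if (sum < INT8_MIN) {
        env->vxsat = 0x1;
        return INT8_MIN;
    }
    return sum;
}
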
2360  static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2361                               int16_t b)
2362  {
2363      int16_t res = a + b;
2364      if ((res ^ a) & (res ^ b) & INT16_MIN) {
2365          res = a > 0 ? INT16_MAX : INT16_MIN;
2366          env->vxsat = 0x1;
2367      }
2368      return res;
2369  }
2370  
2371  static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2372                               int32_t b)
2373  {
2374      int32_t res = a + b;
2375      if ((res ^ a) & (res ^ b) & INT32_MIN) {
2376          res = a > 0 ? INT32_MAX : INT32_MIN;
2377          env->vxsat = 0x1;
2378      }
2379      return res;
2380  }
2381  
2382  static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2383                               int64_t b)
2384  {
2385      int64_t res = a + b;
2386      if ((res ^ a) & (res ^ b) & INT64_MIN) {
2387          res = a > 0 ? INT64_MAX : INT64_MIN;
2388          env->vxsat = 0x1;
2389      }
2390      return res;
2391  }
2392  
2393  RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2394  RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2395  RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2396  RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2397  GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2398  GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2399  GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2400  GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2401  
2402  RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2403  RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2404  RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2405  RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2406  GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2407  GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2408  GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2409  GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2410  
2411  static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2412                               uint8_t b)
2413  {
2414      uint8_t res = a - b;
2415      if (res > a) {
2416          res = 0;
2417          env->vxsat = 0x1;
2418      }
2419      return res;
2420  }
2421  
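/*
 * For the unsigned saturating subtract above, "res > a" can only hold when
 * the subtraction wrapped around (a borrow occurred).  For example, a = 5,
 * b = 9 gives res = (uint8_t)(5 - 9) = 252 > 5, so the result clamps to 0
 * and vxsat is set.
 */
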
2422  static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2423                                 uint16_t b)
2424  {
2425      uint16_t res = a - b;
2426      if (res > a) {
2427          res = 0;
2428          env->vxsat = 0x1;
2429      }
2430      return res;
2431  }
2432  
2433  static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2434                                 uint32_t b)
2435  {
2436      uint32_t res = a - b;
2437      if (res > a) {
2438          res = 0;
2439          env->vxsat = 0x1;
2440      }
2441      return res;
2442  }
2443  
2444  static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2445                                 uint64_t b)
2446  {
2447      uint64_t res = a - b;
2448      if (res > a) {
2449          res = 0;
2450          env->vxsat = 0x1;
2451      }
2452      return res;
2453  }
2454  
2455  RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2456  RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2457  RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2458  RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2459  GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2460  GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2461  GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2462  GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2463  
2464  RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2465  RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2466  RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2467  RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2468  GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2469  GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2470  GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2471  GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2472  
2473  static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2474  {
2475      int8_t res = a - b;
2476      if ((res ^ a) & (a ^ b) & INT8_MIN) {
2477          res = a >= 0 ? INT8_MAX : INT8_MIN;
2478          env->vxsat = 0x1;
2479      }
2480      return res;
2481  }
2482  
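/*
 * Signed subtraction can only overflow when the operands have different
 * signs, which is what (a ^ b) & INT8_MIN tests; the result then has the
 * wrong sign, caught by (res ^ a).  For example, a = -128, b = 1 gives
 * res = (int8_t)(-129) = 127: a and b differ in sign and res differs in
 * sign from a, so the result saturates to INT8_MIN and vxsat is set.
 */
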
2483  static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2484                               int16_t b)
2485  {
2486      int16_t res = a - b;
2487      if ((res ^ a) & (a ^ b) & INT16_MIN) {
2488          res = a >= 0 ? INT16_MAX : INT16_MIN;
2489          env->vxsat = 0x1;
2490      }
2491      return res;
2492  }
2493  
2494  static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2495                               int32_t b)
2496  {
2497      int32_t res = a - b;
2498      if ((res ^ a) & (a ^ b) & INT32_MIN) {
2499          res = a >= 0 ? INT32_MAX : INT32_MIN;
2500          env->vxsat = 0x1;
2501      }
2502      return res;
2503  }
2504  
2505  static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2506                               int64_t b)
2507  {
2508      int64_t res = a - b;
2509      if ((res ^ a) & (a ^ b) & INT64_MIN) {
2510          res = a >= 0 ? INT64_MAX : INT64_MIN;
2511          env->vxsat = 0x1;
2512      }
2513      return res;
2514  }
2515  
2516  RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2517  RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2518  RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2519  RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2520  GEN_VEXT_VV_RM(vssub_vv_b, 1)
2521  GEN_VEXT_VV_RM(vssub_vv_h, 2)
2522  GEN_VEXT_VV_RM(vssub_vv_w, 4)
2523  GEN_VEXT_VV_RM(vssub_vv_d, 8)
2524  
2525  RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2526  RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2527  RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2528  RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2529  GEN_VEXT_VX_RM(vssub_vx_b, 1)
2530  GEN_VEXT_VX_RM(vssub_vx_h, 2)
2531  GEN_VEXT_VX_RM(vssub_vx_w, 4)
2532  GEN_VEXT_VX_RM(vssub_vx_d, 8)
2533  
2534  /* Vector Single-Width Averaging Add and Subtract */
2535  static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2536  {
2537      uint8_t d = extract64(v, shift, 1);
2538      uint8_t d1;
2539      uint64_t D1, D2;
2540  
2541      if (shift == 0 || shift > 64) {
2542          return 0;
2543      }
2544  
2545      d1 = extract64(v, shift - 1, 1);
2546      D1 = extract64(v, 0, shift);
2547      if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2548          return d1;
2549      } else if (vxrm == 1) { /* round-to-nearest-even */
2550          if (shift > 1) {
2551              D2 = extract64(v, 0, shift - 1);
2552              return d1 & ((D2 != 0) | d);
2553          } else {
2554              return d1 & d;
2555          }
2556      } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2557          return !d & (D1 != 0);
2558      }
2559      return 0; /* round-down (truncate) */
2560  }
2561  
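/*
 * Worked example of the four fixed-point rounding modes (an illustrative
 * sketch added for exposition; get_round_example is not part of the
 * original helpers and nothing calls it).  Shifting v = 11 (0b1011) right
 * by 2 discards the fraction 0.75, so:
 *   rnu (0): round-to-nearest-up   -> 2 + 1 = 3
 *   rne (1): round-to-nearest-even -> 2 + 1 = 3  (0.75 is not a tie)
 *   rdn (2): round-down / truncate -> 2
 *   rod (3): round-to-odd ("jam")  -> 2 + 1 = 3
 */
static inline uint8_t get_round_example(CPURISCVState *env, uint8_t v)
{
    uint8_t shift = 2;
    uint8_t round = get_round(env->vxrm, v, shift);

    return (v >> shift) + round;
}
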
2562  static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2563                               int32_t b)
2564  {
2565      int64_t res = (int64_t)a + b;
2566      uint8_t round = get_round(vxrm, res, 1);
2567  
2568      return (res >> 1) + round;
2569  }
2570  
2571  static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2572                               int64_t b)
2573  {
2574      int64_t res = a + b;
2575      uint8_t round = get_round(vxrm, res, 1);
2576      int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2577  
2578      /* With signed overflow, bit 64 is inverse of bit 63. */
2579      return ((res >> 1) ^ over) + round;
2580  }
2581  
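/*
 * The trick above: when a + b overflows, bit 63 of res is the complement of
 * the true bit 63 of the 65-bit sum, and `over` has exactly that bit set, so
 * the XOR after the shift repairs it.  For example a = b = INT64_MAX wraps
 * to res = -2; res >> 1 = -1, and -1 ^ INT64_MIN = INT64_MAX, the exact
 * average.  A cross-check using a 128-bit intermediate (an illustrative
 * sketch assuming the host compiler provides __int128; aadd64_ref_example
 * is not used by the helpers):
 */
static inline int64_t aadd64_ref_example(CPURISCVState *env, int vxrm,
                                         int64_t a, int64_t b)
{
    __int128 sum = (__int128)a + b;               /* exact 65-bit sum */
    uint8_t round = get_round(vxrm, (uint64_t)sum, 1);

    return (int64_t)((sum >> 1) + round);         /* always representable */
}
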
2582  RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2583  RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2584  RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2585  RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2586  GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2587  GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2588  GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2589  GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2590  
2591  RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2592  RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2593  RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2594  RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2595  GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2596  GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2597  GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2598  GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2599  
2600  static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2601                                 uint32_t a, uint32_t b)
2602  {
2603      uint64_t res = (uint64_t)a + b;
2604      uint8_t round = get_round(vxrm, res, 1);
2605  
2606      return (res >> 1) + round;
2607  }
2608  
2609  static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2610                                 uint64_t a, uint64_t b)
2611  {
2612      uint64_t res = a + b;
2613      uint8_t round = get_round(vxrm, res, 1);
2614      uint64_t over = (uint64_t)(res < a) << 63;
2615  
2616      return ((res >> 1) | over) + round;
2617  }
2618  
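/*
 * For the unsigned variant the lost carry-out is recovered by "res < a" and
 * re-inserted as bit 63 after the halving shift.  For example
 * a = b = UINT64_MAX: res wraps to UINT64_MAX - 1, res < a, so
 * (res >> 1) | over = UINT64_MAX, the exact average (the round bit is 0).
 */
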
2619  RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2620  RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2621  RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2622  RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2623  GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2624  GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2625  GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2626  GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2627  
2628  RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2629  RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2630  RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2631  RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2632  GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2633  GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2634  GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2635  GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2636  
2637  static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2638                               int32_t b)
2639  {
2640      int64_t res = (int64_t)a - b;
2641      uint8_t round = get_round(vxrm, res, 1);
2642  
2643      return (res >> 1) + round;
2644  }
2645  
2646  static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2647                               int64_t b)
2648  {
2649      int64_t res = (int64_t)a - b;
2650      uint8_t round = get_round(vxrm, res, 1);
2651      int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2652  
2653      /* With signed overflow, bit 64 is inverse of bit 63. */
2654      return ((res >> 1) ^ over) + round;
2655  }
2656  
2657  RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2658  RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2659  RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2660  RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2661  GEN_VEXT_VV_RM(vasub_vv_b, 1)
2662  GEN_VEXT_VV_RM(vasub_vv_h, 2)
2663  GEN_VEXT_VV_RM(vasub_vv_w, 4)
2664  GEN_VEXT_VV_RM(vasub_vv_d, 8)
2665  
2666  RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2667  RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2668  RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2669  RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2670  GEN_VEXT_VX_RM(vasub_vx_b, 1)
2671  GEN_VEXT_VX_RM(vasub_vx_h, 2)
2672  GEN_VEXT_VX_RM(vasub_vx_w, 4)
2673  GEN_VEXT_VX_RM(vasub_vx_d, 8)
2674  
2675  static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2676                                 uint32_t a, uint32_t b)
2677  {
2678      int64_t res = (int64_t)a - b;
2679      uint8_t round = get_round(vxrm, res, 1);
2680  
2681      return (res >> 1) + round;
2682  }
2683  
2684  static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2685                                 uint64_t a, uint64_t b)
2686  {
2687      uint64_t res = (uint64_t)a - b;
2688      uint8_t round = get_round(vxrm, res, 1);
2689      uint64_t over = (uint64_t)(res > a) << 63;
2690  
2691      return ((res >> 1) | over) + round;
2692  }
2693  
2694  RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2695  RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2696  RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2697  RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2698  GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2699  GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2700  GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2701  GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2702  
2703  RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2704  RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2705  RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2706  RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2707  GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2708  GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2709  GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2710  GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2711  
2712  /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2713  static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2714  {
2715      uint8_t round;
2716      int16_t res;
2717  
2718      res = (int16_t)a * (int16_t)b;
2719      round = get_round(vxrm, res, 7);
2720      res = (res >> 7) + round;
2721  
2722      if (res > INT8_MAX) {
2723          env->vxsat = 0x1;
2724          return INT8_MAX;
2725      } else if (res < INT8_MIN) {
2726          env->vxsat = 0x1;
2727          return INT8_MIN;
2728      } else {
2729          return res;
2730      }
2731  }
2732  
2733  static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2734  {
2735      uint8_t round;
2736      int32_t res;
2737  
2738      res = (int32_t)a * (int32_t)b;
2739      round = get_round(vxrm, res, 15);
2740      res = (res >> 15) + round;
2741  
2742      if (res > INT16_MAX) {
2743          env->vxsat = 0x1;
2744          return INT16_MAX;
2745      } else if (res < INT16_MIN) {
2746          env->vxsat = 0x1;
2747          return INT16_MIN;
2748      } else {
2749          return res;
2750      }
2751  }
2752  
2753  static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2754  {
2755      uint8_t round;
2756      int64_t res;
2757  
2758      res = (int64_t)a * (int64_t)b;
2759      round = get_round(vxrm, res, 31);
2760      res = (res >> 31) + round;
2761  
2762      if (res > INT32_MAX) {
2763          env->vxsat = 0x1;
2764          return INT32_MAX;
2765      } else if (res < INT32_MIN) {
2766          env->vxsat = 0x1;
2767          return INT32_MIN;
2768      } else {
2769          return res;
2770      }
2771  }
2772  
2773  static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2774  {
2775      uint8_t round;
2776      uint64_t hi_64, lo_64;
2777      int64_t res;
2778  
2779      if (a == INT64_MIN && b == INT64_MIN) {
2780          env->vxsat = 1;
2781          return INT64_MAX;
2782      }
2783  
2784      muls64(&lo_64, &hi_64, a, b);
2785      round = get_round(vxrm, lo_64, 63);
2786      /*
2787       * Cannot overflow, as there are always
2788       * 2 sign bits after multiply.
2789       */
2790      res = (hi_64 << 1) | (lo_64 >> 63);
2791      if (round) {
2792          if (res == INT64_MAX) {
2793              env->vxsat = 1;
2794          } else {
2795              res += 1;
2796          }
2797      }
2798      return res;
2799  }
2800  
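/*
 * vsmul treats the operands as signed fixed-point fractions in [-1, 1), so
 * the product of two N-bit values carries two sign bits and shifting right
 * by N - 1 brings it back to N bits.  The only combination that cannot be
 * represented is (-1) * (-1) = +1: at 8 bits, a = b = -128 gives
 * a * b = 16384, and (16384 >> 7) = 128 > INT8_MAX, so the result saturates
 * to 127 and vxsat is set; at 64 bits the same case is filtered out
 * explicitly above before the 128-bit product is reassembled from
 * hi_64/lo_64.
 */
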
2801  RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2802  RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2803  RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2804  RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2805  GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2806  GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2807  GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2808  GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2809  
2810  RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2811  RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2812  RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2813  RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2814  GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2815  GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2816  GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2817  GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2818  
2819  /* Vector Single-Width Scaling Shift Instructions */
2820  static inline uint8_t
2821  vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2822  {
2823      uint8_t round, shift = b & 0x7;
2824      uint8_t res;
2825  
2826      round = get_round(vxrm, a, shift);
2827      res = (a >> shift) + round;
2828      return res;
2829  }
2830  static inline uint16_t
2831  vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2832  {
2833      uint8_t round, shift = b & 0xf;
2834  
2835      round = get_round(vxrm, a, shift);
2836      return (a >> shift) + round;
2837  }
2838  static inline uint32_t
2839  vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2840  {
2841      uint8_t round, shift = b & 0x1f;
2842  
2843      round = get_round(vxrm, a, shift);
2844      return (a >> shift) + round;
2845  }
2846  static inline uint64_t
2847  vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2848  {
2849      uint8_t round, shift = b & 0x3f;
2850  
2851      round = get_round(vxrm, a, shift);
2852      return (a >> shift) + round;
2853  }
2854  RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2855  RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2856  RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2857  RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2858  GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2859  GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2860  GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2861  GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2862  
2863  RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2864  RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2865  RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2866  RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2867  GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2868  GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2869  GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2870  GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2871  
2872  static inline int8_t
2873  vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2874  {
2875      uint8_t round, shift = b & 0x7;
2876  
2877      round = get_round(vxrm, a, shift);
2878      return (a >> shift) + round;
2879  }
2880  static inline int16_t
2881  vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2882  {
2883      uint8_t round, shift = b & 0xf;
2884  
2885      round = get_round(vxrm, a, shift);
2886      return (a >> shift) + round;
2887  }
2888  static inline int32_t
2889  vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2890  {
2891      uint8_t round, shift = b & 0x1f;
2892  
2893      round = get_round(vxrm, a, shift);
2894      return (a >> shift) + round;
2895  }
2896  static inline int64_t
2897  vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2898  {
2899      uint8_t round, shift = b & 0x3f;
2900  
2901      round = get_round(vxrm, a, shift);
2902      return (a >> shift) + round;
2903  }
2904  
2905  RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2906  RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2907  RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2908  RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2909  GEN_VEXT_VV_RM(vssra_vv_b, 1)
2910  GEN_VEXT_VV_RM(vssra_vv_h, 2)
2911  GEN_VEXT_VV_RM(vssra_vv_w, 4)
2912  GEN_VEXT_VV_RM(vssra_vv_d, 8)
2913  
2914  RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2915  RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2916  RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2917  RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2918  GEN_VEXT_VX_RM(vssra_vx_b, 1)
2919  GEN_VEXT_VX_RM(vssra_vx_h, 2)
2920  GEN_VEXT_VX_RM(vssra_vx_w, 4)
2921  GEN_VEXT_VX_RM(vssra_vx_d, 8)
2922  
2923  /* Vector Narrowing Fixed-Point Clip Instructions */
2924  static inline int8_t
2925  vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2926  {
2927      uint8_t round, shift = b & 0xf;
2928      int16_t res;
2929  
2930      round = get_round(vxrm, a, shift);
2931      res = (a >> shift) + round;
2932      if (res > INT8_MAX) {
2933          env->vxsat = 0x1;
2934          return INT8_MAX;
2935      } else if (res < INT8_MIN) {
2936          env->vxsat = 0x1;
2937          return INT8_MIN;
2938      } else {
2939          return res;
2940      }
2941  }
2942  
2943  static inline int16_t
2944  vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2945  {
2946      uint8_t round, shift = b & 0x1f;
2947      int32_t res;
2948  
2949      round = get_round(vxrm, a, shift);
2950      res = (a >> shift) + round;
2951      if (res > INT16_MAX) {
2952          env->vxsat = 0x1;
2953          return INT16_MAX;
2954      } else if (res < INT16_MIN) {
2955          env->vxsat = 0x1;
2956          return INT16_MIN;
2957      } else {
2958          return res;
2959      }
2960  }
2961  
2962  static inline int32_t
2963  vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2964  {
2965      uint8_t round, shift = b & 0x3f;
2966      int64_t res;
2967  
2968      round = get_round(vxrm, a, shift);
2969      res = (a >> shift) + round;
2970      if (res > INT32_MAX) {
2971          env->vxsat = 0x1;
2972          return INT32_MAX;
2973      } else if (res < INT32_MIN) {
2974          env->vxsat = 0x1;
2975          return INT32_MIN;
2976      } else {
2977          return res;
2978      }
2979  }
2980  
2981  RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2982  RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2983  RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2984  GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2985  GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2986  GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2987  
2988  RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2989  RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2990  RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2991  GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2992  GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2993  GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2994  
2995  static inline uint8_t
2996  vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2997  {
2998      uint8_t round, shift = b & 0xf;
2999      uint16_t res;
3000  
3001      round = get_round(vxrm, a, shift);
3002      res = (a >> shift) + round;
3003      if (res > UINT8_MAX) {
3004          env->vxsat = 0x1;
3005          return UINT8_MAX;
3006      } else {
3007          return res;
3008      }
3009  }
3010  
3011  static inline uint16_t
3012  vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
3013  {
3014      uint8_t round, shift = b & 0x1f;
3015      uint32_t res;
3016  
3017      round = get_round(vxrm, a, shift);
3018      res = (a >> shift) + round;
3019      if (res > UINT16_MAX) {
3020          env->vxsat = 0x1;
3021          return UINT16_MAX;
3022      } else {
3023          return res;
3024      }
3025  }
3026  
3027  static inline uint32_t
3028  vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
3029  {
3030      uint8_t round, shift = b & 0x3f;
3031      uint64_t res;
3032  
3033      round = get_round(vxrm, a, shift);
3034      res = (a >> shift) + round;
3035      if (res > UINT32_MAX) {
3036          env->vxsat = 0x1;
3037          return UINT32_MAX;
3038      } else {
3039          return res;
3040      }
3041  }
3042  
3043  RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3044  RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3045  RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3046  GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3047  GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3048  GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3049  
3050  RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3051  RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3052  RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3053  GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3054  GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3055  GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3056  
3057  /*
3058   * Vector Floating-Point Arithmetic Instructions
3059   */
3060  /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3061  #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3062  static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3063                        CPURISCVState *env)                      \
3064  {                                                              \
3065      TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3066      TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3067      *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3068  }
3069  
3070  #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3071  void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3072                    void *vs2, CPURISCVState *env,          \
3073                    uint32_t desc)                          \
3074  {                                                         \
3075      uint32_t vm = vext_vm(desc);                          \
3076      uint32_t vl = env->vl;                                \
3077      uint32_t total_elems =                                \
3078          vext_get_total_elems(env, desc, ESZ);             \
3079      uint32_t vta = vext_vta(desc);                        \
3080      uint32_t vma = vext_vma(desc);                        \
3081      uint32_t i;                                           \
3082                                                            \
3083      VSTART_CHECK_EARLY_EXIT(env);                         \
3084                                                            \
3085      for (i = env->vstart; i < vl; i++) {                  \
3086          if (!vm && !vext_elem_mask(v0, i)) {              \
3087              /* set masked-off elements to 1s */           \
3088              vext_set_elems_1s(vd, vma, i * ESZ,           \
3089                                (i + 1) * ESZ);             \
3090              continue;                                     \
3091          }                                                 \
3092          do_##NAME(vd, vs1, vs2, i, env);                  \
3093      }                                                     \
3094      env->vstart = 0;                                      \
3095      /* set tail elements to 1s */                         \
3096      vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3097                        total_elems * ESZ);                 \
3098  }
3099  
3100  RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3101  RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3102  RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3103  GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3104  GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3105  GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3106  
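/*
 * For reference, RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2,
 * float16_add) expands to roughly the following per-element function (a
 * sketch assuming OP_UUU_H supplies uint16_t for every type parameter, as
 * the other OP_* macros in this file do):
 *
 *     static void do_vfadd_vv_h(void *vd, void *vs1, void *vs2, int i,
 *                               CPURISCVState *env)
 *     {
 *         uint16_t s1 = *((uint16_t *)vs1 + H2(i));
 *         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
 *         *((uint16_t *)vd + H2(i)) = float16_add(s2, s1, &env->fp_status);
 *     }
 *
 * which GEN_VEXT_VV_ENV(vfadd_vv_h, 2) then wraps in the masked,
 * tail-agnostic element loop.
 */
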
3107  #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3108  static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3109                        CPURISCVState *env)                      \
3110  {                                                              \
3111      TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3112      *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3113  }
3114  
3115  #define GEN_VEXT_VF(NAME, ESZ)                            \
3116  void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3117                    void *vs2, CPURISCVState *env,          \
3118                    uint32_t desc)                          \
3119  {                                                         \
3120      uint32_t vm = vext_vm(desc);                          \
3121      uint32_t vl = env->vl;                                \
3122      uint32_t total_elems =                                \
3123          vext_get_total_elems(env, desc, ESZ);             \
3124      uint32_t vta = vext_vta(desc);                        \
3125      uint32_t vma = vext_vma(desc);                        \
3126      uint32_t i;                                           \
3127                                                            \
3128      VSTART_CHECK_EARLY_EXIT(env);                         \
3129                                                            \
3130      for (i = env->vstart; i < vl; i++) {                  \
3131          if (!vm && !vext_elem_mask(v0, i)) {              \
3132              /* set masked-off elements to 1s */           \
3133              vext_set_elems_1s(vd, vma, i * ESZ,           \
3134                                (i + 1) * ESZ);             \
3135              continue;                                     \
3136          }                                                 \
3137          do_##NAME(vd, s1, vs2, i, env);                   \
3138      }                                                     \
3139      env->vstart = 0;                                      \
3140      /* set tail elements to 1s */                         \
3141      vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3142                        total_elems * ESZ);                 \
3143  }
3144  
3145  RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3146  RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3147  RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3148  GEN_VEXT_VF(vfadd_vf_h, 2)
3149  GEN_VEXT_VF(vfadd_vf_w, 4)
3150  GEN_VEXT_VF(vfadd_vf_d, 8)
3151  
3152  RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3153  RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3154  RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3155  GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3156  GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3157  GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3158  RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3159  RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3160  RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3161  GEN_VEXT_VF(vfsub_vf_h, 2)
3162  GEN_VEXT_VF(vfsub_vf_w, 4)
3163  GEN_VEXT_VF(vfsub_vf_d, 8)
3164  
3165  static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3166  {
3167      return float16_sub(b, a, s);
3168  }
3169  
3170  static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3171  {
3172      return float32_sub(b, a, s);
3173  }
3174  
3175  static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3176  {
3177      return float64_sub(b, a, s);
3178  }
3179  
3180  RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3181  RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3182  RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3183  GEN_VEXT_VF(vfrsub_vf_h, 2)
3184  GEN_VEXT_VF(vfrsub_vf_w, 4)
3185  GEN_VEXT_VF(vfrsub_vf_d, 8)
3186  
3187  /* Vector Widening Floating-Point Add/Subtract Instructions */
3188  static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3189  {
3190      return float32_add(float16_to_float32(a, true, s),
3191                         float16_to_float32(b, true, s), s);
3192  }
3193  
3194  static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3195  {
3196      return float64_add(float32_to_float64(a, s),
3197                         float32_to_float64(b, s), s);
3198  
3199  }
3200  
3201  RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3202  RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3203  GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3204  GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3205  RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3206  RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3207  GEN_VEXT_VF(vfwadd_vf_h, 4)
3208  GEN_VEXT_VF(vfwadd_vf_w, 8)
3209  
3210  static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3211  {
3212      return float32_sub(float16_to_float32(a, true, s),
3213                         float16_to_float32(b, true, s), s);
3214  }
3215  
3216  static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3217  {
3218      return float64_sub(float32_to_float64(a, s),
3219                         float32_to_float64(b, s), s);
3220  
3221  }
3222  
3223  RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3224  RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3225  GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3226  GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3227  RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3228  RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3229  GEN_VEXT_VF(vfwsub_vf_h, 4)
3230  GEN_VEXT_VF(vfwsub_vf_w, 8)
3231  
3232  static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3233  {
3234      return float32_add(a, float16_to_float32(b, true, s), s);
3235  }
3236  
3237  static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3238  {
3239      return float64_add(a, float32_to_float64(b, s), s);
3240  }
3241  
3242  RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3243  RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3244  GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3245  GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3246  RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3247  RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3248  GEN_VEXT_VF(vfwadd_wf_h, 4)
3249  GEN_VEXT_VF(vfwadd_wf_w, 8)
3250  
3251  static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3252  {
3253      return float32_sub(a, float16_to_float32(b, true, s), s);
3254  }
3255  
3256  static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3257  {
3258      return float64_sub(a, float32_to_float64(b, s), s);
3259  }
3260  
3261  RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3262  RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3263  GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3264  GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3265  RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3266  RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3267  GEN_VEXT_VF(vfwsub_wf_h, 4)
3268  GEN_VEXT_VF(vfwsub_wf_w, 8)
3269  
3270  /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3271  RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3272  RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3273  RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3274  GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3275  GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3276  GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3277  RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3278  RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3279  RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3280  GEN_VEXT_VF(vfmul_vf_h, 2)
3281  GEN_VEXT_VF(vfmul_vf_w, 4)
3282  GEN_VEXT_VF(vfmul_vf_d, 8)
3283  
3284  RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3285  RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3286  RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3287  GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3288  GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3289  GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3290  RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3291  RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3292  RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3293  GEN_VEXT_VF(vfdiv_vf_h, 2)
3294  GEN_VEXT_VF(vfdiv_vf_w, 4)
3295  GEN_VEXT_VF(vfdiv_vf_d, 8)
3296  
3297  static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3298  {
3299      return float16_div(b, a, s);
3300  }
3301  
3302  static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3303  {
3304      return float32_div(b, a, s);
3305  }
3306  
3307  static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3308  {
3309      return float64_div(b, a, s);
3310  }
3311  
3312  RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3313  RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3314  RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3315  GEN_VEXT_VF(vfrdiv_vf_h, 2)
3316  GEN_VEXT_VF(vfrdiv_vf_w, 4)
3317  GEN_VEXT_VF(vfrdiv_vf_d, 8)
3318  
3319  /* Vector Widening Floating-Point Multiply */
3320  static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3321  {
3322      return float32_mul(float16_to_float32(a, true, s),
3323                         float16_to_float32(b, true, s), s);
3324  }
3325  
3326  static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3327  {
3328      return float64_mul(float32_to_float64(a, s),
3329                         float32_to_float64(b, s), s);
3330  
3331  }
3332  RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3333  RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3334  GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3335  GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3336  RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3337  RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3338  GEN_VEXT_VF(vfwmul_vf_h, 4)
3339  GEN_VEXT_VF(vfwmul_vf_w, 8)
3340  
3341  /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3342  #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3343  static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3344                        CPURISCVState *env)                          \
3345  {                                                                  \
3346      TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3347      TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3348      TD d = *((TD *)vd + HD(i));                                    \
3349      *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3350  }
3351  
3352  static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3353  {
3354      return float16_muladd(a, b, d, 0, s);
3355  }
3356  
3357  static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3358  {
3359      return float32_muladd(a, b, d, 0, s);
3360  }
3361  
3362  static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3363  {
3364      return float64_muladd(a, b, d, 0, s);
3365  }
3366  
3367  RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3368  RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3369  RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3370  GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3371  GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3372  GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3373  
3374  #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3375  static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3376                        CPURISCVState *env)                         \
3377  {                                                                 \
3378      TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3379      TD d = *((TD *)vd + HD(i));                                   \
3380      *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3381  }
3382  
3383  RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3384  RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3385  RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3386  GEN_VEXT_VF(vfmacc_vf_h, 2)
3387  GEN_VEXT_VF(vfmacc_vf_w, 4)
3388  GEN_VEXT_VF(vfmacc_vf_d, 8)
3389  
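/*
 * The remaining fused multiply-add variants differ only in which softfloat
 * negation flags they pass: float_muladd_negate_product negates the a * b
 * term and float_muladd_negate_c negates the addend, so e.g. fnmacc
 * computes -(vs1 * vs2) - vd while fmsac computes (vs1 * vs2) - vd.
 */
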
3390  static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3391  {
3392      return float16_muladd(a, b, d, float_muladd_negate_c |
3393                                     float_muladd_negate_product, s);
3394  }
3395  
3396  static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3397  {
3398      return float32_muladd(a, b, d, float_muladd_negate_c |
3399                                     float_muladd_negate_product, s);
3400  }
3401  
3402  static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403  {
3404      return float64_muladd(a, b, d, float_muladd_negate_c |
3405                                     float_muladd_negate_product, s);
3406  }
3407  
3408  RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3409  RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3410  RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3411  GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3412  GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3413  GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3414  RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3415  RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3416  RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3417  GEN_VEXT_VF(vfnmacc_vf_h, 2)
3418  GEN_VEXT_VF(vfnmacc_vf_w, 4)
3419  GEN_VEXT_VF(vfnmacc_vf_d, 8)
3420  
3421  static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3422  {
3423      return float16_muladd(a, b, d, float_muladd_negate_c, s);
3424  }
3425  
3426  static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3427  {
3428      return float32_muladd(a, b, d, float_muladd_negate_c, s);
3429  }
3430  
3431  static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3432  {
3433      return float64_muladd(a, b, d, float_muladd_negate_c, s);
3434  }
3435  
3436  RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3437  RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3438  RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3439  GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3440  GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3441  GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3442  RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3443  RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3444  RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3445  GEN_VEXT_VF(vfmsac_vf_h, 2)
3446  GEN_VEXT_VF(vfmsac_vf_w, 4)
3447  GEN_VEXT_VF(vfmsac_vf_d, 8)
3448  
3449  static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3450  {
3451      return float16_muladd(a, b, d, float_muladd_negate_product, s);
3452  }
3453  
3454  static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3455  {
3456      return float32_muladd(a, b, d, float_muladd_negate_product, s);
3457  }
3458  
3459  static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3460  {
3461      return float64_muladd(a, b, d, float_muladd_negate_product, s);
3462  }
3463  
3464  RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3465  RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3466  RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3467  GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3468  GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3469  GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3470  RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3471  RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3472  RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3473  GEN_VEXT_VF(vfnmsac_vf_h, 2)
3474  GEN_VEXT_VF(vfnmsac_vf_w, 4)
3475  GEN_VEXT_VF(vfnmsac_vf_d, 8)
3476  
3477  static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3478  {
3479      return float16_muladd(d, b, a, 0, s);
3480  }
3481  
3482  static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3483  {
3484      return float32_muladd(d, b, a, 0, s);
3485  }
3486  
3487  static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3488  {
3489      return float64_muladd(d, b, a, 0, s);
3490  }
3491  
3492  RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3493  RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3494  RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3495  GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3496  GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3497  GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3498  RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3499  RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3500  RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3501  GEN_VEXT_VF(vfmadd_vf_h, 2)
3502  GEN_VEXT_VF(vfmadd_vf_w, 4)
3503  GEN_VEXT_VF(vfmadd_vf_d, 8)
3504  
3505  static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3506  {
3507      return float16_muladd(d, b, a, float_muladd_negate_c |
3508                                     float_muladd_negate_product, s);
3509  }
3510  
3511  static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3512  {
3513      return float32_muladd(d, b, a, float_muladd_negate_c |
3514                                     float_muladd_negate_product, s);
3515  }
3516  
3517  static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3518  {
3519      return float64_muladd(d, b, a, float_muladd_negate_c |
3520                                     float_muladd_negate_product, s);
3521  }
3522  
3523  RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3524  RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3525  RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3526  GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3527  GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3528  GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3529  RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3530  RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3531  RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3532  GEN_VEXT_VF(vfnmadd_vf_h, 2)
3533  GEN_VEXT_VF(vfnmadd_vf_w, 4)
3534  GEN_VEXT_VF(vfnmadd_vf_d, 8)
3535  
3536  static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3537  {
3538      return float16_muladd(d, b, a, float_muladd_negate_c, s);
3539  }
3540  
3541  static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3542  {
3543      return float32_muladd(d, b, a, float_muladd_negate_c, s);
3544  }
3545  
3546  static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3547  {
3548      return float64_muladd(d, b, a, float_muladd_negate_c, s);
3549  }
3550  
3551  RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3552  RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3553  RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3554  GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3555  GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3556  GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3557  RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3558  RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3559  RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3560  GEN_VEXT_VF(vfmsub_vf_h, 2)
3561  GEN_VEXT_VF(vfmsub_vf_w, 4)
3562  GEN_VEXT_VF(vfmsub_vf_d, 8)
3563  
3564  static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3565  {
3566      return float16_muladd(d, b, a, float_muladd_negate_product, s);
3567  }
3568  
3569  static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3570  {
3571      return float32_muladd(d, b, a, float_muladd_negate_product, s);
3572  }
3573  
3574  static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3575  {
3576      return float64_muladd(d, b, a, float_muladd_negate_product, s);
3577  }
3578  
3579  RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3580  RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3581  RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3582  GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3583  GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3584  GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3585  RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3586  RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3587  RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3588  GEN_VEXT_VF(vfnmsub_vf_h, 2)
3589  GEN_VEXT_VF(vfnmsub_vf_w, 4)
3590  GEN_VEXT_VF(vfnmsub_vf_d, 8)
3591  
3592  /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3593  static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3594  {
3595      return float32_muladd(float16_to_float32(a, true, s),
3596                            float16_to_float32(b, true, s), d, 0, s);
3597  }
3598  
3599  static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3600  {
3601      return float64_muladd(float32_to_float64(a, s),
3602                            float32_to_float64(b, s), d, 0, s);
3603  }
3604  
3605  RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3606  RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3607  GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3608  GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3609  RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3610  RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3611  GEN_VEXT_VF(vfwmacc_vf_h, 4)
3612  GEN_VEXT_VF(vfwmacc_vf_w, 8)
3613  
3614  static uint32_t fwmaccbf16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3615  {
3616      return float32_muladd(bfloat16_to_float32(a, s),
3617                            bfloat16_to_float32(b, s), d, 0, s);
3618  }
3619  
3620  RVVCALL(OPFVV3, vfwmaccbf16_vv, WOP_UUU_H, H4, H2, H2, fwmaccbf16)
3621  GEN_VEXT_VV_ENV(vfwmaccbf16_vv, 4)
3622  RVVCALL(OPFVF3, vfwmaccbf16_vf, WOP_UUU_H, H4, H2, fwmaccbf16)
3623  GEN_VEXT_VF(vfwmaccbf16_vf, 4)
3624  
3625  static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3626  {
3627      return float32_muladd(float16_to_float32(a, true, s),
3628                            float16_to_float32(b, true, s), d,
3629                            float_muladd_negate_c | float_muladd_negate_product,
3630                            s);
3631  }
3632  
3633  static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3634  {
3635      return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3636                            d, float_muladd_negate_c |
3637                               float_muladd_negate_product, s);
3638  }
3639  
3640  RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3641  RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3642  GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3643  GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3644  RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3645  RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3646  GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3647  GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3648  
3649  static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3650  {
3651      return float32_muladd(float16_to_float32(a, true, s),
3652                            float16_to_float32(b, true, s), d,
3653                            float_muladd_negate_c, s);
3654  }
3655  
3656  static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3657  {
3658      return float64_muladd(float32_to_float64(a, s),
3659                            float32_to_float64(b, s), d,
3660                            float_muladd_negate_c, s);
3661  }
3662  
3663  RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3664  RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3665  GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3666  GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3667  RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3668  RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3669  GEN_VEXT_VF(vfwmsac_vf_h, 4)
3670  GEN_VEXT_VF(vfwmsac_vf_w, 8)
3671  
3672  static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3673  {
3674      return float32_muladd(float16_to_float32(a, true, s),
3675                            float16_to_float32(b, true, s), d,
3676                            float_muladd_negate_product, s);
3677  }
3678  
3679  static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3680  {
3681      return float64_muladd(float32_to_float64(a, s),
3682                            float32_to_float64(b, s), d,
3683                            float_muladd_negate_product, s);
3684  }
3685  
3686  RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3687  RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3688  GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3689  GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3690  RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3691  RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3692  GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3693  GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3694  
3695  /* Vector Floating-Point Square-Root Instruction */
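/*
 * OPFVV1 expands to a per-element unary helper, do_<NAME>(), and
 * GEN_VEXT_V_ENV wraps it in the usual masked loop: masked-off elements are
 * filled with 1s when vma is set and the tail is filled with 1s when vta is
 * set.
 */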
3696  #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3697  static void do_##NAME(void *vd, void *vs2, int i,      \
3698                        CPURISCVState *env)              \
3699  {                                                      \
3700      TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3701      *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3702  }
3703  
3704  #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3705  void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3706                    CPURISCVState *env, uint32_t desc)   \
3707  {                                                      \
3708      uint32_t vm = vext_vm(desc);                       \
3709      uint32_t vl = env->vl;                             \
3710      uint32_t total_elems =                             \
3711          vext_get_total_elems(env, desc, ESZ);          \
3712      uint32_t vta = vext_vta(desc);                     \
3713      uint32_t vma = vext_vma(desc);                     \
3714      uint32_t i;                                        \
3715                                                         \
3716      VSTART_CHECK_EARLY_EXIT(env);                      \
3717                                                         \
3718      if (vl == 0) {                                     \
3719          return;                                        \
3720      }                                                  \
3721      for (i = env->vstart; i < vl; i++) {               \
3722          if (!vm && !vext_elem_mask(v0, i)) {           \
3723              /* set masked-off elements to 1s */        \
3724              vext_set_elems_1s(vd, vma, i * ESZ,        \
3725                                (i + 1) * ESZ);          \
3726              continue;                                  \
3727          }                                              \
3728          do_##NAME(vd, vs2, i, env);                    \
3729      }                                                  \
3730      env->vstart = 0;                                   \
3731      vext_set_elems_1s(vd, vta, vl * ESZ,               \
3732                        total_elems * ESZ);              \
3733  }
3734  
3735  RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3736  RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3737  RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3738  GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3739  GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3740  GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3741  
3742  /*
3743   * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3744   *
3745   * Adapted from riscv-v-spec recip.c:
3746   * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3747   */
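/*
 * frsqrt7() works on the raw IEEE encoding: sign, exponent and fraction are
 * unpacked, subnormal inputs are normalized first, and a 7-bit index built
 * from the exponent's low bit and the top fraction bits selects the
 * significand estimate from the 128-entry table.  The output exponent is
 * (3 * bias - 1 - exp) / 2, computed below in unsigned arithmetic as
 * (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2.
 */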
3748  static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3749  {
3750      uint64_t sign = extract64(f, frac_size + exp_size, 1);
3751      uint64_t exp = extract64(f, frac_size, exp_size);
3752      uint64_t frac = extract64(f, 0, frac_size);
3753  
3754      const uint8_t lookup_table[] = {
3755          52, 51, 50, 48, 47, 46, 44, 43,
3756          42, 41, 40, 39, 38, 36, 35, 34,
3757          33, 32, 31, 30, 30, 29, 28, 27,
3758          26, 25, 24, 23, 23, 22, 21, 20,
3759          19, 19, 18, 17, 16, 16, 15, 14,
3760          14, 13, 12, 12, 11, 10, 10, 9,
3761          9, 8, 7, 7, 6, 6, 5, 4,
3762          4, 3, 3, 2, 2, 1, 1, 0,
3763          127, 125, 123, 121, 119, 118, 116, 114,
3764          113, 111, 109, 108, 106, 105, 103, 102,
3765          100, 99, 97, 96, 95, 93, 92, 91,
3766          90, 88, 87, 86, 85, 84, 83, 82,
3767          80, 79, 78, 77, 76, 75, 74, 73,
3768          72, 71, 70, 70, 69, 68, 67, 66,
3769          65, 64, 63, 63, 62, 61, 60, 59,
3770          59, 58, 57, 56, 56, 55, 54, 53
3771      };
3772      const int precision = 7;
3773  
3774      if (exp == 0 && frac != 0) { /* subnormal */
3775          /* Normalize the subnormal. */
3776          while (extract64(frac, frac_size - 1, 1) == 0) {
3777              exp--;
3778              frac <<= 1;
3779          }
3780  
3781          frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3782      }
3783  
3784      int idx = ((exp & 1) << (precision - 1)) |
3785                (frac >> (frac_size - precision + 1));
3786      uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3787                          (frac_size - precision);
3788      uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3789  
3790      uint64_t val = 0;
3791      val = deposit64(val, 0, frac_size, out_frac);
3792      val = deposit64(val, frac_size, exp_size, out_exp);
3793      val = deposit64(val, frac_size + exp_size, 1, sign);
3794      return val;
3795  }
3796  
3797  static float16 frsqrt7_h(float16 f, float_status *s)
3798  {
3799      int exp_size = 5, frac_size = 10;
3800      bool sign = float16_is_neg(f);
3801  
3802      /*
3803       * frsqrt7(sNaN) = canonical NaN
3804       * frsqrt7(-inf) = canonical NaN
3805       * frsqrt7(-normal) = canonical NaN
3806       * frsqrt7(-subnormal) = canonical NaN
3807       */
3808      if (float16_is_signaling_nan(f, s) ||
3809          (float16_is_infinity(f) && sign) ||
3810          (float16_is_normal(f) && sign) ||
3811          (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3812          s->float_exception_flags |= float_flag_invalid;
3813          return float16_default_nan(s);
3814      }
3815  
3816      /* frsqrt7(qNaN) = canonical NaN */
3817      if (float16_is_quiet_nan(f, s)) {
3818          return float16_default_nan(s);
3819      }
3820  
3821      /* frsqrt7(+-0) = +-inf */
3822      if (float16_is_zero(f)) {
3823          s->float_exception_flags |= float_flag_divbyzero;
3824          return float16_set_sign(float16_infinity, sign);
3825      }
3826  
3827      /* frsqrt7(+inf) = +0 */
3828      if (float16_is_infinity(f) && !sign) {
3829          return float16_set_sign(float16_zero, sign);
3830      }
3831  
3832      /* +normal, +subnormal */
3833      uint64_t val = frsqrt7(f, exp_size, frac_size);
3834      return make_float16(val);
3835  }
3836  
3837  static float32 frsqrt7_s(float32 f, float_status *s)
3838  {
3839      int exp_size = 8, frac_size = 23;
3840      bool sign = float32_is_neg(f);
3841  
3842      /*
3843       * frsqrt7(sNaN) = canonical NaN
3844       * frsqrt7(-inf) = canonical NaN
3845       * frsqrt7(-normal) = canonical NaN
3846       * frsqrt7(-subnormal) = canonical NaN
3847       */
3848      if (float32_is_signaling_nan(f, s) ||
3849          (float32_is_infinity(f) && sign) ||
3850          (float32_is_normal(f) && sign) ||
3851          (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3852          s->float_exception_flags |= float_flag_invalid;
3853          return float32_default_nan(s);
3854      }
3855  
3856      /* frsqrt7(qNaN) = canonical NaN */
3857      if (float32_is_quiet_nan(f, s)) {
3858          return float32_default_nan(s);
3859      }
3860  
3861      /* frsqrt7(+-0) = +-inf */
3862      if (float32_is_zero(f)) {
3863          s->float_exception_flags |= float_flag_divbyzero;
3864          return float32_set_sign(float32_infinity, sign);
3865      }
3866  
3867      /* frsqrt7(+inf) = +0 */
3868      if (float32_is_infinity(f) && !sign) {
3869          return float32_set_sign(float32_zero, sign);
3870      }
3871  
3872      /* +normal, +subnormal */
3873      uint64_t val = frsqrt7(f, exp_size, frac_size);
3874      return make_float32(val);
3875  }
3876  
3877  static float64 frsqrt7_d(float64 f, float_status *s)
3878  {
3879      int exp_size = 11, frac_size = 52;
3880      bool sign = float64_is_neg(f);
3881  
3882      /*
3883       * frsqrt7(sNaN) = canonical NaN
3884       * frsqrt7(-inf) = canonical NaN
3885       * frsqrt7(-normal) = canonical NaN
3886       * frsqrt7(-subnormal) = canonical NaN
3887       */
3888      if (float64_is_signaling_nan(f, s) ||
3889          (float64_is_infinity(f) && sign) ||
3890          (float64_is_normal(f) && sign) ||
3891          (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3892          s->float_exception_flags |= float_flag_invalid;
3893          return float64_default_nan(s);
3894      }
3895  
3896      /* frsqrt7(qNaN) = canonical NaN */
3897      if (float64_is_quiet_nan(f, s)) {
3898          return float64_default_nan(s);
3899      }
3900  
3901      /* frsqrt7(+-0) = +-inf */
3902      if (float64_is_zero(f)) {
3903          s->float_exception_flags |= float_flag_divbyzero;
3904          return float64_set_sign(float64_infinity, sign);
3905      }
3906  
3907      /* frsqrt7(+inf) = +0 */
3908      if (float64_is_infinity(f) && !sign) {
3909          return float64_set_sign(float64_zero, sign);
3910      }
3911  
3912      /* +normal, +subnormal */
3913      uint64_t val = frsqrt7(f, exp_size, frac_size);
3914      return make_float64(val);
3915  }
3916  
3917  RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3918  RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3919  RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3920  GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3921  GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3922  GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3923  
3924  /*
3925   * Vector Floating-Point Reciprocal Estimate Instruction
3926   *
3927   * Adapted from riscv-v-spec recip.c:
3928   * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3929   */
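/*
 * frec7() follows the same scheme as frsqrt7(): fields are unpacked,
 * subnormal inputs are normalized (reciprocals that would overflow return
 * +-inf or the largest finite value depending on the rounding mode), and a
 * 7-bit index of the top fraction bits selects the significand estimate.
 * The output exponent is 2 * bias - 1 - exp; estimates whose exponent
 * underflows are re-encoded as subnormals below.
 */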
3930  static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3931                        float_status *s)
3932  {
3933      uint64_t sign = extract64(f, frac_size + exp_size, 1);
3934      uint64_t exp = extract64(f, frac_size, exp_size);
3935      uint64_t frac = extract64(f, 0, frac_size);
3936  
3937      const uint8_t lookup_table[] = {
3938          127, 125, 123, 121, 119, 117, 116, 114,
3939          112, 110, 109, 107, 105, 104, 102, 100,
3940          99, 97, 96, 94, 93, 91, 90, 88,
3941          87, 85, 84, 83, 81, 80, 79, 77,
3942          76, 75, 74, 72, 71, 70, 69, 68,
3943          66, 65, 64, 63, 62, 61, 60, 59,
3944          58, 57, 56, 55, 54, 53, 52, 51,
3945          50, 49, 48, 47, 46, 45, 44, 43,
3946          42, 41, 40, 40, 39, 38, 37, 36,
3947          35, 35, 34, 33, 32, 31, 31, 30,
3948          29, 28, 28, 27, 26, 25, 25, 24,
3949          23, 23, 22, 21, 21, 20, 19, 19,
3950          18, 17, 17, 16, 15, 15, 14, 14,
3951          13, 12, 12, 11, 11, 10, 9, 9,
3952          8, 8, 7, 7, 6, 5, 5, 4,
3953          4, 3, 3, 2, 2, 1, 1, 0
3954      };
3955      const int precision = 7;
3956  
3957      if (exp == 0 && frac != 0) { /* subnormal */
3958          /* Normalize the subnormal. */
3959          while (extract64(frac, frac_size - 1, 1) == 0) {
3960              exp--;
3961              frac <<= 1;
3962          }
3963  
3964          frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3965  
3966          if (exp != 0 && exp != UINT64_MAX) {
3967              /*
3968               * Overflow to inf or max value of same sign,
3969               * depending on sign and rounding mode.
3970               */
3971              s->float_exception_flags |= (float_flag_inexact |
3972                                           float_flag_overflow);
3973  
3974              if ((s->float_rounding_mode == float_round_to_zero) ||
3975                  ((s->float_rounding_mode == float_round_down) && !sign) ||
3976                  ((s->float_rounding_mode == float_round_up) && sign)) {
3977                  /* Return greatest/negative finite value. */
3978                  return (sign << (exp_size + frac_size)) |
3979                         (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3980              } else {
3981                  /* Return +-inf. */
3982                  return (sign << (exp_size + frac_size)) |
3983                         MAKE_64BIT_MASK(frac_size, exp_size);
3984              }
3985          }
3986      }
3987  
3988      int idx = frac >> (frac_size - precision);
3989      uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3990                          (frac_size - precision);
3991      uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3992  
3993      if (out_exp == 0 || out_exp == UINT64_MAX) {
3994          /*
3995           * The result is subnormal, but don't raise the underflow exception,
3996           * because there's no additional loss of precision.
3997           */
3998          out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3999          if (out_exp == UINT64_MAX) {
4000              out_frac >>= 1;
4001              out_exp = 0;
4002          }
4003      }
4004  
4005      uint64_t val = 0;
4006      val = deposit64(val, 0, frac_size, out_frac);
4007      val = deposit64(val, frac_size, exp_size, out_exp);
4008      val = deposit64(val, frac_size + exp_size, 1, sign);
4009      return val;
4010  }
4011  
4012  static float16 frec7_h(float16 f, float_status *s)
4013  {
4014      int exp_size = 5, frac_size = 10;
4015      bool sign = float16_is_neg(f);
4016  
4017      /* frec7(+-inf) = +-0 */
4018      if (float16_is_infinity(f)) {
4019          return float16_set_sign(float16_zero, sign);
4020      }
4021  
4022      /* frec7(+-0) = +-inf */
4023      if (float16_is_zero(f)) {
4024          s->float_exception_flags |= float_flag_divbyzero;
4025          return float16_set_sign(float16_infinity, sign);
4026      }
4027  
4028      /* frec7(sNaN) = canonical NaN */
4029      if (float16_is_signaling_nan(f, s)) {
4030          s->float_exception_flags |= float_flag_invalid;
4031          return float16_default_nan(s);
4032      }
4033  
4034      /* frec7(qNaN) = canonical NaN */
4035      if (float16_is_quiet_nan(f, s)) {
4036          return float16_default_nan(s);
4037      }
4038  
4039      /* +-normal, +-subnormal */
4040      uint64_t val = frec7(f, exp_size, frac_size, s);
4041      return make_float16(val);
4042  }
4043  
4044  static float32 frec7_s(float32 f, float_status *s)
4045  {
4046      int exp_size = 8, frac_size = 23;
4047      bool sign = float32_is_neg(f);
4048  
4049      /* frec7(+-inf) = +-0 */
4050      if (float32_is_infinity(f)) {
4051          return float32_set_sign(float32_zero, sign);
4052      }
4053  
4054      /* frec7(+-0) = +-inf */
4055      if (float32_is_zero(f)) {
4056          s->float_exception_flags |= float_flag_divbyzero;
4057          return float32_set_sign(float32_infinity, sign);
4058      }
4059  
4060      /* frec7(sNaN) = canonical NaN */
4061      if (float32_is_signaling_nan(f, s)) {
4062          s->float_exception_flags |= float_flag_invalid;
4063          return float32_default_nan(s);
4064      }
4065  
4066      /* frec7(qNaN) = canonical NaN */
4067      if (float32_is_quiet_nan(f, s)) {
4068          return float32_default_nan(s);
4069      }
4070  
4071      /* +-normal, +-subnormal */
4072      uint64_t val = frec7(f, exp_size, frac_size, s);
4073      return make_float32(val);
4074  }
4075  
4076  static float64 frec7_d(float64 f, float_status *s)
4077  {
4078      int exp_size = 11, frac_size = 52;
4079      bool sign = float64_is_neg(f);
4080  
4081      /* frec7(+-inf) = +-0 */
4082      if (float64_is_infinity(f)) {
4083          return float64_set_sign(float64_zero, sign);
4084      }
4085  
4086      /* frec7(+-0) = +-inf */
4087      if (float64_is_zero(f)) {
4088          s->float_exception_flags |= float_flag_divbyzero;
4089          return float64_set_sign(float64_infinity, sign);
4090      }
4091  
4092      /* frec7(sNaN) = canonical NaN */
4093      if (float64_is_signaling_nan(f, s)) {
4094          s->float_exception_flags |= float_flag_invalid;
4095          return float64_default_nan(s);
4096      }
4097  
4098      /* frec7(qNaN) = canonical NaN */
4099      if (float64_is_quiet_nan(f, s)) {
4100          return float64_default_nan(s);
4101      }
4102  
4103      /* +-normal, +-subnormal */
4104      uint64_t val = frec7(f, exp_size, frac_size, s);
4105      return make_float64(val);
4106  }
4107  
4108  RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4109  RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4110  RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4111  GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4112  GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4113  GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4114  
4115  /* Vector Floating-Point MIN/MAX Instructions */
4116  RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4117  RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4118  RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4119  GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4120  GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4121  GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4122  RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4123  RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4124  RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4125  GEN_VEXT_VF(vfmin_vf_h, 2)
4126  GEN_VEXT_VF(vfmin_vf_w, 4)
4127  GEN_VEXT_VF(vfmin_vf_d, 8)
4128  
4129  RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4130  RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4131  RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4132  GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4133  GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4134  GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4135  RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4136  RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4137  RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4138  GEN_VEXT_VF(vfmax_vf_h, 2)
4139  GEN_VEXT_VF(vfmax_vf_w, 4)
4140  GEN_VEXT_VF(vfmax_vf_d, 8)
4141  
4142  /* Vector Floating-Point Sign-Injection Instructions */
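/*
 * The sign-injection helpers operate on the raw encodings: deposit64() keeps
 * the magnitude (all bits but the sign) from a and takes the sign bit from b,
 * either as-is (fsgnj), inverted (fsgnjn) or XORed with a's sign (fsgnjx).
 */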
4143  static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4144  {
4145      return deposit64(b, 0, 15, a);
4146  }
4147  
4148  static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4149  {
4150      return deposit64(b, 0, 31, a);
4151  }
4152  
4153  static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4154  {
4155      return deposit64(b, 0, 63, a);
4156  }
4157  
4158  RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4159  RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4160  RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4161  GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4162  GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4163  GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4164  RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4165  RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4166  RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4167  GEN_VEXT_VF(vfsgnj_vf_h, 2)
4168  GEN_VEXT_VF(vfsgnj_vf_w, 4)
4169  GEN_VEXT_VF(vfsgnj_vf_d, 8)
4170  
4171  static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4172  {
4173      return deposit64(~b, 0, 15, a);
4174  }
4175  
4176  static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4177  {
4178      return deposit64(~b, 0, 31, a);
4179  }
4180  
4181  static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4182  {
4183      return deposit64(~b, 0, 63, a);
4184  }
4185  
4186  RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4187  RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4188  RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4189  GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4190  GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4191  GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4192  RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4193  RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4194  RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4195  GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4196  GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4197  GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4198  
4199  static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4200  {
4201      return deposit64(b ^ a, 0, 15, a);
4202  }
4203  
4204  static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4205  {
4206      return deposit64(b ^ a, 0, 31, a);
4207  }
4208  
4209  static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4210  {
4211      return deposit64(b ^ a, 0, 63, a);
4212  }
4213  
4214  RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4215  RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4216  RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4217  GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4218  GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4219  GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4220  RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4221  RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4222  RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4223  GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4224  GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4225  GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4226  
4227  /* Vector Floating-Point Compare Instructions */
4228  #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4229  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4230                    CPURISCVState *env, uint32_t desc)          \
4231  {                                                             \
4232      uint32_t vm = vext_vm(desc);                              \
4233      uint32_t vl = env->vl;                                    \
4234      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;    \
4235      uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4236      uint32_t vma = vext_vma(desc);                            \
4237      uint32_t i;                                               \
4238                                                                \
4239      VSTART_CHECK_EARLY_EXIT(env);                             \
4240                                                                \
4241      for (i = env->vstart; i < vl; i++) {                      \
4242          ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4243          ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4244          if (!vm && !vext_elem_mask(v0, i)) {                  \
4245              /* set masked-off elements to 1s */               \
4246              if (vma) {                                        \
4247                  vext_set_elem_mask(vd, i, 1);                 \
4248              }                                                 \
4249              continue;                                         \
4250          }                                                     \
4251          vext_set_elem_mask(vd, i,                             \
4252                             DO_OP(s2, s1, &env->fp_status));   \
4253      }                                                         \
4254      env->vstart = 0;                                          \
4255      /*
4256       * mask destination register is always tail-agnostic
4257       * set tail elements to 1s
4258       */                                                       \
4259      if (vta_all_1s) {                                         \
4260          for (; i < total_elems; i++) {                        \
4261              vext_set_elem_mask(vd, i, 1);                     \
4262          }                                                     \
4263      }                                                         \
4264  }
4265  
4266  GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4267  GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4268  GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4269  
4270  #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4271  void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4272                    CPURISCVState *env, uint32_t desc)                \
4273  {                                                                   \
4274      uint32_t vm = vext_vm(desc);                                    \
4275      uint32_t vl = env->vl;                                          \
4276      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;          \
4277      uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4278      uint32_t vma = vext_vma(desc);                                  \
4279      uint32_t i;                                                     \
4280                                                                      \
4281      VSTART_CHECK_EARLY_EXIT(env);                                   \
4282                                                                      \
4283      for (i = env->vstart; i < vl; i++) {                            \
4284          ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4285          if (!vm && !vext_elem_mask(v0, i)) {                        \
4286              /* set masked-off elements to 1s */                     \
4287              if (vma) {                                              \
4288                  vext_set_elem_mask(vd, i, 1);                       \
4289              }                                                       \
4290              continue;                                               \
4291          }                                                           \
4292          vext_set_elem_mask(vd, i,                                   \
4293                             DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4294      }                                                               \
4295      env->vstart = 0;                                                \
4296      /*
4297       * mask destination register is always tail-agnostic
4298       * set tail elements to 1s
4299       */                                                             \
4300      if (vta_all_1s) {                                               \
4301          for (; i < total_elems; i++) {                              \
4302              vext_set_elem_mask(vd, i, 1);                           \
4303          }                                                           \
4304      }                                                               \
4305  }
4306  
4307  GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4308  GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4309  GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4310  
4311  static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4312  {
4313      FloatRelation compare = float16_compare_quiet(a, b, s);
4314      return compare != float_relation_equal;
4315  }
4316  
4317  static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4318  {
4319      FloatRelation compare = float32_compare_quiet(a, b, s);
4320      return compare != float_relation_equal;
4321  }
4322  
4323  static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4324  {
4325      FloatRelation compare = float64_compare_quiet(a, b, s);
4326      return compare != float_relation_equal;
4327  }
4328  
4329  GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4330  GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4331  GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4332  GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4333  GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4334  GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4335  
4336  GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4337  GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4338  GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4339  GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4340  GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4341  GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4342  
4343  GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4344  GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4345  GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4346  GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4347  GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4348  GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4349  
4350  static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4351  {
4352      FloatRelation compare = float16_compare(a, b, s);
4353      return compare == float_relation_greater;
4354  }
4355  
4356  static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4357  {
4358      FloatRelation compare = float32_compare(a, b, s);
4359      return compare == float_relation_greater;
4360  }
4361  
4362  static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4363  {
4364      FloatRelation compare = float64_compare(a, b, s);
4365      return compare == float_relation_greater;
4366  }
4367  
4368  GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4369  GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4370  GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4371  
4372  static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4373  {
4374      FloatRelation compare = float16_compare(a, b, s);
4375      return compare == float_relation_greater ||
4376             compare == float_relation_equal;
4377  }
4378  
4379  static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4380  {
4381      FloatRelation compare = float32_compare(a, b, s);
4382      return compare == float_relation_greater ||
4383             compare == float_relation_equal;
4384  }
4385  
4386  static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4387  {
4388      FloatRelation compare = float64_compare(a, b, s);
4389      return compare == float_relation_greater ||
4390             compare == float_relation_equal;
4391  }
4392  
4393  GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4394  GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4395  GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4396  
4397  /* Vector Floating-Point Classify Instruction */
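/*
 * fclass returns a one-hot value using the standard RISC-V encoding:
 * bit 0: -inf, 1: negative normal, 2: negative subnormal, 3: -0,
 * bit 4: +0, 5: positive subnormal, 6: positive normal, 7: +inf,
 * bit 8: signaling NaN, 9: quiet NaN.
 */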
4398  target_ulong fclass_h(uint64_t frs1)
4399  {
4400      float16 f = frs1;
4401      bool sign = float16_is_neg(f);
4402  
4403      if (float16_is_infinity(f)) {
4404          return sign ? 1 << 0 : 1 << 7;
4405      } else if (float16_is_zero(f)) {
4406          return sign ? 1 << 3 : 1 << 4;
4407      } else if (float16_is_zero_or_denormal(f)) {
4408          return sign ? 1 << 2 : 1 << 5;
4409      } else if (float16_is_any_nan(f)) {
4410          float_status s = { }; /* for snan_bit_is_one */
4411          return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4412      } else {
4413          return sign ? 1 << 1 : 1 << 6;
4414      }
4415  }
4416  
4417  target_ulong fclass_s(uint64_t frs1)
4418  {
4419      float32 f = frs1;
4420      bool sign = float32_is_neg(f);
4421  
4422      if (float32_is_infinity(f)) {
4423          return sign ? 1 << 0 : 1 << 7;
4424      } else if (float32_is_zero(f)) {
4425          return sign ? 1 << 3 : 1 << 4;
4426      } else if (float32_is_zero_or_denormal(f)) {
4427          return sign ? 1 << 2 : 1 << 5;
4428      } else if (float32_is_any_nan(f)) {
4429          float_status s = { }; /* for snan_bit_is_one */
4430          return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4431      } else {
4432          return sign ? 1 << 1 : 1 << 6;
4433      }
4434  }
4435  
4436  target_ulong fclass_d(uint64_t frs1)
4437  {
4438      float64 f = frs1;
4439      bool sign = float64_is_neg(f);
4440  
4441      if (float64_is_infinity(f)) {
4442          return sign ? 1 << 0 : 1 << 7;
4443      } else if (float64_is_zero(f)) {
4444          return sign ? 1 << 3 : 1 << 4;
4445      } else if (float64_is_zero_or_denormal(f)) {
4446          return sign ? 1 << 2 : 1 << 5;
4447      } else if (float64_is_any_nan(f)) {
4448          float_status s = { }; /* for snan_bit_is_one */
4449          return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4450      } else {
4451          return sign ? 1 << 1 : 1 << 6;
4452      }
4453  }
4454  
4455  RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4456  RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4457  RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4458  GEN_VEXT_V(vfclass_v_h, 2)
4459  GEN_VEXT_V(vfclass_v_w, 4)
4460  GEN_VEXT_V(vfclass_v_d, 8)
4461  
4462  /* Vector Floating-Point Merge Instruction */
4463  
4464  #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4465  void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4466                    CPURISCVState *env, uint32_t desc)          \
4467  {                                                             \
4468      uint32_t vm = vext_vm(desc);                              \
4469      uint32_t vl = env->vl;                                    \
4470      uint32_t esz = sizeof(ETYPE);                             \
4471      uint32_t total_elems =                                    \
4472          vext_get_total_elems(env, desc, esz);                 \
4473      uint32_t vta = vext_vta(desc);                            \
4474      uint32_t i;                                               \
4475                                                                \
4476      VSTART_CHECK_EARLY_EXIT(env);                             \
4477                                                                \
4478      for (i = env->vstart; i < vl; i++) {                      \
4479          ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4480          *((ETYPE *)vd + H(i)) =                               \
4481              (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4482      }                                                         \
4483      env->vstart = 0;                                          \
4484      /* set tail elements to 1s */                             \
4485      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4486  }
4487  
4488  GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4489  GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4490  GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4491  
4492  /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4493  /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4494  RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4495  RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4496  RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4497  GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4498  GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4499  GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4500  
4501  /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4502  RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4503  RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4504  RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4505  GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4506  GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4507  GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4508  
4509  /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4510  RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4511  RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4512  RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4513  GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4514  GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4515  GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4516  
4517  /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4518  RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4519  RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4520  RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4521  GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4522  GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4523  GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4524  
4525  /* Widening Floating-Point/Integer Type-Convert Instructions */
4526  /* (TD, T2, TX2) */
4527  #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4528  #define WOP_UU_H uint32_t, uint16_t, uint16_t
4529  #define WOP_UU_W uint64_t, uint32_t, uint32_t
4530  /*
4531   * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4532   */
4533  RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4534  RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4535  GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4536  GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4537  
4538  /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4539  RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4540  RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4541  GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4542  GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4543  
4544  /*
4545   * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4546   */
4547  RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4548  RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4549  RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4550  GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4551  GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4552  GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4553  
4554  /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4555  RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4556  RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4557  RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4558  GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4559  GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4560  GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4561  
4562  /*
4563   * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4564   */
4565  static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4566  {
4567      return float16_to_float32(a, true, s);
4568  }
4569  
4570  RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4571  RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4572  GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4573  GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4574  
4575  RVVCALL(OPFVV1, vfwcvtbf16_f_f_v, WOP_UU_H, H4, H2, bfloat16_to_float32)
4576  GEN_VEXT_V_ENV(vfwcvtbf16_f_f_v, 4)
4577  
4578  /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4579  /* (TD, T2, TX2) */
4580  #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4581  #define NOP_UU_H uint16_t, uint32_t, uint32_t
4582  #define NOP_UU_W uint32_t, uint64_t, uint64_t
4583  /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4584  RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4585  RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4586  RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4587  GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4588  GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4589  GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4590  
4591  /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4592  RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4593  RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4594  RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4595  GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4596  GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4597  GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4598  
4599  /*
4600   * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4601   */
4602  RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4603  RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4604  GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4605  GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4606  
4607  /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4608  RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4609  RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4610  GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4611  GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4612  
4613  /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4614  static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4615  {
4616      return float32_to_float16(a, true, s);
4617  }
4618  
4619  RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4620  RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4621  GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4622  GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4623  
4624  RVVCALL(OPFVV1, vfncvtbf16_f_f_w, NOP_UU_H, H2, H4, float32_to_bfloat16)
4625  GEN_VEXT_V_ENV(vfncvtbf16_f_f_w, 2)
4626  
4627  /*
4628   * Vector Reduction Operations
4629   */
4630  /* Vector Single-Width Integer Reduction Instructions */
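/*
 * All reductions share one shape: the accumulator starts as vs1[0], every
 * active element of vs2 is folded in with OP, and the single result is
 * written back to vd[0]; the remaining bytes of vd are treated as tail.
 */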
4631  #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4632  void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4633                    void *vs2, CPURISCVState *env,          \
4634                    uint32_t desc)                          \
4635  {                                                         \
4636      uint32_t vm = vext_vm(desc);                          \
4637      uint32_t vl = env->vl;                                \
4638      uint32_t esz = sizeof(TD);                            \
4639      uint32_t vlenb = simd_maxsz(desc);                    \
4640      uint32_t vta = vext_vta(desc);                        \
4641      uint32_t i;                                           \
4642      TD s1 =  *((TD *)vs1 + HD(0));                        \
4643                                                            \
4644      for (i = env->vstart; i < vl; i++) {                  \
4645          TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4646          if (!vm && !vext_elem_mask(v0, i)) {              \
4647              continue;                                     \
4648          }                                                 \
4649          s1 = OP(s1, (TD)s2);                              \
4650      }                                                     \
4651      *((TD *)vd + HD(0)) = s1;                             \
4652      env->vstart = 0;                                      \
4653      /* set tail elements to 1s */                         \
4654      vext_set_elems_1s(vd, vta, esz, vlenb);               \
4655  }
4656  
4657  /* vd[0] = sum(vs1[0], vs2[*]) */
4658  GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4659  GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4660  GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4661  GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4662  
4663  /* vd[0] = maxu(vs1[0], vs2[*]) */
4664  GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4665  GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4666  GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4667  GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4668  
4669  /* vd[0] = max(vs1[0], vs2[*]) */
4670  GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4671  GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4672  GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4673  GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4674  
4675  /* vd[0] = minu(vs1[0], vs2[*]) */
4676  GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4677  GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4678  GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4679  GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4680  
4681  /* vd[0] = min(vs1[0], vs2[*]) */
4682  GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4683  GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4684  GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4685  GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4686  
4687  /* vd[0] = and(vs1[0], vs2[*]) */
4688  GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4689  GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4690  GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4691  GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4692  
4693  /* vd[0] = or(vs1[0], vs2[*]) */
4694  GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4695  GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4696  GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4697  GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4698  
4699  /* vd[0] = xor(vs1[0], vs2[*]) */
4700  GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4701  GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4702  GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4703  GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4704  
4705  /* Vector Widening Integer Reduction Instructions */
4706  /* signed sum reduction into double-width accumulator */
4707  GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4708  GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4709  GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4710  
4711  /* Unsigned sum reduction into double-width accumulator */
4712  GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4713  GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4714  GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4715  
4716  /* Vector Single-Width Floating-Point Reduction Instructions */
4717  #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4718  void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4719                    void *vs2, CPURISCVState *env,           \
4720                    uint32_t desc)                           \
4721  {                                                          \
4722      uint32_t vm = vext_vm(desc);                           \
4723      uint32_t vl = env->vl;                                 \
4724      uint32_t esz = sizeof(TD);                             \
4725      uint32_t vlenb = simd_maxsz(desc);                     \
4726      uint32_t vta = vext_vta(desc);                         \
4727      uint32_t i;                                            \
4728      TD s1 =  *((TD *)vs1 + HD(0));                         \
4729                                                             \
4730      for (i = env->vstart; i < vl; i++) {                   \
4731          TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4732          if (!vm && !vext_elem_mask(v0, i)) {               \
4733              continue;                                      \
4734          }                                                  \
4735          s1 = OP(s1, (TD)s2, &env->fp_status);              \
4736      }                                                      \
4737      *((TD *)vd + HD(0)) = s1;                              \
4738      env->vstart = 0;                                       \
4739      /* set tail elements to 1s */                          \
4740      vext_set_elems_1s(vd, vta, esz, vlenb);                \
4741  }
4742  
4743  /* Unordered sum */
4744  GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4745  GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4746  GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4747  
4748  /* Ordered sum */
4749  GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4750  GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4751  GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4752  
4753  /* Maximum value */
4754  GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4755                float16_maximum_number)
4756  GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4757                float32_maximum_number)
4758  GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4759                float64_maximum_number)
4760  
4761  /* Minimum value */
4762  GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4763                float16_minimum_number)
4764  GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4765                float32_minimum_number)
4766  GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4767                float64_minimum_number)
4768  
4769  /* Vector Widening Floating-Point Add Instructions */
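/*
 * The accumulator (a) is already the widened type; only the SEW-wide source
 * element (b) is converted up, which is exact, before the add.
 */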
4770  static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4771  {
4772      return float32_add(a, float16_to_float32(b, true, s), s);
4773  }
4774  
4775  static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4776  {
4777      return float64_add(a, float32_to_float64(b, s), s);
4778  }
4779  
4780  /* Vector Widening Floating-Point Reduction Instructions */
4781  /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4782  GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4783  GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4784  GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4785  GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4786  
4787  /*
4788   * Vector Mask Operations
4789   */
4790  /* Vector Mask-Register Logical Instructions */
4791  #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4792  void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4793                    void *vs2, CPURISCVState *env,          \
4794                    uint32_t desc)                          \
4795  {                                                         \
4796      uint32_t vl = env->vl;                                \
4797      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;\
4798      uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4799      uint32_t i;                                           \
4800      int a, b;                                             \
4801                                                            \
4802      VSTART_CHECK_EARLY_EXIT(env);                         \
4803                                                            \
4804      for (i = env->vstart; i < vl; i++) {                  \
4805          a = vext_elem_mask(vs1, i);                       \
4806          b = vext_elem_mask(vs2, i);                       \
4807          vext_set_elem_mask(vd, i, OP(b, a));              \
4808      }                                                     \
4809      env->vstart = 0;                                      \
4810      /*
4811       * mask destination register is always tail-agnostic
4812       * set tail elements to 1s
4813       */                                                   \
4814      if (vta_all_1s) {                                     \
4815          for (; i < total_elems; i++) {                    \
4816              vext_set_elem_mask(vd, i, 1);                 \
4817          }                                                 \
4818      }                                                     \
4819  }
4820  
4821  #define DO_NAND(N, M)  (!(N & M))
4822  #define DO_ANDNOT(N, M)  (N & !M)
4823  #define DO_NOR(N, M)  (!(N | M))
4824  #define DO_ORNOT(N, M)  (N | !M)
4825  #define DO_XNOR(N, M)  (!(N ^ M))
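/*
 * The operands here are single mask bits (0 or 1) as returned by
 * vext_elem_mask(), so mixing logical (!) and bitwise (&, |, ^) operators in
 * the DO_* helpers above yields the intended one-bit results.
 */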
4826  
4827  GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4828  GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4829  GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4830  GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4831  GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4832  GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4833  GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4834  GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4835  
4836  /* Vector count population in mask vcpop */
4837  target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4838                               uint32_t desc)
4839  {
4840      target_ulong cnt = 0;
4841      uint32_t vm = vext_vm(desc);
4842      uint32_t vl = env->vl;
4843      int i;
4844  
4845      for (i = env->vstart; i < vl; i++) {
4846          if (vm || vext_elem_mask(v0, i)) {
4847              if (vext_elem_mask(vs2, i)) {
4848                  cnt++;
4849              }
4850          }
4851      }
4852      env->vstart = 0;
4853      return cnt;
4854  }
4855  
4856  /* vfirst find-first-set mask bit */
4857  target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4858                                uint32_t desc)
4859  {
4860      uint32_t vm = vext_vm(desc);
4861      uint32_t vl = env->vl;
4862      int i;
4863  
4864      for (i = env->vstart; i < vl; i++) {
4865          if (vm || vext_elem_mask(v0, i)) {
4866              if (vext_elem_mask(vs2, i)) {
4867                  return i;
4868              }
4869          }
4870      }
4871      env->vstart = 0;
4872      return -1LL;
4873  }
4874  
4875  enum set_mask_type {
4876      ONLY_FIRST = 1,
4877      INCLUDE_FIRST,
4878      BEFORE_FIRST,
4879  };
4880  
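/*
 * vmsetm() implements the three set-mask instructions: BEFORE_FIRST is
 * vmsbf.m (set active bits before the first set bit), INCLUDE_FIRST is
 * vmsif.m (set active bits up to and including the first set bit) and
 * ONLY_FIRST is vmsof.m (set only the first set bit).
 */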
4881  static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4882                     uint32_t desc, enum set_mask_type type)
4883  {
4884      uint32_t vm = vext_vm(desc);
4885      uint32_t vl = env->vl;
4886      uint32_t total_elems = riscv_cpu_cfg(env)->vlenb << 3;
4887      uint32_t vta_all_1s = vext_vta_all_1s(desc);
4888      uint32_t vma = vext_vma(desc);
4889      int i;
4890      bool first_mask_bit = false;
4891  
4892      for (i = env->vstart; i < vl; i++) {
4893          if (!vm && !vext_elem_mask(v0, i)) {
4894              /* set masked-off elements to 1s */
4895              if (vma) {
4896                  vext_set_elem_mask(vd, i, 1);
4897              }
4898              continue;
4899          }
4900          /* write a zero to all following active elements */
4901          if (first_mask_bit) {
4902              vext_set_elem_mask(vd, i, 0);
4903              continue;
4904          }
4905          if (vext_elem_mask(vs2, i)) {
4906              first_mask_bit = true;
4907              if (type == BEFORE_FIRST) {
4908                  vext_set_elem_mask(vd, i, 0);
4909              } else {
4910                  vext_set_elem_mask(vd, i, 1);
4911              }
4912          } else {
4913              if (type == ONLY_FIRST) {
4914                  vext_set_elem_mask(vd, i, 0);
4915              } else {
4916                  vext_set_elem_mask(vd, i, 1);
4917              }
4918          }
4919      }
4920      env->vstart = 0;
4921      /*
4922       * mask destination register is always tail-agnostic
4923       * set tail elements to 1s
4924       */
4925      if (vta_all_1s) {
4926          for (; i < total_elems; i++) {
4927              vext_set_elem_mask(vd, i, 1);
4928          }
4929      }
4930  }
4931  
4932  void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4933                       uint32_t desc)
4934  {
4935      vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4936  }
4937  
4938  void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4939                       uint32_t desc)
4940  {
4941      vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4942  }
4943  
4944  void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4945                       uint32_t desc)
4946  {
4947      vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4948  }
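
/*
 * For example, with all elements active and vs2 mask bits {0,0,1,0,1,...}:
 *   vmsbf.m (set-before-first)    -> vd = {1,1,0,0,0,...}
 *   vmsif.m (set-including-first) -> vd = {1,1,1,0,0,...}
 *   vmsof.m (set-only-first)      -> vd = {0,0,1,0,0,...}
 */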
4949  
4950  /* Vector Iota Instruction */
4951  #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4952  void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4953                    uint32_t desc)                                          \
4954  {                                                                         \
4955      uint32_t vm = vext_vm(desc);                                          \
4956      uint32_t vl = env->vl;                                                \
4957      uint32_t esz = sizeof(ETYPE);                                         \
4958      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4959      uint32_t vta = vext_vta(desc);                                        \
4960      uint32_t vma = vext_vma(desc);                                        \
4961      uint32_t sum = 0;                                                     \
4962      int i;                                                                \
4963                                                                            \
4964      for (i = env->vstart; i < vl; i++) {                                  \
4965          if (!vm && !vext_elem_mask(v0, i)) {                              \
4966              /* set masked-off elements to 1s */                           \
4967              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4968              continue;                                                     \
4969          }                                                                 \
4970          *((ETYPE *)vd + H(i)) = sum;                                      \
4971          if (vext_elem_mask(vs2, i)) {                                     \
4972              sum++;                                                        \
4973          }                                                                 \
4974      }                                                                     \
4975      env->vstart = 0;                                                      \
4976      /* set tail elements to 1s */                                         \
4977      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4978  }
4979  
4980  GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4981  GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4982  GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4983  GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
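
/*
 * viota.m writes to each active element the number of set mask bits of
 * vs2 at lower element indices, e.g. vs2 = {1,0,1,1,0} (element 0 first)
 * gives vd = {0,1,1,2,3} when all elements are active.
 */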
4984  
4985  /* Vector Element Index Instruction */
4986  #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4987  void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4988  {                                                                         \
4989      uint32_t vm = vext_vm(desc);                                          \
4990      uint32_t vl = env->vl;                                                \
4991      uint32_t esz = sizeof(ETYPE);                                         \
4992      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4993      uint32_t vta = vext_vta(desc);                                        \
4994      uint32_t vma = vext_vma(desc);                                        \
4995      int i;                                                                \
4996                                                                            \
4997      VSTART_CHECK_EARLY_EXIT(env);                                         \
4998                                                                            \
4999      for (i = env->vstart; i < vl; i++) {                                  \
5000          if (!vm && !vext_elem_mask(v0, i)) {                              \
5001              /* set masked-off elements to 1s */                           \
5002              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5003              continue;                                                     \
5004          }                                                                 \
5005          *((ETYPE *)vd + H(i)) = i;                                        \
5006      }                                                                     \
5007      env->vstart = 0;                                                      \
5008      /* set tail elements to 1s */                                         \
5009      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5010  }
5011  
5012  GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
5013  GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
5014  GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
5015  GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
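
/* vid.v simply writes its own index to every active element: vd[i] = i. */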
5016  
5017  /*
5018   * Vector Permutation Instructions
5019   */
5020  
5021  /* Vector Slide Instructions */
5022  #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
5023  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5024                    CPURISCVState *env, uint32_t desc)                      \
5025  {                                                                         \
5026      uint32_t vm = vext_vm(desc);                                          \
5027      uint32_t vl = env->vl;                                                \
5028      uint32_t esz = sizeof(ETYPE);                                         \
5029      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5030      uint32_t vta = vext_vta(desc);                                        \
5031      uint32_t vma = vext_vma(desc);                                        \
5032      target_ulong offset = s1, i_min, i;                                   \
5033                                                                            \
5034      VSTART_CHECK_EARLY_EXIT(env);                                         \
5035                                                                            \
5036      i_min = MAX(env->vstart, offset);                                     \
5037      for (i = i_min; i < vl; i++) {                                        \
5038          if (!vm && !vext_elem_mask(v0, i)) {                              \
5039              /* set masked-off elements to 1s */                           \
5040              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5041              continue;                                                     \
5042          }                                                                 \
5043          *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5044      }                                                                     \
5045      env->vstart = 0;                                                      \
5046      /* set tail elements to 1s */                                         \
5047      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5048  }
5049  
5050  /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5051  GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5052  GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5053  GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5054  GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
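
/*
 * For example, vslideup.vx with x[rs1] = 2 and vl = 6 leaves vd[0] and
 * vd[1] unchanged and copies vd[2] = vs2[0], vd[3] = vs2[1], ...,
 * vd[5] = vs2[3] for the active elements.
 */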
5055  
5056  #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5057  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5058                    CPURISCVState *env, uint32_t desc)                      \
5059  {                                                                         \
5060      uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5061      uint32_t vm = vext_vm(desc);                                          \
5062      uint32_t vl = env->vl;                                                \
5063      uint32_t esz = sizeof(ETYPE);                                         \
5064      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5065      uint32_t vta = vext_vta(desc);                                        \
5066      uint32_t vma = vext_vma(desc);                                        \
5067      target_ulong i_max, i_min, i;                                         \
5068                                                                            \
5069      VSTART_CHECK_EARLY_EXIT(env);                                         \
5070                                                                            \
5071      i_min = MIN(s1 < vlmax ? vlmax - s1 : 0, vl);                         \
5072      i_max = MAX(i_min, env->vstart);                                      \
5073      for (i = env->vstart; i < i_max; ++i) {                               \
5074          if (!vm && !vext_elem_mask(v0, i)) {                              \
5075              /* set masked-off elements to 1s */                           \
5076              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5077              continue;                                                     \
5078          }                                                                 \
5079          *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5080      }                                                                     \
5081                                                                            \
5082      for (i = i_max; i < vl; ++i) {                                        \
5083          if (vm || vext_elem_mask(v0, i)) {                                \
5084              *((ETYPE *)vd + H(i)) = 0;                                    \
5085          }                                                                 \
5086      }                                                                     \
5087                                                                            \
5088      env->vstart = 0;                                                      \
5089      /* set tail elements to 1s */                                         \
5090      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5091  }
5092  
5093  /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5094  GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5095  GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5096  GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5097  GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
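
/*
 * For example, with vlmax = 8, vl = 6 and x[rs1] = 5, the active elements
 * vd[0..2] receive vs2[5..7] and the remaining active elements vd[3..5]
 * are written with zero because their source index would reach past vlmax.
 */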
5098  
5099  #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5100  static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5101                                   void *vs2, CPURISCVState *env,             \
5102                                   uint32_t desc)                             \
5103  {                                                                           \
5104      typedef uint##BITWIDTH##_t ETYPE;                                       \
5105      uint32_t vm = vext_vm(desc);                                            \
5106      uint32_t vl = env->vl;                                                  \
5107      uint32_t esz = sizeof(ETYPE);                                           \
5108      uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5109      uint32_t vta = vext_vta(desc);                                          \
5110      uint32_t vma = vext_vma(desc);                                          \
5111      uint32_t i;                                                             \
5112                                                                              \
5113      VSTART_CHECK_EARLY_EXIT(env);                                           \
5114                                                                              \
5115      for (i = env->vstart; i < vl; i++) {                                    \
5116          if (!vm && !vext_elem_mask(v0, i)) {                                \
5117              /* set masked-off elements to 1s */                             \
5118              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5119              continue;                                                       \
5120          }                                                                   \
5121          if (i == 0) {                                                       \
5122              *((ETYPE *)vd + H(i)) = s1;                                     \
5123          } else {                                                            \
5124              *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5125          }                                                                   \
5126      }                                                                       \
5127      env->vstart = 0;                                                        \
5128      /* set tail elements to 1s */                                           \
5129      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5130  }
5131  
5132  GEN_VEXT_VSLIDE1UP(8,  H1)
5133  GEN_VEXT_VSLIDE1UP(16, H2)
5134  GEN_VEXT_VSLIDE1UP(32, H4)
5135  GEN_VEXT_VSLIDE1UP(64, H8)
5136  
5137  #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5138  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5139                    CPURISCVState *env, uint32_t desc)              \
5140  {                                                                 \
5141      vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5142  }
5143  
5144  /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5145  GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5146  GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5147  GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5148  GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
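
/*
 * For example, with vl = 4, vslide1up.vx produces
 * vd = {x[rs1], vs2[0], vs2[1], vs2[2]} for the active elements.
 */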
5149  
5150  #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5151  static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5152                                     void *vs2, CPURISCVState *env,             \
5153                                     uint32_t desc)                             \
5154  {                                                                             \
5155      typedef uint##BITWIDTH##_t ETYPE;                                         \
5156      uint32_t vm = vext_vm(desc);                                              \
5157      uint32_t vl = env->vl;                                                    \
5158      uint32_t esz = sizeof(ETYPE);                                             \
5159      uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5160      uint32_t vta = vext_vta(desc);                                            \
5161      uint32_t vma = vext_vma(desc);                                            \
5162      uint32_t i;                                                               \
5163                                                                                \
5164      VSTART_CHECK_EARLY_EXIT(env);                                             \
5165                                                                                \
5166      for (i = env->vstart; i < vl; i++) {                                      \
5167          if (!vm && !vext_elem_mask(v0, i)) {                                  \
5168              /* set masked-off elements to 1s */                               \
5169              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5170              continue;                                                         \
5171          }                                                                     \
5172          if (i == vl - 1) {                                                    \
5173              *((ETYPE *)vd + H(i)) = s1;                                       \
5174          } else {                                                              \
5175              *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5176          }                                                                     \
5177      }                                                                         \
5178      env->vstart = 0;                                                          \
5179      /* set tail elements to 1s */                                             \
5180      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5181  }
5182  
5183  GEN_VEXT_VSLIDE1DOWN(8,  H1)
5184  GEN_VEXT_VSLIDE1DOWN(16, H2)
5185  GEN_VEXT_VSLIDE1DOWN(32, H4)
5186  GEN_VEXT_VSLIDE1DOWN(64, H8)
5187  
5188  #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5189  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5190                    CPURISCVState *env, uint32_t desc)              \
5191  {                                                                 \
5192      vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5193  }
5194  
5195  /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5196  GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5197  GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5198  GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5199  GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
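
/*
 * For example, with vl = 4, vslide1down.vx produces
 * vd = {vs2[1], vs2[2], vs2[3], x[rs1]} for the active elements.
 */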
5200  
5201  /* Vector Floating-Point Slide Instructions */
5202  #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5203  void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5204                    CPURISCVState *env, uint32_t desc)          \
5205  {                                                             \
5206      vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5207  }
5208  
5209  /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5210  GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5211  GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5212  GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5213  
5214  #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5215  void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5216                    CPURISCVState *env, uint32_t desc)          \
5217  {                                                             \
5218      vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5219  }
5220  
5221  /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5222  GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5223  GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5224  GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5225  
5226  /* Vector Register Gather Instruction */
5227  #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5228  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5229                    CPURISCVState *env, uint32_t desc)                      \
5230  {                                                                         \
5231      uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5232      uint32_t vm = vext_vm(desc);                                          \
5233      uint32_t vl = env->vl;                                                \
5234      uint32_t esz = sizeof(TS2);                                           \
5235      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5236      uint32_t vta = vext_vta(desc);                                        \
5237      uint32_t vma = vext_vma(desc);                                        \
5238      uint64_t index;                                                       \
5239      uint32_t i;                                                           \
5240                                                                            \
5241      VSTART_CHECK_EARLY_EXIT(env);                                         \
5242                                                                            \
5243      for (i = env->vstart; i < vl; i++) {                                  \
5244          if (!vm && !vext_elem_mask(v0, i)) {                              \
5245              /* set masked-off elements to 1s */                           \
5246              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5247              continue;                                                     \
5248          }                                                                 \
5249          index = *((TS1 *)vs1 + HS1(i));                                   \
5250          if (index >= vlmax) {                                             \
5251              *((TS2 *)vd + HS2(i)) = 0;                                    \
5252          } else {                                                          \
5253              *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5254          }                                                                 \
5255      }                                                                     \
5256      env->vstart = 0;                                                      \
5257      /* set tail elements to 1s */                                         \
5258      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5259  }
5260  
5261  /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5262  GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5263  GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5264  GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5265  GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5266  
5267  GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5268  GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5269  GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5270  GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
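
/*
 * For example, with vlmax = 4 and index vector vs1 = {3,0,9,1},
 * vrgather.vv produces vd = {vs2[3], vs2[0], 0, vs2[1]} for the active
 * elements; the out-of-range index 9 reads as zero.
 */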
5271  
5272  #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5273  void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5274                    CPURISCVState *env, uint32_t desc)                      \
5275  {                                                                         \
5276      uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5277      uint32_t vm = vext_vm(desc);                                          \
5278      uint32_t vl = env->vl;                                                \
5279      uint32_t esz = sizeof(ETYPE);                                         \
5280      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5281      uint32_t vta = vext_vta(desc);                                        \
5282      uint32_t vma = vext_vma(desc);                                        \
5283      uint64_t index = s1;                                                  \
5284      uint32_t i;                                                           \
5285                                                                            \
5286      VSTART_CHECK_EARLY_EXIT(env);                                         \
5287                                                                            \
5288      for (i = env->vstart; i < vl; i++) {                                  \
5289          if (!vm && !vext_elem_mask(v0, i)) {                              \
5290              /* set masked-off elements to 1s */                           \
5291              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5292              continue;                                                     \
5293          }                                                                 \
5294          if (index >= vlmax) {                                             \
5295              *((ETYPE *)vd + H(i)) = 0;                                    \
5296          } else {                                                          \
5297              *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5298          }                                                                 \
5299      }                                                                     \
5300      env->vstart = 0;                                                      \
5301      /* set tail elements to 1s */                                         \
5302      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5303  }
5304  
5305  /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5306  GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5307  GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5308  GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5309  GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
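
/*
 * vrgather.vx broadcasts a single source element: every active vd[i]
 * becomes vs2[x[rs1]], or zero when x[rs1] >= vlmax.
 */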
5310  
5311  /* Vector Compress Instruction */
5312  #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5313  void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5314                    CPURISCVState *env, uint32_t desc)                      \
5315  {                                                                         \
5316      uint32_t vl = env->vl;                                                \
5317      uint32_t esz = sizeof(ETYPE);                                         \
5318      uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5319      uint32_t vta = vext_vta(desc);                                        \
5320      uint32_t num = 0, i;                                                  \
5321                                                                            \
5322      for (i = env->vstart; i < vl; i++) {                                  \
5323          if (!vext_elem_mask(vs1, i)) {                                    \
5324              continue;                                                     \
5325          }                                                                 \
5326          *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5327          num++;                                                            \
5328      }                                                                     \
5329      env->vstart = 0;                                                      \
5330      /* set tail elements to 1s */                                         \
5331      vext_set_elems_1s(vd, vta, num * esz, total_elems * esz);             \
5332  }
5333  
5334  /* Compress into vd the elements of vs2 whose corresponding vs1 mask bit is set */
5335  GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5336  GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5337  GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5338  GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
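
/*
 * For example, with vl = 4, vs1 mask bits {1,0,1,1} and
 * vs2 = {a,b,c,d}, vcompress.vm packs the selected elements into
 * vd = {a,c,d,...}; elements past the packed ones are treated as tail.
 */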
5339  
5340  /* Vector Whole Register Move */
5341  void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5342  {
5343      /* EEW = SEW */
5344      uint32_t maxsz = simd_maxsz(desc);
5345      uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5346      uint32_t startb = env->vstart * sewb;
5347      uint32_t i = startb;
5348  
5349      if (startb >= maxsz) {
5350          env->vstart = 0;
5351          return;
5352      }
5353  
5354      if (HOST_BIG_ENDIAN && i % 8 != 0) {
5355          uint32_t j = ROUND_UP(i, 8);
5356          memcpy((uint8_t *)vd + H1(j - 1),
5357                 (uint8_t *)vs2 + H1(j - 1),
5358                 j - i);
5359          i = j;
5360      }
5361  
5362      memcpy((uint8_t *)vd + H1(i),
5363             (uint8_t *)vs2 + H1(i),
5364             maxsz - i);
5365  
5366      env->vstart = 0;
5367  }
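
/*
 * startb is the resume point in bytes derived from vstart, and maxsz is
 * the byte size of the whole register group, so the helper copies the
 * remaining maxsz - startb bytes.  On big-endian hosts the leading bytes
 * of a partially covered 8-byte lane are copied separately first, since
 * H1() swaps byte order within each 64-bit host lane.
 */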
5368  
5369  /* Vector Integer Extension */
5370  #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5371  void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5372                    CPURISCVState *env, uint32_t desc)             \
5373  {                                                                \
5374      uint32_t vl = env->vl;                                       \
5375      uint32_t vm = vext_vm(desc);                                 \
5376      uint32_t esz = sizeof(ETYPE);                                \
5377      uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5378      uint32_t vta = vext_vta(desc);                               \
5379      uint32_t vma = vext_vma(desc);                               \
5380      uint32_t i;                                                  \
5381                                                                   \
5382      VSTART_CHECK_EARLY_EXIT(env);                                \
5383                                                                   \
5384      for (i = env->vstart; i < vl; i++) {                         \
5385          if (!vm && !vext_elem_mask(v0, i)) {                     \
5386              /* set masked-off elements to 1s */                  \
5387              vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5388              continue;                                            \
5389          }                                                        \
5390          *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5391      }                                                            \
5392      env->vstart = 0;                                             \
5393      /* set tail elements to 1s */                                \
5394      vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5395  }
5396  
5397  GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5398  GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5399  GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5400  GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5401  GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5402  GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5403  
5404  GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5405  GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5406  GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5407  GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5408  GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5409  GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
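
/*
 * For example, vzext.vf2 at SEW = 16 zero-extends each 8-bit source
 * element (0xf0 -> 0x00f0), while vsext.vf2 sign-extends it
 * (0xf0 -> 0xfff0).
 */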
5410