xref: /openbmc/qemu/target/riscv/vector_helper.c (revision 7b945bdc)
1 /*
2  * RISC-V Vector Extension Helpers for QEMU.
3  *
4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30 
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32                             target_ulong s2)
33 {
34     int vlmax, vl;
35     RISCVCPU *cpu = env_archcpu(env);
36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39     int xlen = riscv_cpu_xlen(env);
40     bool vill = (s2 >> (xlen - 1)) & 0x1;
41     target_ulong reserved = s2 &
42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44 
45     if (lmul & 4) {
46         /* Fractional LMUL. */
47         if (lmul == 4 ||
48             cpu->cfg.elen >> (8 - lmul) < sew) {
49             vill = true;
50         }
51     }
52 
53     if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
54         /* only set vill bit. */
55         env->vill = 1;
56         env->vtype = 0;
57         env->vl = 0;
58         env->vstart = 0;
59         return 0;
60     }
61 
62     vlmax = vext_get_vlmax(cpu, s2);
63     if (s1 <= vlmax) {
64         vl = s1;
65     } else {
66         vl = vlmax;
67     }
68     env->vl = vl;
69     env->vtype = s2;
70     env->vstart = 0;
71     env->vill = 0;
72     return vl;
73 }
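/*
 * Illustrative example (not from the original source): with VLEN = 128 and
 * vtype requesting SEW = 32, LMUL = 2, VLMAX = LMUL * VLEN / SEW = 8.
 * A requested AVL of s1 = 10 is then clamped to vl = 8, while s1 = 5
 * leaves vl = 5. An unsupported SEW (sew > cpu->cfg.elen) or any reserved
 * vtype bit instead sets vill and zeroes vl/vtype as above.
 */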
74 
75 /*
76  * Note that vector data is stored in host-endian 64-bit chunks,
77  * so addressing units smaller than that needs a host-endian fixup.
78  */
79 #if HOST_BIG_ENDIAN
80 #define H1(x)   ((x) ^ 7)
81 #define H1_2(x) ((x) ^ 6)
82 #define H1_4(x) ((x) ^ 4)
83 #define H2(x)   ((x) ^ 3)
84 #define H4(x)   ((x) ^ 1)
85 #define H8(x)   ((x))
86 #else
87 #define H1(x)   (x)
88 #define H1_2(x) (x)
89 #define H1_4(x) (x)
90 #define H2(x)   (x)
91 #define H4(x)   (x)
92 #define H8(x)   (x)
93 #endif
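/*
 * Illustrative example (not from the original source): element 0 of a
 * 64-bit chunk occupies bits [7:0] of that chunk, which on a big-endian
 * host live at byte offset 7, so H1(0) == 7. Likewise H2(1) == 2 picks
 * the host halfword holding 16-bit element 1, while H8(x) is the identity
 * because a full 64-bit access needs no fixup.
 */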
94 
95 static inline uint32_t vext_nf(uint32_t desc)
96 {
97     return FIELD_EX32(simd_data(desc), VDATA, NF);
98 }
99 
100 static inline uint32_t vext_vm(uint32_t desc)
101 {
102     return FIELD_EX32(simd_data(desc), VDATA, VM);
103 }
104 
105 /*
106  * Encode LMUL to lmul as following:
107  *     LMUL    vlmul    lmul
108  *      1       000       0
109  *      2       001       1
110  *      4       010       2
111  *      8       011       3
112  *      -       100       -
113  *     1/8      101      -3
114  *     1/4      110      -2
115  *     1/2      111      -1
116  */
117 static inline int32_t vext_lmul(uint32_t desc)
118 {
119     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
120 }
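/*
 * Illustrative example (not from the original source): the three vlmul
 * bits are sign-extended, so sextract32(0b001, 0, 3) == 1 (LMUL = 2)
 * while sextract32(0b101, 0, 3) == -3 (LMUL = 1/8); the returned value
 * is therefore log2(LMUL).
 */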
121 
122 static inline uint32_t vext_vta(uint32_t desc)
123 {
124     return FIELD_EX32(simd_data(desc), VDATA, VTA);
125 }
126 
127 static inline uint32_t vext_vma(uint32_t desc)
128 {
129     return FIELD_EX32(simd_data(desc), VDATA, VMA);
130 }
131 
132 static inline uint32_t vext_vta_all_1s(uint32_t desc)
133 {
134     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
135 }
136 
137 /*
138  * Get the maximum number of elements that can be operated on.
139  *
140  * log2_esz: log2 of element size in bytes.
141  */
142 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
143 {
144     /*
145      * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
146      * so vlen in bytes (vlenb) is encoded as maxsz.
147      */
148     uint32_t vlenb = simd_maxsz(desc);
149 
150     /* Return VLMAX */
151     int scale = vext_lmul(desc) - log2_esz;
152     return scale < 0 ? vlenb >> -scale : vlenb << scale;
153 }
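/*
 * Illustrative example (not from the original source): with vlenb = 16
 * (VLEN = 128), SEW = 16 (log2_esz = 1) and LMUL = 4 (vext_lmul == 2),
 * scale = 2 - 1 = 1 and VLMAX = 16 << 1 = 32 = LMUL * VLEN / SEW.
 * With LMUL = 1/2 (vext_lmul == -1) and SEW = 32, scale = -3 and
 * VLMAX = 16 >> 3 = 2.
 */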
154 
155 /*
156  * Get the total number of elements, including prestart, body and tail elements.
157  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
158  * are held in the same vector register.
159  */
160 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
161                                             uint32_t esz)
162 {
163     uint32_t vlenb = simd_maxsz(desc);
164     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
165     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
166                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
167     return (vlenb << emul) / esz;
168 }
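/*
 * Illustrative example (not from the original source): with vlenb = 16,
 * SEW = 32 (sew = 4 bytes) and LMUL = 1, an access with esz = 4 gives
 * emul = 0 and 16 / 4 = 4 total elements (one full register).  With
 * esz = 1 the raw EMUL exponent would be negative, so it is clamped to 0
 * and the tail still spans the whole register: 16 / 1 = 16 elements.
 */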
169 
170 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
171 {
172     return (addr & ~env->cur_pmmask) | env->cur_pmbase;
173 }
174 
175 /*
176  * This function checks watchpoints before the real load operation.
177  *
178  * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
179  * In user mode, there is currently no watchpoint support.
180  *
181  * It will trigger an exception if there is no mapping in the TLB
182  * and the page table walk can't fill the TLB entry. Then the guest
183  * software can return here after processing the exception, or never return.
184  */
185 static void probe_pages(CPURISCVState *env, target_ulong addr,
186                         target_ulong len, uintptr_t ra,
187                         MMUAccessType access_type)
188 {
189     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
190     target_ulong curlen = MIN(pagelen, len);
191 
192     probe_access(env, adjust_addr(env, addr), curlen, access_type,
193                  cpu_mmu_index(env, false), ra);
194     if (len > curlen) {
195         addr += curlen;
196         curlen = len - curlen;
197         probe_access(env, adjust_addr(env, addr), curlen, access_type,
198                      cpu_mmu_index(env, false), ra);
199     }
200 }
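/*
 * Illustrative example (not from the original source): with a 4 KiB
 * target page, addr = 0xffa and len = 16, pagelen = 6, so the first
 * probe_access() covers the 6 bytes up to the page boundary and the
 * second covers the remaining 10 bytes starting at 0x1000.  A mapping
 * fault in either page is therefore raised during the probe itself.
 */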
201 
202 /* set agnostic elements to 1s */
203 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
204                               uint32_t tot)
205 {
206     if (is_agnostic == 0) {
207         /* policy undisturbed */
208         return;
209     }
210     if (tot - cnt == 0) {
211         return;
212     }
213     memset(base + cnt, -1, tot - cnt);
214 }
215 
216 static inline void vext_set_elem_mask(void *v0, int index,
217                                       uint8_t value)
218 {
219     int idx = index / 64;
220     int pos = index % 64;
221     uint64_t old = ((uint64_t *)v0)[idx];
222     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
223 }
224 
225 /*
226  * Earlier designs (pre-0.9) had a varying number of bits
227  * per mask value (MLEN). In the 0.9 design, MLEN=1.
228  * (Section 4.5)
229  */
230 static inline int vext_elem_mask(void *v0, int index)
231 {
232     int idx = index / 64;
233     int pos = index % 64;
234     return (((uint64_t *)v0)[idx] >> pos) & 1;
235 }
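/*
 * Illustrative sketch (not part of the original helpers): with MLEN = 1,
 * mask element i is bit (i % 64) of 64-bit word (i / 64), e.g. element 70
 * is bit 6 of word 1.  Setting and reading back the same index is a
 * simple round trip:
 */
static inline bool vext_elem_mask_roundtrip(void *v0, int index)
{
    vext_set_elem_mask(v0, index, 1);       /* set the mask bit */
    return vext_elem_mask(v0, index) == 1;  /* read the same bit back */
}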
236 
237 /* element operations for load and store */
238 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
239                                uint32_t idx, void *vd, uintptr_t retaddr);
240 
241 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
242 static void NAME(CPURISCVState *env, abi_ptr addr,         \
243                  uint32_t idx, void *vd, uintptr_t retaddr)\
244 {                                                          \
245     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
246     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
247 }                                                          \
248 
249 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
250 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
251 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
252 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
253 
254 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
255 static void NAME(CPURISCVState *env, abi_ptr addr,         \
256                  uint32_t idx, void *vd, uintptr_t retaddr)\
257 {                                                          \
258     ETYPE data = *((ETYPE *)vd + H(idx));                  \
259     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
260 }
261 
262 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
263 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
264 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
265 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
266 
267 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
268                                    void *vd, uint32_t desc, uint32_t nf,
269                                    uint32_t esz, uint32_t max_elems)
270 {
271     uint32_t total_elems, vlenb, registers_used;
272     uint32_t vta = vext_vta(desc);
273     int k;
274 
275     if (vta == 0) {
276         return;
277     }
278 
279     total_elems = vext_get_total_elems(env, desc, esz);
280     vlenb = riscv_cpu_cfg(env)->vlen >> 3;
281 
282     for (k = 0; k < nf; ++k) {
283         vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
284                           (k * max_elems + max_elems) * esz);
285     }
286 
287     if (nf * max_elems % total_elems != 0) {
288         registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
289         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
290                           registers_used * vlenb);
291     }
292 }
293 
294 /*
295  * stride: access vector elements from strided memory
296  */
297 static void
298 vext_ldst_stride(void *vd, void *v0, target_ulong base,
299                  target_ulong stride, CPURISCVState *env,
300                  uint32_t desc, uint32_t vm,
301                  vext_ldst_elem_fn *ldst_elem,
302                  uint32_t log2_esz, uintptr_t ra)
303 {
304     uint32_t i, k;
305     uint32_t nf = vext_nf(desc);
306     uint32_t max_elems = vext_max_elems(desc, log2_esz);
307     uint32_t esz = 1 << log2_esz;
308     uint32_t vma = vext_vma(desc);
309 
310     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
311         k = 0;
312         while (k < nf) {
313             if (!vm && !vext_elem_mask(v0, i)) {
314                 /* set masked-off elements to 1s */
315                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
316                                   (i + k * max_elems + 1) * esz);
317                 k++;
318                 continue;
319             }
320             target_ulong addr = base + stride * i + (k << log2_esz);
321             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
322             k++;
323         }
324     }
325     env->vstart = 0;
326 
327     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
328 }
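/*
 * Illustrative example (not from the original source): for a strided
 * segment load with nf = 2, esz = 4 and stride = 32, element i of field k
 * is fetched from base + 32 * i + 4 * k and stored into vd at element
 * index i + k * max_elems, i.e. each field occupies its own register
 * group.
 */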
329 
330 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
331 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
332                   target_ulong stride, CPURISCVState *env,              \
333                   uint32_t desc)                                        \
334 {                                                                       \
335     uint32_t vm = vext_vm(desc);                                        \
336     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
337                      ctzl(sizeof(ETYPE)), GETPC());                     \
338 }
339 
340 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
341 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
342 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
343 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
344 
345 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
346 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
347                   target_ulong stride, CPURISCVState *env,              \
348                   uint32_t desc)                                        \
349 {                                                                       \
350     uint32_t vm = vext_vm(desc);                                        \
351     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
352                      ctzl(sizeof(ETYPE)), GETPC());                     \
353 }
354 
355 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
356 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
357 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
358 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
359 
360 /*
361  * unit-stride: access elements stored contiguously in memory
362  */
363 
364 /* unmasked unit-stride load and store operation */
365 static void
366 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
367              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
368              uintptr_t ra)
369 {
370     uint32_t i, k;
371     uint32_t nf = vext_nf(desc);
372     uint32_t max_elems = vext_max_elems(desc, log2_esz);
373     uint32_t esz = 1 << log2_esz;
374 
375     /* load bytes from guest memory */
376     for (i = env->vstart; i < evl; i++, env->vstart++) {
377         k = 0;
378         while (k < nf) {
379             target_ulong addr = base + ((i * nf + k) << log2_esz);
380             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
381             k++;
382         }
383     }
384     env->vstart = 0;
385 
386     vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
387 }
388 
389 /*
390  * masked unit-stride load and store operations are handled as a special case
391  * of the strided operation, with stride = NF * sizeof(ETYPE)
392  */
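/*
 * Illustrative example (not from the original source): a masked vlseg3e16.v
 * access is dispatched to vext_ldst_stride() with an effective stride of
 * NF * sizeof(ETYPE) = 3 * 2 = 6 bytes, which reproduces the contiguous
 * unit-stride layout while reusing the per-element mask handling.
 */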
393 
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
396                          CPURISCVState *env, uint32_t desc)             \
397 {                                                                       \
398     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
399     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
400                      ctzl(sizeof(ETYPE)), GETPC());                     \
401 }                                                                       \
402                                                                         \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
404                   CPURISCVState *env, uint32_t desc)                    \
405 {                                                                       \
406     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
407                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
408 }
409 
410 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414 
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
417                          CPURISCVState *env, uint32_t desc)              \
418 {                                                                        \
419     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
420     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
421                      ctzl(sizeof(ETYPE)), GETPC());                      \
422 }                                                                        \
423                                                                          \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
425                   CPURISCVState *env, uint32_t desc)                     \
426 {                                                                        \
427     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
428                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
429 }
430 
431 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435 
436 /*
437  * unit stride mask load and store, EEW = 1
438  */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440                     CPURISCVState *env, uint32_t desc)
441 {
442     /* evl = ceil(vl/8) */
443     uint8_t evl = (env->vl + 7) >> 3;
444     vext_ldst_us(vd, base, env, desc, lde_b,
445                  0, evl, GETPC());
446 }
447 
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449                     CPURISCVState *env, uint32_t desc)
450 {
451     /* evl = ceil(vl/8) */
452     uint8_t evl = (env->vl + 7) >> 3;
453     vext_ldst_us(vd, base, env, desc, ste_b,
454                  0, evl, GETPC());
455 }
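/*
 * Illustrative example (not from the original source): vlm.v/vsm.v always
 * transfer ceil(vl / 8) bytes, so with vl = 65 the effective evl is
 * (65 + 7) >> 3 = 9 bytes, regardless of the SEW/LMUL currently in vtype.
 */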
456 
457 /*
458  * index: access vector elements from indexed memory
459  */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461         uint32_t idx, void *vs2);
462 
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
464 static target_ulong NAME(target_ulong base,            \
465                          uint32_t idx, void *vs2)      \
466 {                                                      \
467     return (base + *((ETYPE *)vs2 + H(idx)));          \
468 }
469 
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474 
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477                 void *vs2, CPURISCVState *env, uint32_t desc,
478                 vext_get_index_addr get_index_addr,
479                 vext_ldst_elem_fn *ldst_elem,
480                 uint32_t log2_esz, uintptr_t ra)
481 {
482     uint32_t i, k;
483     uint32_t nf = vext_nf(desc);
484     uint32_t vm = vext_vm(desc);
485     uint32_t max_elems = vext_max_elems(desc, log2_esz);
486     uint32_t esz = 1 << log2_esz;
487     uint32_t vma = vext_vma(desc);
488 
489     /* load bytes from guest memory */
490     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
491         k = 0;
492         while (k < nf) {
493             if (!vm && !vext_elem_mask(v0, i)) {
494                 /* set masked-off elements to 1s */
495                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
496                                   (i + k * max_elems + 1) * esz);
497                 k++;
498                 continue;
499             }
500             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
501             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
502             k++;
503         }
504     }
505     env->vstart = 0;
506 
507     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
508 }
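/*
 * Illustrative example (not from the original source): for vluxei16.v the
 * offsets in vs2 are raw byte offsets, so with base = 0x1000 and
 * vs2 = {0, 2, 40}, element 1 is loaded from 0x1002 and element 2 from
 * 0x1028; the index EEW only affects how the offsets are read, no scaling
 * is applied.
 */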
509 
510 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
511 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
512                   void *vs2, CPURISCVState *env, uint32_t desc)            \
513 {                                                                          \
514     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
515                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
516 }
517 
518 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
519 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
520 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
521 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
522 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
523 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
524 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
525 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
526 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
527 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
528 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
529 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
530 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
534 
535 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
536 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
537                   void *vs2, CPURISCVState *env, uint32_t desc)  \
538 {                                                                \
539     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
540                     STORE_FN, ctzl(sizeof(ETYPE)),               \
541                     GETPC());                                    \
542 }
543 
544 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
545 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
546 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
547 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
548 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
549 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
550 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
551 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
552 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
553 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
554 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
555 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
556 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
560 
561 /*
562  * unit-stride fault-only-first load instructions
563  */
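/*
 * Illustrative example (not from the original source): if element 0 of a
 * vle16ff.v faults, the exception is taken as usual, but if the first
 * faulting element is some i > 0, vl is reduced to i and only elements
 * 0 .. i-1 are loaded, letting the guest loop defer the trap to a later
 * iteration.
 */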
564 static inline void
565 vext_ldff(void *vd, void *v0, target_ulong base,
566           CPURISCVState *env, uint32_t desc,
567           vext_ldst_elem_fn *ldst_elem,
568           uint32_t log2_esz, uintptr_t ra)
569 {
570     void *host;
571     uint32_t i, k, vl = 0;
572     uint32_t nf = vext_nf(desc);
573     uint32_t vm = vext_vm(desc);
574     uint32_t max_elems = vext_max_elems(desc, log2_esz);
575     uint32_t esz = 1 << log2_esz;
576     uint32_t vma = vext_vma(desc);
577     target_ulong addr, offset, remain;
578 
579     /* probe every access */
580     for (i = env->vstart; i < env->vl; i++) {
581         if (!vm && !vext_elem_mask(v0, i)) {
582             continue;
583         }
584         addr = adjust_addr(env, base + i * (nf << log2_esz));
585         if (i == 0) {
586             probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
587         } else {
588             /* if it triggers an exception, no need to check watchpoint */
589             remain = nf << log2_esz;
590             while (remain > 0) {
591                 offset = -(addr | TARGET_PAGE_MASK);
592                 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
593                                          cpu_mmu_index(env, false));
594                 if (host) {
595 #ifdef CONFIG_USER_ONLY
596                     if (page_check_range(addr, offset, PAGE_READ) < 0) {
597                         vl = i;
598                         goto ProbeSuccess;
599                     }
600 #else
601                     probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
602 #endif
603                 } else {
604                     vl = i;
605                     goto ProbeSuccess;
606                 }
607                 if (remain <= offset) {
608                     break;
609                 }
610                 remain -= offset;
611                 addr = adjust_addr(env, addr + offset);
612             }
613         }
614     }
615 ProbeSuccess:
616     /* load bytes from guest memory */
617     if (vl != 0) {
618         env->vl = vl;
619     }
620     for (i = env->vstart; i < env->vl; i++) {
621         k = 0;
622         while (k < nf) {
623             if (!vm && !vext_elem_mask(v0, i)) {
624                 /* set masked-off elements to 1s */
625                 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
626                                   (i + k * max_elems + 1) * esz);
627                 k++;
628                 continue;
629             }
630             target_ulong addr = base + ((i * nf + k) << log2_esz);
631             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
632             k++;
633         }
634     }
635     env->vstart = 0;
636 
637     vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
638 }
639 
640 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
641 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
642                   CPURISCVState *env, uint32_t desc)      \
643 {                                                         \
644     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
645               ctzl(sizeof(ETYPE)), GETPC());              \
646 }
647 
648 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
649 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
650 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
651 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
652 
653 #define DO_SWAP(N, M) (M)
654 #define DO_AND(N, M)  (N & M)
655 #define DO_XOR(N, M)  (N ^ M)
656 #define DO_OR(N, M)   (N | M)
657 #define DO_ADD(N, M)  (N + M)
658 
659 /* Signed min/max */
660 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
661 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
662 
663 /* Unsigned min/max */
664 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
665 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
666 
667 /*
668  * load and store whole register instructions
669  */
670 static void
671 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
672                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
673 {
674     uint32_t i, k, off, pos;
675     uint32_t nf = vext_nf(desc);
676     uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
677     uint32_t max_elems = vlenb >> log2_esz;
678 
679     k = env->vstart / max_elems;
680     off = env->vstart % max_elems;
681 
682     if (off) {
683         /* load/store the remaining elements of the segment pointed to by vstart */
684         for (pos = off; pos < max_elems; pos++, env->vstart++) {
685             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
686             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
687                       ra);
688         }
689         k++;
690     }
691 
692     /* load/store elements for rest of segments */
693     for (; k < nf; k++) {
694         for (i = 0; i < max_elems; i++, env->vstart++) {
695             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
696             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
697         }
698     }
699 
700     env->vstart = 0;
701 }
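/*
 * Illustrative example (not from the original source): for vl4re32.v with
 * VLEN = 128, vlenb = 16 and max_elems = 4, so nf = 4 register groups of
 * 4 elements each are transferred.  If the instruction was interrupted at
 * vstart = 6, it resumes in the middle of the second register (k = 1,
 * off = 2) and finishes the remaining segments from there.
 */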
702 
703 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
704 void HELPER(NAME)(void *vd, target_ulong base,       \
705                   CPURISCVState *env, uint32_t desc) \
706 {                                                    \
707     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
708                     ctzl(sizeof(ETYPE)), GETPC());   \
709 }
710 
711 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
712 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
713 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
714 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
715 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
716 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
717 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
718 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
719 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
720 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
721 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
722 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
723 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
724 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
725 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
726 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
727 
728 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
729 void HELPER(NAME)(void *vd, target_ulong base,       \
730                   CPURISCVState *env, uint32_t desc) \
731 {                                                    \
732     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
733                     ctzl(sizeof(ETYPE)), GETPC());   \
734 }
735 
736 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
737 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
738 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
739 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
740 
741 /*
742  * Vector Integer Arithmetic Instructions
743  */
744 
745 /* expand macro args before macro */
746 #define RVVCALL(macro, ...)  macro(__VA_ARGS__)
747 
748 /* (TD, T1, T2, TX1, TX2) */
749 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
750 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
751 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
752 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
753 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
754 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
755 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
756 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
757 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
758 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
759 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
760 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
761 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
762 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
763 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
764 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
765 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
766 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
767 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
768 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
769 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
770 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
771 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
772 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
773 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
774 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
775 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
776 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
777 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
778 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
779 
780 /* operation of two vector elements */
781 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
782 
783 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
784 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
785 {                                                               \
786     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
787     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
788     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
789 }
790 #define DO_SUB(N, M) (N - M)
791 #define DO_RSUB(N, M) (M - N)
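/*
 * Illustrative expansion (not generated code): RVVCALL(OPIVV2, vadd_vv_b,
 * OP_SSS_B, H1, H1, H1, DO_ADD) produces
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = DO_ADD(s2, s1);
 *   }
 */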
792 
793 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
794 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
795 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
796 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
797 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
798 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
799 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
800 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
801 
802 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
803                        CPURISCVState *env, uint32_t desc,
804                        opivv2_fn *fn, uint32_t esz)
805 {
806     uint32_t vm = vext_vm(desc);
807     uint32_t vl = env->vl;
808     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
809     uint32_t vta = vext_vta(desc);
810     uint32_t vma = vext_vma(desc);
811     uint32_t i;
812 
813     for (i = env->vstart; i < vl; i++) {
814         if (!vm && !vext_elem_mask(v0, i)) {
815             /* set masked-off elements to 1s */
816             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
817             continue;
818         }
819         fn(vd, vs1, vs2, i);
820     }
821     env->vstart = 0;
822     /* set tail elements to 1s */
823     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
824 }
825 
826 /* generate the helpers for OPIVV */
827 #define GEN_VEXT_VV(NAME, ESZ)                            \
828 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
829                   void *vs2, CPURISCVState *env,          \
830                   uint32_t desc)                          \
831 {                                                         \
832     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
833                do_##NAME, ESZ);                           \
834 }
835 
836 GEN_VEXT_VV(vadd_vv_b, 1)
837 GEN_VEXT_VV(vadd_vv_h, 2)
838 GEN_VEXT_VV(vadd_vv_w, 4)
839 GEN_VEXT_VV(vadd_vv_d, 8)
840 GEN_VEXT_VV(vsub_vv_b, 1)
841 GEN_VEXT_VV(vsub_vv_h, 2)
842 GEN_VEXT_VV(vsub_vv_w, 4)
843 GEN_VEXT_VV(vsub_vv_d, 8)
844 
845 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
846 
847 /*
848  * (T1)s1 gives the real operator type.
849  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
850  */
851 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
852 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
853 {                                                                   \
854     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
855     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
856 }
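/*
 * Illustrative example (not from the original source): for vwadd_vx_b
 * (WOP_SSS_B: TD = int16_t, T1 = int8_t, TX1 = int16_t), the scalar is
 * evaluated as (int16_t)(int8_t)s1, i.e. rs1 is first truncated to SEW
 * bits and sign-extended, then widened to the double-width operand type
 * before the addition.
 */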
857 
858 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
859 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
860 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
861 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
862 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
863 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
864 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
865 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
866 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
867 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
868 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
869 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
870 
871 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
872                        CPURISCVState *env, uint32_t desc,
873                        opivx2_fn fn, uint32_t esz)
874 {
875     uint32_t vm = vext_vm(desc);
876     uint32_t vl = env->vl;
877     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
878     uint32_t vta = vext_vta(desc);
879     uint32_t vma = vext_vma(desc);
880     uint32_t i;
881 
882     for (i = env->vstart; i < vl; i++) {
883         if (!vm && !vext_elem_mask(v0, i)) {
884             /* set masked-off elements to 1s */
885             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
886             continue;
887         }
888         fn(vd, s1, vs2, i);
889     }
890     env->vstart = 0;
891     /* set tail elements to 1s */
892     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
893 }
894 
895 /* generate the helpers for OPIVX */
896 #define GEN_VEXT_VX(NAME, ESZ)                            \
897 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
898                   void *vs2, CPURISCVState *env,          \
899                   uint32_t desc)                          \
900 {                                                         \
901     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
902                do_##NAME, ESZ);                           \
903 }
904 
905 GEN_VEXT_VX(vadd_vx_b, 1)
906 GEN_VEXT_VX(vadd_vx_h, 2)
907 GEN_VEXT_VX(vadd_vx_w, 4)
908 GEN_VEXT_VX(vadd_vx_d, 8)
909 GEN_VEXT_VX(vsub_vx_b, 1)
910 GEN_VEXT_VX(vsub_vx_h, 2)
911 GEN_VEXT_VX(vsub_vx_w, 4)
912 GEN_VEXT_VX(vsub_vx_d, 8)
913 GEN_VEXT_VX(vrsub_vx_b, 1)
914 GEN_VEXT_VX(vrsub_vx_h, 2)
915 GEN_VEXT_VX(vrsub_vx_w, 4)
916 GEN_VEXT_VX(vrsub_vx_d, 8)
917 
918 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
919 {
920     intptr_t oprsz = simd_oprsz(desc);
921     intptr_t i;
922 
923     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
924         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
925     }
926 }
927 
928 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
929 {
930     intptr_t oprsz = simd_oprsz(desc);
931     intptr_t i;
932 
933     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
934         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
935     }
936 }
937 
938 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
939 {
940     intptr_t oprsz = simd_oprsz(desc);
941     intptr_t i;
942 
943     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
944         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
945     }
946 }
947 
948 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
949 {
950     intptr_t oprsz = simd_oprsz(desc);
951     intptr_t i;
952 
953     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
954         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
955     }
956 }
957 
958 /* Vector Widening Integer Add/Subtract */
959 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
960 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
961 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
962 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
963 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
964 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
965 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
966 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
967 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
968 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
969 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
970 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
971 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
972 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
973 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
974 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
975 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
976 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
977 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
978 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
979 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
980 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
981 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
982 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
983 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
984 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
985 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
986 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
987 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
988 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
989 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
990 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
991 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
992 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
993 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
994 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
995 GEN_VEXT_VV(vwaddu_vv_b, 2)
996 GEN_VEXT_VV(vwaddu_vv_h, 4)
997 GEN_VEXT_VV(vwaddu_vv_w, 8)
998 GEN_VEXT_VV(vwsubu_vv_b, 2)
999 GEN_VEXT_VV(vwsubu_vv_h, 4)
1000 GEN_VEXT_VV(vwsubu_vv_w, 8)
1001 GEN_VEXT_VV(vwadd_vv_b, 2)
1002 GEN_VEXT_VV(vwadd_vv_h, 4)
1003 GEN_VEXT_VV(vwadd_vv_w, 8)
1004 GEN_VEXT_VV(vwsub_vv_b, 2)
1005 GEN_VEXT_VV(vwsub_vv_h, 4)
1006 GEN_VEXT_VV(vwsub_vv_w, 8)
1007 GEN_VEXT_VV(vwaddu_wv_b, 2)
1008 GEN_VEXT_VV(vwaddu_wv_h, 4)
1009 GEN_VEXT_VV(vwaddu_wv_w, 8)
1010 GEN_VEXT_VV(vwsubu_wv_b, 2)
1011 GEN_VEXT_VV(vwsubu_wv_h, 4)
1012 GEN_VEXT_VV(vwsubu_wv_w, 8)
1013 GEN_VEXT_VV(vwadd_wv_b, 2)
1014 GEN_VEXT_VV(vwadd_wv_h, 4)
1015 GEN_VEXT_VV(vwadd_wv_w, 8)
1016 GEN_VEXT_VV(vwsub_wv_b, 2)
1017 GEN_VEXT_VV(vwsub_wv_h, 4)
1018 GEN_VEXT_VV(vwsub_wv_w, 8)
1019 
1020 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1021 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1022 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1023 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1024 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1025 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1026 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1027 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1028 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1029 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1030 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1031 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1032 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1034 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1035 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1037 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1038 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1040 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1041 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1043 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1044 GEN_VEXT_VX(vwaddu_vx_b, 2)
1045 GEN_VEXT_VX(vwaddu_vx_h, 4)
1046 GEN_VEXT_VX(vwaddu_vx_w, 8)
1047 GEN_VEXT_VX(vwsubu_vx_b, 2)
1048 GEN_VEXT_VX(vwsubu_vx_h, 4)
1049 GEN_VEXT_VX(vwsubu_vx_w, 8)
1050 GEN_VEXT_VX(vwadd_vx_b, 2)
1051 GEN_VEXT_VX(vwadd_vx_h, 4)
1052 GEN_VEXT_VX(vwadd_vx_w, 8)
1053 GEN_VEXT_VX(vwsub_vx_b, 2)
1054 GEN_VEXT_VX(vwsub_vx_h, 4)
1055 GEN_VEXT_VX(vwsub_vx_w, 8)
1056 GEN_VEXT_VX(vwaddu_wx_b, 2)
1057 GEN_VEXT_VX(vwaddu_wx_h, 4)
1058 GEN_VEXT_VX(vwaddu_wx_w, 8)
1059 GEN_VEXT_VX(vwsubu_wx_b, 2)
1060 GEN_VEXT_VX(vwsubu_wx_h, 4)
1061 GEN_VEXT_VX(vwsubu_wx_w, 8)
1062 GEN_VEXT_VX(vwadd_wx_b, 2)
1063 GEN_VEXT_VX(vwadd_wx_h, 4)
1064 GEN_VEXT_VX(vwadd_wx_w, 8)
1065 GEN_VEXT_VX(vwsub_wx_b, 2)
1066 GEN_VEXT_VX(vwsub_wx_h, 4)
1067 GEN_VEXT_VX(vwsub_wx_w, 8)
1068 
1069 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1070 #define DO_VADC(N, M, C) (N + M + C)
1071 #define DO_VSBC(N, M, C) (N - M - C)
1072 
1073 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
1074 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1075                   CPURISCVState *env, uint32_t desc)          \
1076 {                                                             \
1077     uint32_t vl = env->vl;                                    \
1078     uint32_t esz = sizeof(ETYPE);                             \
1079     uint32_t total_elems =                                    \
1080         vext_get_total_elems(env, desc, esz);                 \
1081     uint32_t vta = vext_vta(desc);                            \
1082     uint32_t i;                                               \
1083                                                               \
1084     for (i = env->vstart; i < vl; i++) {                      \
1085         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1086         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1087         ETYPE carry = vext_elem_mask(v0, i);                  \
1088                                                               \
1089         *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
1090     }                                                         \
1091     env->vstart = 0;                                          \
1092     /* set tail elements to 1s */                             \
1093     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
1094 }
1095 
1096 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
1097 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1098 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1099 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1100 
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
1102 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1103 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1104 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1105 
1106 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
1107 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
1108                   CPURISCVState *env, uint32_t desc)                     \
1109 {                                                                        \
1110     uint32_t vl = env->vl;                                               \
1111     uint32_t esz = sizeof(ETYPE);                                        \
1112     uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
1113     uint32_t vta = vext_vta(desc);                                       \
1114     uint32_t i;                                                          \
1115                                                                          \
1116     for (i = env->vstart; i < vl; i++) {                                 \
1117         ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
1118         ETYPE carry = vext_elem_mask(v0, i);                             \
1119                                                                          \
1120         *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1121     }                                                                    \
1122     env->vstart = 0;                                                     \
1123     /* set tail elements to 1s */                                        \
1124     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
1125 }
1126 
1127 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
1128 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1129 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1130 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1131 
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
1133 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1134 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1135 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1136 
1137 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N :           \
1138                           (__typeof(N))(N + M) < N)
1139 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
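/*
 * Illustrative example (not from the original source): for uint8_t
 * operands, DO_MADC(200, 100, 0) evaluates (uint8_t)(200 + 100) == 44,
 * and 44 < 200, so the carry-out is 1; DO_MSBC(5, 7, 0) is 5 < 7, i.e.
 * the subtraction 5 - 7 borrows.
 */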
1140 
1141 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
1142 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1143                   CPURISCVState *env, uint32_t desc)          \
1144 {                                                             \
1145     uint32_t vl = env->vl;                                    \
1146     uint32_t vm = vext_vm(desc);                              \
1147     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1148     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1149     uint32_t i;                                               \
1150                                                               \
1151     for (i = env->vstart; i < vl; i++) {                      \
1152         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1153         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1154         ETYPE carry = !vm && vext_elem_mask(v0, i);           \
1155         vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
1156     }                                                         \
1157     env->vstart = 0;                                          \
1158     /*
1159      * mask destination register is always tail-agnostic,
1160      * so set tail elements to 1s
1161      */                                                       \
1162     if (vta_all_1s) {                                         \
1163         for (; i < total_elems; i++) {                        \
1164             vext_set_elem_mask(vd, i, 1);                     \
1165         }                                                     \
1166     }                                                         \
1167 }
1168 
1169 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
1170 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1171 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1172 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1173 
1174 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
1175 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1176 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1177 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1178 
1179 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
1180 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
1181                   void *vs2, CPURISCVState *env, uint32_t desc) \
1182 {                                                               \
1183     uint32_t vl = env->vl;                                      \
1184     uint32_t vm = vext_vm(desc);                                \
1185     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
1186     uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
1187     uint32_t i;                                                 \
1188                                                                 \
1189     for (i = env->vstart; i < vl; i++) {                        \
1190         ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
1191         ETYPE carry = !vm && vext_elem_mask(v0, i);             \
1192         vext_set_elem_mask(vd, i,                               \
1193                 DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
1194     }                                                           \
1195     env->vstart = 0;                                            \
1196     /*
1197      * mask destination register is always tail-agnostic,
1198      * so set tail elements to 1s
1199      */                                                         \
1200     if (vta_all_1s) {                                           \
1201         for (; i < total_elems; i++) {                          \
1202             vext_set_elem_mask(vd, i, 1);                       \
1203         }                                                       \
1204     }                                                           \
1205 }
1206 
1207 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
1208 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1209 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1210 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1211 
1212 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
1213 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1214 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1215 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1216 
1217 /* Vector Bitwise Logical Instructions */
1218 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1219 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1220 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1221 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1222 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1223 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1224 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1225 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1226 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1227 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1228 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1229 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1230 GEN_VEXT_VV(vand_vv_b, 1)
1231 GEN_VEXT_VV(vand_vv_h, 2)
1232 GEN_VEXT_VV(vand_vv_w, 4)
1233 GEN_VEXT_VV(vand_vv_d, 8)
1234 GEN_VEXT_VV(vor_vv_b, 1)
1235 GEN_VEXT_VV(vor_vv_h, 2)
1236 GEN_VEXT_VV(vor_vv_w, 4)
1237 GEN_VEXT_VV(vor_vv_d, 8)
1238 GEN_VEXT_VV(vxor_vv_b, 1)
1239 GEN_VEXT_VV(vxor_vv_h, 2)
1240 GEN_VEXT_VV(vxor_vv_w, 4)
1241 GEN_VEXT_VV(vxor_vv_d, 8)
1242 
1243 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1244 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1245 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1246 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1247 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1248 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1249 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1250 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1251 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1252 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1253 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1254 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1255 GEN_VEXT_VX(vand_vx_b, 1)
1256 GEN_VEXT_VX(vand_vx_h, 2)
1257 GEN_VEXT_VX(vand_vx_w, 4)
1258 GEN_VEXT_VX(vand_vx_d, 8)
1259 GEN_VEXT_VX(vor_vx_b, 1)
1260 GEN_VEXT_VX(vor_vx_h, 2)
1261 GEN_VEXT_VX(vor_vx_w, 4)
1262 GEN_VEXT_VX(vor_vx_d, 8)
1263 GEN_VEXT_VX(vxor_vx_b, 1)
1264 GEN_VEXT_VX(vxor_vx_h, 2)
1265 GEN_VEXT_VX(vxor_vx_w, 4)
1266 GEN_VEXT_VX(vxor_vx_d, 8)
1267 
1268 /* Vector Single-Width Bit Shift Instructions */
1269 #define DO_SLL(N, M)  (N << (M))
1270 #define DO_SRL(N, M)  (N >> (M))
1271 
1272 /* generate the helpers for shift instructions with two vector operands */
1273 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
1274 void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
1275                   void *vs2, CPURISCVState *env, uint32_t desc)           \
1276 {                                                                         \
1277     uint32_t vm = vext_vm(desc);                                          \
1278     uint32_t vl = env->vl;                                                \
1279     uint32_t esz = sizeof(TS1);                                           \
1280     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
1281     uint32_t vta = vext_vta(desc);                                        \
1282     uint32_t vma = vext_vma(desc);                                        \
1283     uint32_t i;                                                           \
1284                                                                           \
1285     for (i = env->vstart; i < vl; i++) {                                  \
1286         if (!vm && !vext_elem_mask(v0, i)) {                              \
1287             /* set masked-off elements to 1s */                           \
1288             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
1289             continue;                                                     \
1290         }                                                                 \
1291         TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
1292         TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
1293         *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
1294     }                                                                     \
1295     env->vstart = 0;                                                      \
1296     /* set tail elements to 1s */                                         \
1297     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
1298 }
1299 
1300 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
1301 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1302 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1303 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1304 
1305 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1306 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1307 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1308 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1309 
1310 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
1311 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1312 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1313 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
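
/*
 * Note (illustrative): the vsra_* helpers reuse DO_SRL; the arithmetic
 * behaviour comes from TS2 being a signed type, so the C right shift
 * sign-extends (QEMU relies on the compiler implementing signed right
 * shift arithmetically).  A minimal sketch for one byte element, with
 * hypothetical values:
 *
 *     int8_t  s2 = -8;                         // 0xf8
 *     uint8_t s1 = 2;
 *     uint8_t d  = DO_SRL(s2, s1 & 0x7);       // -8 >> 2 == -2, stored 0xfe
 */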
1314 
1315 /*
1316  * generate the helpers for shift instructions with one vector and one scalar operand
1317  */
1318 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1319 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
1320                   void *vs2, CPURISCVState *env,            \
1321                   uint32_t desc)                            \
1322 {                                                           \
1323     uint32_t vm = vext_vm(desc);                            \
1324     uint32_t vl = env->vl;                                  \
1325     uint32_t esz = sizeof(TD);                              \
1326     uint32_t total_elems =                                  \
1327         vext_get_total_elems(env, desc, esz);               \
1328     uint32_t vta = vext_vta(desc);                          \
1329     uint32_t vma = vext_vma(desc);                          \
1330     uint32_t i;                                             \
1331                                                             \
1332     for (i = env->vstart; i < vl; i++) {                    \
1333         if (!vm && !vext_elem_mask(v0, i)) {                \
1334             /* set masked-off elements to 1s */             \
1335             vext_set_elems_1s(vd, vma, i * esz,             \
1336                               (i + 1) * esz);               \
1337             continue;                                       \
1338         }                                                   \
1339         TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
1340         *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
1341     }                                                       \
1342     env->vstart = 0;                                        \
1343     /* set tail elements to 1s */                           \
1344     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1345 }
1346 
1347 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1348 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1349 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1350 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1351 
1352 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1353 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1354 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1355 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1356 
1357 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1358 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1359 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1360 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1361 
1362 /* Vector Narrowing Integer Right Shift Instructions */
1363 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
1364 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1365 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1366 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
1367 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1368 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1369 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1372 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1373 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1374 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
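
/*
 * For the narrowing shifts the source type is twice as wide as the
 * destination, so the shift-amount mask spans log2(2*SEW) bits and the
 * wide result is truncated on store.  E.g. for vnsrl_wv_b (sketch,
 * hypothetical values):
 *
 *     uint16_t s2 = 0x1234;
 *     uint8_t  d  = DO_SRL(s2, 4 & 0xf);       // 0x0123, stored as 0x23
 */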
1375 
1376 /* Vector Integer Comparison Instructions */
1377 #define DO_MSEQ(N, M) (N == M)
1378 #define DO_MSNE(N, M) (N != M)
1379 #define DO_MSLT(N, M) (N < M)
1380 #define DO_MSLE(N, M) (N <= M)
1381 #define DO_MSGT(N, M) (N > M)
1382 
1383 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
1384 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
1385                   CPURISCVState *env, uint32_t desc)          \
1386 {                                                             \
1387     uint32_t vm = vext_vm(desc);                              \
1388     uint32_t vl = env->vl;                                    \
1389     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
1390     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
1391     uint32_t vma = vext_vma(desc);                            \
1392     uint32_t i;                                               \
1393                                                               \
1394     for (i = env->vstart; i < vl; i++) {                      \
1395         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
1396         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
1397         if (!vm && !vext_elem_mask(v0, i)) {                  \
1398             /* set masked-off elements to 1s */               \
1399             if (vma) {                                        \
1400                 vext_set_elem_mask(vd, i, 1);                 \
1401             }                                                 \
1402             continue;                                         \
1403         }                                                     \
1404         vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
1405     }                                                         \
1406     env->vstart = 0;                                          \
1407     /*                                                        \
1408      * mask destination register is always tail-agnostic;     \
1409      * set tail elements to 1s                                 \
1410      */                                                       \
1411     if (vta_all_1s) {                                         \
1412         for (; i < total_elems; i++) {                        \
1413             vext_set_elem_mask(vd, i, 1);                     \
1414         }                                                     \
1415     }                                                         \
1416 }
1417 
1418 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1419 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1420 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1421 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1422 
1423 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1424 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1425 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1426 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1427 
1428 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1429 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1430 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1431 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1432 
1433 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1434 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1435 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1436 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1437 
1438 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1439 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1440 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1441 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1442 
1443 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1444 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1445 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1446 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1447 
1448 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1449 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1450                   CPURISCVState *env, uint32_t desc)                \
1451 {                                                                   \
1452     uint32_t vm = vext_vm(desc);                                    \
1453     uint32_t vl = env->vl;                                          \
1454     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
1455     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1456     uint32_t vma = vext_vma(desc);                                  \
1457     uint32_t i;                                                     \
1458                                                                     \
1459     for (i = env->vstart; i < vl; i++) {                            \
1460         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1461         if (!vm && !vext_elem_mask(v0, i)) {                        \
1462             /* set masked-off elements to 1s */                     \
1463             if (vma) {                                              \
1464                 vext_set_elem_mask(vd, i, 1);                       \
1465             }                                                       \
1466             continue;                                               \
1467         }                                                           \
1468         vext_set_elem_mask(vd, i,                                   \
1469                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1470     }                                                               \
1471     env->vstart = 0;                                                \
1472     /*                                                              \
1473      * mask destination register is always tail-agnostic;           \
1474      * set tail elements to 1s                                       \
1475      */                                                             \
1476     if (vta_all_1s) {                                               \
1477         for (; i < total_elems; i++) {                              \
1478             vext_set_elem_mask(vd, i, 1);                           \
1479         }                                                           \
1480     }                                                               \
1481 }
1482 
1483 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1484 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1485 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1486 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1487 
1488 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1489 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1490 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1491 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1492 
1493 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1494 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1495 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1496 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1497 
1498 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1499 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1500 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1501 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1502 
1503 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1504 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1505 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1506 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1507 
1508 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1509 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1510 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1511 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1512 
1513 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1514 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1515 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1516 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1517 
1518 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1519 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1520 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1521 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1522 
1523 /* Vector Integer Min/Max Instructions */
1524 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1525 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1526 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1527 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1528 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1529 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1530 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1531 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1532 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1533 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1534 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1535 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1536 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1537 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1538 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1539 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1540 GEN_VEXT_VV(vminu_vv_b, 1)
1541 GEN_VEXT_VV(vminu_vv_h, 2)
1542 GEN_VEXT_VV(vminu_vv_w, 4)
1543 GEN_VEXT_VV(vminu_vv_d, 8)
1544 GEN_VEXT_VV(vmin_vv_b, 1)
1545 GEN_VEXT_VV(vmin_vv_h, 2)
1546 GEN_VEXT_VV(vmin_vv_w, 4)
1547 GEN_VEXT_VV(vmin_vv_d, 8)
1548 GEN_VEXT_VV(vmaxu_vv_b, 1)
1549 GEN_VEXT_VV(vmaxu_vv_h, 2)
1550 GEN_VEXT_VV(vmaxu_vv_w, 4)
1551 GEN_VEXT_VV(vmaxu_vv_d, 8)
1552 GEN_VEXT_VV(vmax_vv_b, 1)
1553 GEN_VEXT_VV(vmax_vv_h, 2)
1554 GEN_VEXT_VV(vmax_vv_w, 4)
1555 GEN_VEXT_VV(vmax_vv_d, 8)
1556 
1557 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1558 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1559 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1560 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1561 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1562 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1563 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1564 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1565 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1566 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1567 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1568 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1569 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1570 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1571 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1572 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1573 GEN_VEXT_VX(vminu_vx_b, 1)
1574 GEN_VEXT_VX(vminu_vx_h, 2)
1575 GEN_VEXT_VX(vminu_vx_w, 4)
1576 GEN_VEXT_VX(vminu_vx_d, 8)
1577 GEN_VEXT_VX(vmin_vx_b, 1)
1578 GEN_VEXT_VX(vmin_vx_h, 2)
1579 GEN_VEXT_VX(vmin_vx_w, 4)
1580 GEN_VEXT_VX(vmin_vx_d, 8)
1581 GEN_VEXT_VX(vmaxu_vx_b, 1)
1582 GEN_VEXT_VX(vmaxu_vx_h, 2)
1583 GEN_VEXT_VX(vmaxu_vx_w, 4)
1584 GEN_VEXT_VX(vmaxu_vx_d, 8)
1585 GEN_VEXT_VX(vmax_vx_b, 1)
1586 GEN_VEXT_VX(vmax_vx_h, 2)
1587 GEN_VEXT_VX(vmax_vx_w, 4)
1588 GEN_VEXT_VX(vmax_vx_d, 8)
1589 
1590 /* Vector Single-Width Integer Multiply Instructions */
1591 #define DO_MUL(N, M) (N * M)
1592 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1593 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1594 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1595 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1596 GEN_VEXT_VV(vmul_vv_b, 1)
1597 GEN_VEXT_VV(vmul_vv_h, 2)
1598 GEN_VEXT_VV(vmul_vv_w, 4)
1599 GEN_VEXT_VV(vmul_vv_d, 8)
1600 
1601 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1602 {
1603     return (int16_t)s2 * (int16_t)s1 >> 8;
1604 }
1605 
1606 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1607 {
1608     return (int32_t)s2 * (int32_t)s1 >> 16;
1609 }
1610 
1611 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1612 {
1613     return (int64_t)s2 * (int64_t)s1 >> 32;
1614 }
1615 
1616 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1617 {
1618     uint64_t hi_64, lo_64;
1619 
1620     muls64(&lo_64, &hi_64, s1, s2);
1621     return hi_64;
1622 }
1623 
1624 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1625 {
1626     return (uint16_t)s2 * (uint16_t)s1 >> 8;
1627 }
1628 
1629 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1630 {
1631     return (uint32_t)s2 * (uint32_t)s1 >> 16;
1632 }
1633 
1634 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1635 {
1636     return (uint64_t)s2 * (uint64_t)s1 >> 32;
1637 }
1638 
1639 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1640 {
1641     uint64_t hi_64, lo_64;
1642 
1643     mulu64(&lo_64, &hi_64, s2, s1);
1644     return hi_64;
1645 }
1646 
1647 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1648 {
1649     return (int16_t)s2 * (uint16_t)s1 >> 8;
1650 }
1651 
1652 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1653 {
1654     return (int32_t)s2 * (uint32_t)s1 >> 16;
1655 }
1656 
1657 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1658 {
1659     return (int64_t)s2 * (uint64_t)s1 >> 32;
1660 }
1661 
1662 /*
1663  * Signed * unsigned high-half multiply via mulu64:
1664  *
1665  * Let  A = signed operand (value),
1666  *      B = unsigned operand,
1667  *      U = unsigned interpretation of A's bit pattern,
1668  *      P = mulu64(U, B), the full unsigned product.
1669  *
1670  * IF A < 0
1671  *      U = A + 2 ** 64, so
1672  *      A * B = (U - 2 ** 64) * B
1673  *            = P - 2 ** 64 * B
1674  *      i.e. only the high half differs:  HI(A * B) = HI(P) - B
1675  * ELSE
1676  *      A * B = P
1677  *
1678  * HENCE:  HI_P -= (A < 0 ? B : 0)
1679  */
1680 
1681 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1682 {
1683     uint64_t hi_64, lo_64;
1684 
1685     mulu64(&lo_64, &hi_64, s2, s1);
1686 
1687     hi_64 -= s2 < 0 ? s1 : 0;
1688     return hi_64;
1689 }
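
/*
 * A quick numeric check of the correction above (hypothetical values):
 * s2 = -1 has the bit pattern 2**64 - 1, so with s1 = 2, mulu64 yields
 * hi_64 = 1 and lo_64 = 2**64 - 2.  Subtracting s1 gives hi_64 = -1,
 * the high half of the true signed*unsigned product -1 * 2 = -2.
 */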
1690 
1691 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1692 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1693 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1694 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1695 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1696 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1697 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1698 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1699 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1700 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1701 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1702 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1703 GEN_VEXT_VV(vmulh_vv_b, 1)
1704 GEN_VEXT_VV(vmulh_vv_h, 2)
1705 GEN_VEXT_VV(vmulh_vv_w, 4)
1706 GEN_VEXT_VV(vmulh_vv_d, 8)
1707 GEN_VEXT_VV(vmulhu_vv_b, 1)
1708 GEN_VEXT_VV(vmulhu_vv_h, 2)
1709 GEN_VEXT_VV(vmulhu_vv_w, 4)
1710 GEN_VEXT_VV(vmulhu_vv_d, 8)
1711 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1712 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1713 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1714 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1715 
1716 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1717 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1718 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1719 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1720 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1721 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1722 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1723 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1724 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1725 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1726 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1727 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1728 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1729 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1730 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1731 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1732 GEN_VEXT_VX(vmul_vx_b, 1)
1733 GEN_VEXT_VX(vmul_vx_h, 2)
1734 GEN_VEXT_VX(vmul_vx_w, 4)
1735 GEN_VEXT_VX(vmul_vx_d, 8)
1736 GEN_VEXT_VX(vmulh_vx_b, 1)
1737 GEN_VEXT_VX(vmulh_vx_h, 2)
1738 GEN_VEXT_VX(vmulh_vx_w, 4)
1739 GEN_VEXT_VX(vmulh_vx_d, 8)
1740 GEN_VEXT_VX(vmulhu_vx_b, 1)
1741 GEN_VEXT_VX(vmulhu_vx_h, 2)
1742 GEN_VEXT_VX(vmulhu_vx_w, 4)
1743 GEN_VEXT_VX(vmulhu_vx_d, 8)
1744 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1745 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1746 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1747 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1748 
1749 /* Vector Integer Divide Instructions */
1750 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1751 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1752 #define DO_DIV(N, M)  (unlikely(M == 0) ? (__typeof(N))(-1) : \
1753         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1754 #define DO_REM(N, M)  (unlikely(M == 0) ? N : \
1755         unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
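
/*
 * (N == -N) holds only for 0 and, in practice, for the most negative
 * value of N's type, so together with M == -1 it catches the signed
 * overflow case, which RISC-V defines to return the dividend (and a
 * remainder of 0).  For example, with int32_t/uint32_t operands:
 *
 *     DO_DIV(INT32_MIN, -1)    -> INT32_MIN
 *     DO_REM(INT32_MIN, -1)    -> 0
 *     DO_DIVU(42u, 0u)         -> UINT32_MAX   (all ones on divide by zero)
 */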
1756 
1757 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1758 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1759 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1760 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1761 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1762 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1763 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1764 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1765 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1766 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1767 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1768 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1769 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1770 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1771 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1772 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1773 GEN_VEXT_VV(vdivu_vv_b, 1)
1774 GEN_VEXT_VV(vdivu_vv_h, 2)
1775 GEN_VEXT_VV(vdivu_vv_w, 4)
1776 GEN_VEXT_VV(vdivu_vv_d, 8)
1777 GEN_VEXT_VV(vdiv_vv_b, 1)
1778 GEN_VEXT_VV(vdiv_vv_h, 2)
1779 GEN_VEXT_VV(vdiv_vv_w, 4)
1780 GEN_VEXT_VV(vdiv_vv_d, 8)
1781 GEN_VEXT_VV(vremu_vv_b, 1)
1782 GEN_VEXT_VV(vremu_vv_h, 2)
1783 GEN_VEXT_VV(vremu_vv_w, 4)
1784 GEN_VEXT_VV(vremu_vv_d, 8)
1785 GEN_VEXT_VV(vrem_vv_b, 1)
1786 GEN_VEXT_VV(vrem_vv_h, 2)
1787 GEN_VEXT_VV(vrem_vv_w, 4)
1788 GEN_VEXT_VV(vrem_vv_d, 8)
1789 
1790 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1791 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1792 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1793 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1794 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1795 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1796 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1797 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1798 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1799 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1800 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1801 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1802 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1803 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1804 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1805 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1806 GEN_VEXT_VX(vdivu_vx_b, 1)
1807 GEN_VEXT_VX(vdivu_vx_h, 2)
1808 GEN_VEXT_VX(vdivu_vx_w, 4)
1809 GEN_VEXT_VX(vdivu_vx_d, 8)
1810 GEN_VEXT_VX(vdiv_vx_b, 1)
1811 GEN_VEXT_VX(vdiv_vx_h, 2)
1812 GEN_VEXT_VX(vdiv_vx_w, 4)
1813 GEN_VEXT_VX(vdiv_vx_d, 8)
1814 GEN_VEXT_VX(vremu_vx_b, 1)
1815 GEN_VEXT_VX(vremu_vx_h, 2)
1816 GEN_VEXT_VX(vremu_vx_w, 4)
1817 GEN_VEXT_VX(vremu_vx_d, 8)
1818 GEN_VEXT_VX(vrem_vx_b, 1)
1819 GEN_VEXT_VX(vrem_vx_h, 2)
1820 GEN_VEXT_VX(vrem_vx_w, 4)
1821 GEN_VEXT_VX(vrem_vx_d, 8)
1822 
1823 /* Vector Widening Integer Multiply Instructions */
1824 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1825 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1826 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1827 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1828 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1829 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1830 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1831 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1832 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1833 GEN_VEXT_VV(vwmul_vv_b, 2)
1834 GEN_VEXT_VV(vwmul_vv_h, 4)
1835 GEN_VEXT_VV(vwmul_vv_w, 8)
1836 GEN_VEXT_VV(vwmulu_vv_b, 2)
1837 GEN_VEXT_VV(vwmulu_vv_h, 4)
1838 GEN_VEXT_VV(vwmulu_vv_w, 8)
1839 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1840 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1841 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1842 
1843 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1844 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1845 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1846 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1847 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1848 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1849 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1850 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1851 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1852 GEN_VEXT_VX(vwmul_vx_b, 2)
1853 GEN_VEXT_VX(vwmul_vx_h, 4)
1854 GEN_VEXT_VX(vwmul_vx_w, 8)
1855 GEN_VEXT_VX(vwmulu_vx_b, 2)
1856 GEN_VEXT_VX(vwmulu_vx_h, 4)
1857 GEN_VEXT_VX(vwmulu_vx_w, 8)
1858 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1859 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1860 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1861 
1862 /* Vector Single-Width Integer Multiply-Add Instructions */
1863 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
1864 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
1865 {                                                                  \
1866     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
1867     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
1868     TD d = *((TD *)vd + HD(i));                                    \
1869     *((TD *)vd + HD(i)) = OP(s2, s1, d);                           \
1870 }
1871 
1872 #define DO_MACC(N, M, D) (M * N + D)
1873 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1874 #define DO_MADD(N, M, D) (M * D + N)
1875 #define DO_NMSUB(N, M, D) (-(M * D) + N)
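
/*
 * Argument mapping (OP(s2, s1, d), with d the old destination element):
 * DO_MACC accumulates into the destination, vd[i] += vs1[i] * vs2[i],
 * while DO_MADD multiplies the old destination, vd[i] = vd[i] * vs1[i]
 * + vs2[i]; the NMSAC/NMSUB forms negate the product first.
 */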
1876 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1877 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1878 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1879 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1880 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1881 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1882 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1883 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1884 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1885 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1886 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1887 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1888 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1889 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1890 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1891 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1892 GEN_VEXT_VV(vmacc_vv_b, 1)
1893 GEN_VEXT_VV(vmacc_vv_h, 2)
1894 GEN_VEXT_VV(vmacc_vv_w, 4)
1895 GEN_VEXT_VV(vmacc_vv_d, 8)
1896 GEN_VEXT_VV(vnmsac_vv_b, 1)
1897 GEN_VEXT_VV(vnmsac_vv_h, 2)
1898 GEN_VEXT_VV(vnmsac_vv_w, 4)
1899 GEN_VEXT_VV(vnmsac_vv_d, 8)
1900 GEN_VEXT_VV(vmadd_vv_b, 1)
1901 GEN_VEXT_VV(vmadd_vv_h, 2)
1902 GEN_VEXT_VV(vmadd_vv_w, 4)
1903 GEN_VEXT_VV(vmadd_vv_d, 8)
1904 GEN_VEXT_VV(vnmsub_vv_b, 1)
1905 GEN_VEXT_VV(vnmsub_vv_h, 2)
1906 GEN_VEXT_VV(vnmsub_vv_w, 4)
1907 GEN_VEXT_VV(vnmsub_vv_d, 8)
1908 
1909 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
1910 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
1911 {                                                                   \
1912     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
1913     TD d = *((TD *)vd + HD(i));                                     \
1914     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d);                   \
1915 }
1916 
1917 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1918 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1919 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1920 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1921 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1922 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1923 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1924 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1925 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1926 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1927 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1928 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1929 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1930 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1931 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1932 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1933 GEN_VEXT_VX(vmacc_vx_b, 1)
1934 GEN_VEXT_VX(vmacc_vx_h, 2)
1935 GEN_VEXT_VX(vmacc_vx_w, 4)
1936 GEN_VEXT_VX(vmacc_vx_d, 8)
1937 GEN_VEXT_VX(vnmsac_vx_b, 1)
1938 GEN_VEXT_VX(vnmsac_vx_h, 2)
1939 GEN_VEXT_VX(vnmsac_vx_w, 4)
1940 GEN_VEXT_VX(vnmsac_vx_d, 8)
1941 GEN_VEXT_VX(vmadd_vx_b, 1)
1942 GEN_VEXT_VX(vmadd_vx_h, 2)
1943 GEN_VEXT_VX(vmadd_vx_w, 4)
1944 GEN_VEXT_VX(vmadd_vx_d, 8)
1945 GEN_VEXT_VX(vnmsub_vx_b, 1)
1946 GEN_VEXT_VX(vnmsub_vx_h, 2)
1947 GEN_VEXT_VX(vnmsub_vx_w, 4)
1948 GEN_VEXT_VX(vnmsub_vx_d, 8)
1949 
1950 /* Vector Widening Integer Multiply-Add Instructions */
1951 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1952 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1953 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1954 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1955 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1956 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1957 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1958 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1959 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1960 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1961 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1962 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1963 GEN_VEXT_VV(vwmacc_vv_b, 2)
1964 GEN_VEXT_VV(vwmacc_vv_h, 4)
1965 GEN_VEXT_VV(vwmacc_vv_w, 8)
1966 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1967 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1968 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1969 
1970 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1973 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1974 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1975 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1976 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1977 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1978 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1979 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1980 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1981 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1982 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1983 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1984 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1985 GEN_VEXT_VX(vwmacc_vx_b, 2)
1986 GEN_VEXT_VX(vwmacc_vx_h, 4)
1987 GEN_VEXT_VX(vwmacc_vx_w, 8)
1988 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1989 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1990 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1991 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1992 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1993 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1994 
1995 /* Vector Integer Merge and Move Instructions */
1996 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1997 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1998                   uint32_t desc)                                     \
1999 {                                                                    \
2000     uint32_t vl = env->vl;                                           \
2001     uint32_t esz = sizeof(ETYPE);                                    \
2002     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2003     uint32_t vta = vext_vta(desc);                                   \
2004     uint32_t i;                                                      \
2005                                                                      \
2006     for (i = env->vstart; i < vl; i++) {                             \
2007         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
2008         *((ETYPE *)vd + H(i)) = s1;                                  \
2009     }                                                                \
2010     env->vstart = 0;                                                 \
2011     /* set tail elements to 1s */                                    \
2012     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2013 }
2014 
2015 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
2016 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2017 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2018 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2019 
2020 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
2021 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
2022                   uint32_t desc)                                     \
2023 {                                                                    \
2024     uint32_t vl = env->vl;                                           \
2025     uint32_t esz = sizeof(ETYPE);                                    \
2026     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2027     uint32_t vta = vext_vta(desc);                                   \
2028     uint32_t i;                                                      \
2029                                                                      \
2030     for (i = env->vstart; i < vl; i++) {                             \
2031         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2032     }                                                                \
2033     env->vstart = 0;                                                 \
2034     /* set tail elements to 1s */                                    \
2035     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2036 }
2037 
2038 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2039 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2040 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2041 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2042 
2043 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2044 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2045                   CPURISCVState *env, uint32_t desc)                 \
2046 {                                                                    \
2047     uint32_t vl = env->vl;                                           \
2048     uint32_t esz = sizeof(ETYPE);                                    \
2049     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2050     uint32_t vta = vext_vta(desc);                                   \
2051     uint32_t i;                                                      \
2052                                                                      \
2053     for (i = env->vstart; i < vl; i++) {                             \
2054         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2055         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2056     }                                                                \
2057     env->vstart = 0;                                                 \
2058     /* set tail elements to 1s */                                    \
2059     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2060 }
2061 
2062 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2063 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2064 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2065 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2066 
2067 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2068 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2069                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2070 {                                                                    \
2071     uint32_t vl = env->vl;                                           \
2072     uint32_t esz = sizeof(ETYPE);                                    \
2073     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2074     uint32_t vta = vext_vta(desc);                                   \
2075     uint32_t i;                                                      \
2076                                                                      \
2077     for (i = env->vstart; i < vl; i++) {                             \
2078         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2079         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2080                    (ETYPE)(target_long)s1);                          \
2081         *((ETYPE *)vd + H(i)) = d;                                   \
2082     }                                                                \
2083     env->vstart = 0;                                                 \
2084     /* set tail elements to 1s */                                    \
2085     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2086 }
2087 
2088 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2089 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2090 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2091 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2092 
2093 /*
2094  * Vector Fixed-Point Arithmetic Instructions
2095  */
2096 
2097 /* Vector Single-Width Saturating Add and Subtract */
2098 
2099 /*
2100  * Fixed-point instructions need a rounding mode and may saturate, so
2101  * define the common macros for fixed point here.
2102  */
2103 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2104                           CPURISCVState *env, int vxrm);
2105 
2106 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
2107 static inline void                                                  \
2108 do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
2109           CPURISCVState *env, int vxrm)                             \
2110 {                                                                   \
2111     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
2112     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2113     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
2114 }
2115 
2116 static inline void
2117 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2118              CPURISCVState *env,
2119              uint32_t vl, uint32_t vm, int vxrm,
2120              opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2121 {
2122     for (uint32_t i = env->vstart; i < vl; i++) {
2123         if (!vm && !vext_elem_mask(v0, i)) {
2124             /* set masked-off elements to 1s */
2125             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2126             continue;
2127         }
2128         fn(vd, vs1, vs2, i, env, vxrm);
2129     }
2130     env->vstart = 0;
2131 }
2132 
2133 static inline void
2134 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2135              CPURISCVState *env,
2136              uint32_t desc,
2137              opivv2_rm_fn *fn, uint32_t esz)
2138 {
2139     uint32_t vm = vext_vm(desc);
2140     uint32_t vl = env->vl;
2141     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2142     uint32_t vta = vext_vta(desc);
2143     uint32_t vma = vext_vma(desc);
2144 
2145     switch (env->vxrm) {
2146     case 0: /* rnu */
2147         vext_vv_rm_1(vd, v0, vs1, vs2,
2148                      env, vl, vm, 0, fn, vma, esz);
2149         break;
2150     case 1: /* rne */
2151         vext_vv_rm_1(vd, v0, vs1, vs2,
2152                      env, vl, vm, 1, fn, vma, esz);
2153         break;
2154     case 2: /* rdn */
2155         vext_vv_rm_1(vd, v0, vs1, vs2,
2156                      env, vl, vm, 2, fn, vma, esz);
2157         break;
2158     default: /* rod */
2159         vext_vv_rm_1(vd, v0, vs1, vs2,
2160                      env, vl, vm, 3, fn, vma, esz);
2161         break;
2162     }
2163     /* set tail elements to 1s */
2164     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2165 }
2166 
2167 /* generate helpers for fixed point instructions with OPIVV format */
2168 #define GEN_VEXT_VV_RM(NAME, ESZ)                               \
2169 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
2170                   CPURISCVState *env, uint32_t desc)            \
2171 {                                                               \
2172     vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,                   \
2173                  do_##NAME, ESZ);                               \
2174 }
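
/*
 * For reference, GEN_VEXT_VV_RM(vsaddu_vv_b, 1) expands to roughly:
 *
 *     void HELPER(vsaddu_vv_b)(void *vd, void *v0, void *vs1, void *vs2,
 *                              CPURISCVState *env, uint32_t desc)
 *     {
 *         vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_vsaddu_vv_b, 1);
 *     }
 *
 * where do_vsaddu_vv_b is the per-element function emitted below by
 * RVVCALL(OPIVV2_RM, ...).
 */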
2175 
2176 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a,
2177                              uint8_t b)
2178 {
2179     uint8_t res = a + b;
2180     if (res < a) {
2181         res = UINT8_MAX;
2182         env->vxsat = 0x1;
2183     }
2184     return res;
2185 }
2186 
2187 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2188                                uint16_t b)
2189 {
2190     uint16_t res = a + b;
2191     if (res < a) {
2192         res = UINT16_MAX;
2193         env->vxsat = 0x1;
2194     }
2195     return res;
2196 }
2197 
2198 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2199                                uint32_t b)
2200 {
2201     uint32_t res = a + b;
2202     if (res < a) {
2203         res = UINT32_MAX;
2204         env->vxsat = 0x1;
2205     }
2206     return res;
2207 }
2208 
2209 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2210                                uint64_t b)
2211 {
2212     uint64_t res = a + b;
2213     if (res < a) {
2214         res = UINT64_MAX;
2215         env->vxsat = 0x1;
2216     }
2217     return res;
2218 }
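
/*
 * Unsigned wrap-around is detected by res < a: e.g. with uint8_t
 * operands, 200 + 100 wraps to 44 (< 200), so the result saturates to
 * UINT8_MAX and vxsat is set.
 */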
2219 
2220 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2221 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2222 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2223 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2224 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2225 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2226 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2227 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2228 
2229 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2230                           CPURISCVState *env, int vxrm);
2231 
2232 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
2233 static inline void                                                  \
2234 do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
2235           CPURISCVState *env, int vxrm)                             \
2236 {                                                                   \
2237     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
2238     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1);           \
2239 }
2240 
2241 static inline void
2242 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2243              CPURISCVState *env,
2244              uint32_t vl, uint32_t vm, int vxrm,
2245              opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2246 {
2247     for (uint32_t i = env->vstart; i < vl; i++) {
2248         if (!vm && !vext_elem_mask(v0, i)) {
2249             /* set masked-off elements to 1s */
2250             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2251             continue;
2252         }
2253         fn(vd, s1, vs2, i, env, vxrm);
2254     }
2255     env->vstart = 0;
2256 }
2257 
2258 static inline void
2259 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2260              CPURISCVState *env,
2261              uint32_t desc,
2262              opivx2_rm_fn *fn, uint32_t esz)
2263 {
2264     uint32_t vm = vext_vm(desc);
2265     uint32_t vl = env->vl;
2266     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2267     uint32_t vta = vext_vta(desc);
2268     uint32_t vma = vext_vma(desc);
2269 
2270     switch (env->vxrm) {
2271     case 0: /* rnu */
2272         vext_vx_rm_1(vd, v0, s1, vs2,
2273                      env, vl, vm, 0, fn, vma, esz);
2274         break;
2275     case 1: /* rne */
2276         vext_vx_rm_1(vd, v0, s1, vs2,
2277                      env, vl, vm, 1, fn, vma, esz);
2278         break;
2279     case 2: /* rdn */
2280         vext_vx_rm_1(vd, v0, s1, vs2,
2281                      env, vl, vm, 2, fn, vma, esz);
2282         break;
2283     default: /* rod */
2284         vext_vx_rm_1(vd, v0, s1, vs2,
2285                      env, vl, vm, 3, fn, vma, esz);
2286         break;
2287     }
2288     /* set tail elements to 1s */
2289     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2290 }
2291 
2292 /* generate helpers for fixed point instructions with OPIVX format */
2293 #define GEN_VEXT_VX_RM(NAME, ESZ)                         \
2294 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
2295                   void *vs2, CPURISCVState *env,          \
2296                   uint32_t desc)                          \
2297 {                                                         \
2298     vext_vx_rm_2(vd, v0, s1, vs2, env, desc,              \
2299                  do_##NAME, ESZ);                         \
2300 }
2301 
2302 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2303 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2304 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2305 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2306 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2307 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2308 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2309 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2310 
2311 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2312 {
2313     int8_t res = a + b;
2314     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2315         res = a > 0 ? INT8_MAX : INT8_MIN;
2316         env->vxsat = 0x1;
2317     }
2318     return res;
2319 }
2320 
2321 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a,
2322                              int16_t b)
2323 {
2324     int16_t res = a + b;
2325     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2326         res = a > 0 ? INT16_MAX : INT16_MIN;
2327         env->vxsat = 0x1;
2328     }
2329     return res;
2330 }
2331 
2332 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a,
2333                              int32_t b)
2334 {
2335     int32_t res = a + b;
2336     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2337         res = a > 0 ? INT32_MAX : INT32_MIN;
2338         env->vxsat = 0x1;
2339     }
2340     return res;
2341 }
2342 
2343 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a,
2344                              int64_t b)
2345 {
2346     int64_t res = a + b;
2347     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2348         res = a > 0 ? INT64_MAX : INT64_MIN;
2349         env->vxsat = 0x1;
2350     }
2351     return res;
2352 }
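
/*
 * Signed overflow occurred iff both addends have the same sign and the
 * result's sign differs, i.e. the sign bit of (res ^ a) & (res ^ b) is
 * set.  E.g. with int8_t operands, 100 + 100 wraps to -56; both XORs
 * have bit 7 set, so the result saturates to INT8_MAX (a > 0).
 */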
2353 
2354 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2355 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2356 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2357 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2358 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2359 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2360 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2361 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2362 
2363 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2364 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2365 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2366 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2367 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2368 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2369 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2370 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2371 
2372 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a,
2373                              uint8_t b)
2374 {
2375     uint8_t res = a - b;
2376     if (res > a) {
2377         res = 0;
2378         env->vxsat = 0x1;
2379     }
2380     return res;
2381 }
2382 
2383 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2384                                uint16_t b)
2385 {
2386     uint16_t res = a - b;
2387     if (res > a) {
2388         res = 0;
2389         env->vxsat = 0x1;
2390     }
2391     return res;
2392 }
2393 
2394 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2395                                uint32_t b)
2396 {
2397     uint32_t res = a - b;
2398     if (res > a) {
2399         res = 0;
2400         env->vxsat = 0x1;
2401     }
2402     return res;
2403 }
2404 
2405 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2406                                uint64_t b)
2407 {
2408     uint64_t res = a - b;
2409     if (res > a) {
2410         res = 0;
2411         env->vxsat = 0x1;
2412     }
2413     return res;
2414 }
2415 
2416 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2417 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2418 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2419 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2420 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2421 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2422 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2423 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2424 
2425 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2426 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2427 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2428 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2429 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2430 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2431 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2432 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2433 
2434 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2435 {
2436     int8_t res = a - b;
2437     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2438         res = a >= 0 ? INT8_MAX : INT8_MIN;
2439         env->vxsat = 0x1;
2440     }
2441     return res;
2442 }
2443 
2444 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a,
2445                              int16_t b)
2446 {
2447     int16_t res = a - b;
2448     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2449         res = a >= 0 ? INT16_MAX : INT16_MIN;
2450         env->vxsat = 0x1;
2451     }
2452     return res;
2453 }
2454 
2455 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a,
2456                              int32_t b)
2457 {
2458     int32_t res = a - b;
2459     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2460         res = a >= 0 ? INT32_MAX : INT32_MIN;
2461         env->vxsat = 0x1;
2462     }
2463     return res;
2464 }
2465 
2466 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a,
2467                              int64_t b)
2468 {
2469     int64_t res = a - b;
2470     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2471         res = a >= 0 ? INT64_MAX : INT64_MIN;
2472         env->vxsat = 0x1;
2473     }
2474     return res;
2475 }
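
/*
 * For subtraction, overflow requires the operands to have different
 * signs and the result to take b's sign, hence (res ^ a) & (a ^ b).
 * E.g. with int8_t operands, -100 - 100 wraps to 56, which is caught
 * and saturated to INT8_MIN (a < 0).
 */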
2476 
2477 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2478 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2479 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2480 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2481 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2482 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2483 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2484 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2485 
2486 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2487 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2488 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2489 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2490 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2491 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2492 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2493 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2494 
2495 /* Vector Single-Width Averaging Add and Subtract */
2496 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2497 {
2498     uint8_t d = extract64(v, shift, 1);
2499     uint8_t d1;
2500     uint64_t D1, D2;
2501 
2502     if (shift == 0 || shift > 64) {
2503         return 0;
2504     }
2505 
2506     d1 = extract64(v, shift - 1, 1);
2507     D1 = extract64(v, 0, shift);
2508     if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2509         return d1;
2510     } else if (vxrm == 1) { /* round-to-nearest-even */
2511         if (shift > 1) {
2512             D2 = extract64(v, 0, shift - 1);
2513             return d1 & ((D2 != 0) | d);
2514         } else {
2515             return d1 & d;
2516         }
2517     } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2518         return !d & (D1 != 0);
2519     }
2520     return 0; /* round-down (truncate) */
2521 }
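
/*
 * Worked example (hypothetical value): v = 6 (0b110) with shift = 2, so
 * the truncated result is v >> 2 = 1 and the discarded bits are 0b10:
 *     rnu (0): round bit d1 = 1              -> 1 + 1 = 2
 *     rne (1): tie, rounds to even           -> 2
 *     rdn (2): truncate                      -> 1
 *     rod (3): result is already odd (d = 1) -> 1
 */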
2522 
2523 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a,
2524                              int32_t b)
2525 {
2526     int64_t res = (int64_t)a + b;
2527     uint8_t round = get_round(vxrm, res, 1);
2528 
2529     return (res >> 1) + round;
2530 }
2531 
2532 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a,
2533                              int64_t b)
2534 {
2535     int64_t res = a + b;
2536     uint8_t round = get_round(vxrm, res, 1);
2537     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2538 
2539     /* With signed overflow, bit 64 is inverse of bit 63. */
2540     return ((res >> 1) ^ over) + round;
2541 }
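
/*
 * E.g. a = b = INT64_MAX: the true sum 2**64 - 2 wraps to res = -2 and
 * 'over' is set, so ((res >> 1) ^ over) restores the correct average
 * INT64_MAX before rounding.
 */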
2542 
2543 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2544 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2545 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2546 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2547 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2548 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2549 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2550 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2551 
2552 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2553 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2554 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2555 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2556 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2557 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2558 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2559 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2560 
2561 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2562                                uint32_t a, uint32_t b)
2563 {
2564     uint64_t res = (uint64_t)a + b;
2565     uint8_t round = get_round(vxrm, res, 1);
2566 
2567     return (res >> 1) + round;
2568 }
2569 
2570 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2571                                uint64_t a, uint64_t b)
2572 {
2573     uint64_t res = a + b;
2574     uint8_t round = get_round(vxrm, res, 1);
2575     uint64_t over = (uint64_t)(res < a) << 63;
2576 
2577     return ((res >> 1) | over) + round;
2578 }
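
/*
 * The unsigned carry-out (res < a) is folded back in as bit 63 of the
 * halved sum: e.g. UINT64_MAX + 1 wraps to 0 with a carry, and
 * ((0 >> 1) | (1ull << 63)) gives 2**63, i.e. (2**64) / 2.
 */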
2579 
2580 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2581 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2582 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2583 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2584 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2585 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2586 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2587 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2588 
2589 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2590 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2591 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2592 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2593 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2594 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2595 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2596 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2597 
2598 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a,
2599                              int32_t b)
2600 {
2601     int64_t res = (int64_t)a - b;
2602     uint8_t round = get_round(vxrm, res, 1);
2603 
2604     return (res >> 1) + round;
2605 }
2606 
2607 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a,
2608                              int64_t b)
2609 {
2610     int64_t res = (int64_t)a - b;
2611     uint8_t round = get_round(vxrm, res, 1);
2612     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2613 
2614     /* With signed overflow, bit 64 is inverse of bit 63. */
2615     return ((res >> 1) ^ over) + round;
2616 }
2617 
2618 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2619 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2620 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2621 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2622 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2623 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2624 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2625 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2626 
2627 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2628 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2629 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2630 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2631 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2632 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2633 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2634 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2635 
2636 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2637                                uint32_t a, uint32_t b)
2638 {
2639     int64_t res = (int64_t)a - b;
2640     uint8_t round = get_round(vxrm, res, 1);
2641 
2642     return (res >> 1) + round;
2643 }
2644 
2645 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2646                                uint64_t a, uint64_t b)
2647 {
2648     uint64_t res = (uint64_t)a - b;
2649     uint8_t round = get_round(vxrm, res, 1);
2650     uint64_t over = (uint64_t)(res > a) << 63;
2651 
2652     return ((res >> 1) | over) + round;
2653 }
2654 
2655 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2656 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2657 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2658 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2659 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2660 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2661 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2662 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2663 
2664 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2665 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2666 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2667 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2668 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2669 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2670 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2671 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2672 
2673 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
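/*
 * vsmul computes (a * b) >> (SEW - 1) in a double-width intermediate
 * (a 128-bit hi/lo pair for the 64-bit case), adds the vxrm rounding
 * increment from get_round(), and saturates to the signed SEW range,
 * setting vxsat when saturation occurs.  Only the 64-bit variant needs
 * the explicit INT64_MIN * INT64_MIN special case, since no wider
 * integer type is available there.
 */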
2674 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2675 {
2676     uint8_t round;
2677     int16_t res;
2678 
2679     res = (int16_t)a * (int16_t)b;
2680     round = get_round(vxrm, res, 7);
2681     res = (res >> 7) + round;
2682 
2683     if (res > INT8_MAX) {
2684         env->vxsat = 0x1;
2685         return INT8_MAX;
2686     } else if (res < INT8_MIN) {
2687         env->vxsat = 0x1;
2688         return INT8_MIN;
2689     } else {
2690         return res;
2691     }
2692 }
2693 
2694 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2695 {
2696     uint8_t round;
2697     int32_t res;
2698 
2699     res = (int32_t)a * (int32_t)b;
2700     round = get_round(vxrm, res, 15);
2701     res = (res >> 15) + round;
2702 
2703     if (res > INT16_MAX) {
2704         env->vxsat = 0x1;
2705         return INT16_MAX;
2706     } else if (res < INT16_MIN) {
2707         env->vxsat = 0x1;
2708         return INT16_MIN;
2709     } else {
2710         return res;
2711     }
2712 }
2713 
2714 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2715 {
2716     uint8_t round;
2717     int64_t res;
2718 
2719     res = (int64_t)a * (int64_t)b;
2720     round = get_round(vxrm, res, 31);
2721     res = (res >> 31) + round;
2722 
2723     if (res > INT32_MAX) {
2724         env->vxsat = 0x1;
2725         return INT32_MAX;
2726     } else if (res < INT32_MIN) {
2727         env->vxsat = 0x1;
2728         return INT32_MIN;
2729     } else {
2730         return res;
2731     }
2732 }
2733 
2734 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2735 {
2736     uint8_t round;
2737     uint64_t hi_64, lo_64;
2738     int64_t res;
2739 
2740     if (a == INT64_MIN && b == INT64_MIN) {
2741         env->vxsat = 1;
2742         return INT64_MAX;
2743     }
2744 
2745     muls64(&lo_64, &hi_64, a, b);
2746     round = get_round(vxrm, lo_64, 63);
2747     /*
2748      * Cannot overflow, as there are always
2749      * 2 sign bits after multiply.
2750      */
2751     res = (hi_64 << 1) | (lo_64 >> 63);
2752     if (round) {
2753         if (res == INT64_MAX) {
2754             env->vxsat = 1;
2755         } else {
2756             res += 1;
2757         }
2758     }
2759     return res;
2760 }
2761 
2762 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2763 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2764 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2765 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2766 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2767 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2768 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2769 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2770 
2771 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2772 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2773 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2774 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2775 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2776 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2777 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2778 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2779 
2780 /* Vector Single-Width Scaling Shift Instructions */
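/*
 * vssrl/vssra are plain logical/arithmetic right shifts by the low
 * log2(SEW) bits of the shift operand, with the vxrm rounding increment
 * added to the shifted result; unlike vnclip they never saturate.
 */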
2781 static inline uint8_t
2782 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2783 {
2784     uint8_t round, shift = b & 0x7;
2785     uint8_t res;
2786 
2787     round = get_round(vxrm, a, shift);
2788     res = (a >> shift) + round;
2789     return res;
2790 }
2791 static inline uint16_t
2792 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2793 {
2794     uint8_t round, shift = b & 0xf;
2795 
2796     round = get_round(vxrm, a, shift);
2797     return (a >> shift) + round;
2798 }
2799 static inline uint32_t
2800 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2801 {
2802     uint8_t round, shift = b & 0x1f;
2803 
2804     round = get_round(vxrm, a, shift);
2805     return (a >> shift) + round;
2806 }
2807 static inline uint64_t
2808 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2809 {
2810     uint8_t round, shift = b & 0x3f;
2811 
2812     round = get_round(vxrm, a, shift);
2813     return (a >> shift) + round;
2814 }
2815 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2816 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2817 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2818 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2819 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2820 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2821 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2822 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2823 
2824 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2825 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2826 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2827 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2828 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2829 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2830 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2831 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2832 
2833 static inline int8_t
2834 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2835 {
2836     uint8_t round, shift = b & 0x7;
2837 
2838     round = get_round(vxrm, a, shift);
2839     return (a >> shift) + round;
2840 }
2841 static inline int16_t
2842 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2843 {
2844     uint8_t round, shift = b & 0xf;
2845 
2846     round = get_round(vxrm, a, shift);
2847     return (a >> shift) + round;
2848 }
2849 static inline int32_t
2850 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2851 {
2852     uint8_t round, shift = b & 0x1f;
2853 
2854     round = get_round(vxrm, a, shift);
2855     return (a >> shift) + round;
2856 }
2857 static inline int64_t
2858 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2859 {
2860     uint8_t round, shift = b & 0x3f;
2861 
2862     round = get_round(vxrm, a, shift);
2863     return (a >> shift) + round;
2864 }
2865 
2866 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2867 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2868 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2869 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2870 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2871 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2872 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2873 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2874 
2875 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2876 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2877 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2878 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2879 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2880 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2881 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2882 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2883 
2884 /* Vector Narrowing Fixed-Point Clip Instructions */
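/*
 * vnclip/vnclipu take a 2*SEW-wide source element, shift it right by
 * the low log2(2*SEW) bits of the shift operand, add the vxrm rounding
 * increment, and saturate to the signed/unsigned SEW range, setting
 * vxsat when the value does not fit.
 */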
2885 static inline int8_t
2886 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2887 {
2888     uint8_t round, shift = b & 0xf;
2889     int16_t res;
2890 
2891     round = get_round(vxrm, a, shift);
2892     res = (a >> shift) + round;
2893     if (res > INT8_MAX) {
2894         env->vxsat = 0x1;
2895         return INT8_MAX;
2896     } else if (res < INT8_MIN) {
2897         env->vxsat = 0x1;
2898         return INT8_MIN;
2899     } else {
2900         return res;
2901     }
2902 }
2903 
2904 static inline int16_t
2905 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2906 {
2907     uint8_t round, shift = b & 0x1f;
2908     int32_t res;
2909 
2910     round = get_round(vxrm, a, shift);
2911     res = (a >> shift) + round;
2912     if (res > INT16_MAX) {
2913         env->vxsat = 0x1;
2914         return INT16_MAX;
2915     } else if (res < INT16_MIN) {
2916         env->vxsat = 0x1;
2917         return INT16_MIN;
2918     } else {
2919         return res;
2920     }
2921 }
2922 
2923 static inline int32_t
2924 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2925 {
2926     uint8_t round, shift = b & 0x3f;
2927     int64_t res;
2928 
2929     round = get_round(vxrm, a, shift);
2930     res = (a >> shift) + round;
2931     if (res > INT32_MAX) {
2932         env->vxsat = 0x1;
2933         return INT32_MAX;
2934     } else if (res < INT32_MIN) {
2935         env->vxsat = 0x1;
2936         return INT32_MIN;
2937     } else {
2938         return res;
2939     }
2940 }
2941 
2942 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2943 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2944 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2945 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2946 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2947 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2948 
2949 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2950 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2951 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2952 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2953 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2954 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2955 
2956 static inline uint8_t
2957 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2958 {
2959     uint8_t round, shift = b & 0xf;
2960     uint16_t res;
2961 
2962     round = get_round(vxrm, a, shift);
2963     res = (a >> shift) + round;
2964     if (res > UINT8_MAX) {
2965         env->vxsat = 0x1;
2966         return UINT8_MAX;
2967     } else {
2968         return res;
2969     }
2970 }
2971 
2972 static inline uint16_t
2973 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2974 {
2975     uint8_t round, shift = b & 0x1f;
2976     uint32_t res;
2977 
2978     round = get_round(vxrm, a, shift);
2979     res = (a >> shift) + round;
2980     if (res > UINT16_MAX) {
2981         env->vxsat = 0x1;
2982         return UINT16_MAX;
2983     } else {
2984         return res;
2985     }
2986 }
2987 
2988 static inline uint32_t
2989 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2990 {
2991     uint8_t round, shift = b & 0x3f;
2992     uint64_t res;
2993 
2994     round = get_round(vxrm, a, shift);
2995     res = (a >> shift) + round;
2996     if (res > UINT32_MAX) {
2997         env->vxsat = 0x1;
2998         return UINT32_MAX;
2999     } else {
3000         return res;
3001     }
3002 }
3003 
3004 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
3005 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
3006 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
3007 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
3008 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
3009 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
3010 
3011 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
3012 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3013 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3014 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3015 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3016 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3017 
3018 /*
3019  * Vector Floating-Point Arithmetic Instructions
3020  */
3021 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3022 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
3023 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
3024                       CPURISCVState *env)                      \
3025 {                                                              \
3026     TX1 s1 = *((T1 *)vs1 + HS1(i));                            \
3027     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3028     *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status);         \
3029 }
3030 
3031 #define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
3032 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
3033                   void *vs2, CPURISCVState *env,          \
3034                   uint32_t desc)                          \
3035 {                                                         \
3036     uint32_t vm = vext_vm(desc);                          \
3037     uint32_t vl = env->vl;                                \
3038     uint32_t total_elems =                                \
3039         vext_get_total_elems(env, desc, ESZ);             \
3040     uint32_t vta = vext_vta(desc);                        \
3041     uint32_t vma = vext_vma(desc);                        \
3042     uint32_t i;                                           \
3043                                                           \
3044     for (i = env->vstart; i < vl; i++) {                  \
3045         if (!vm && !vext_elem_mask(v0, i)) {              \
3046             /* set masked-off elements to 1s */           \
3047             vext_set_elems_1s(vd, vma, i * ESZ,           \
3048                               (i + 1) * ESZ);             \
3049             continue;                                     \
3050         }                                                 \
3051         do_##NAME(vd, vs1, vs2, i, env);                  \
3052     }                                                     \
3053     env->vstart = 0;                                      \
3054     /* set tail elements to 1s */                         \
3055     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3056                       total_elems * ESZ);                 \
3057 }
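
/*
 * The helpers generated here iterate from vstart to vl, fill masked-off
 * elements with 1s according to the mask-agnostic setting, evaluate the
 * operation through env->fp_status (so the dynamic rounding mode is used
 * and exception flags accumulate there), and finally fill the tail
 * elements with 1s according to the tail-agnostic setting.
 */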
3058 
3059 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3060 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3061 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3062 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3063 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3064 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3065 
3066 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
3067 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3068                       CPURISCVState *env)                      \
3069 {                                                              \
3070     TX2 s2 = *((T2 *)vs2 + HS2(i));                            \
3071     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3072 }
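
/*
 * The scalar operand of the _vf forms arrives as a uint64_t; the
 * (TX1)(T1) cast truncates it to the element type before the operation.
 */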
3073 
3074 #define GEN_VEXT_VF(NAME, ESZ)                            \
3075 void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
3076                   void *vs2, CPURISCVState *env,          \
3077                   uint32_t desc)                          \
3078 {                                                         \
3079     uint32_t vm = vext_vm(desc);                          \
3080     uint32_t vl = env->vl;                                \
3081     uint32_t total_elems =                                \
3082         vext_get_total_elems(env, desc, ESZ);             \
3083     uint32_t vta = vext_vta(desc);                        \
3084     uint32_t vma = vext_vma(desc);                        \
3085     uint32_t i;                                           \
3086                                                           \
3087     for (i = env->vstart; i < vl; i++) {                  \
3088         if (!vm && !vext_elem_mask(v0, i)) {              \
3089             /* set masked-off elements to 1s */           \
3090             vext_set_elems_1s(vd, vma, i * ESZ,           \
3091                               (i + 1) * ESZ);             \
3092             continue;                                     \
3093         }                                                 \
3094         do_##NAME(vd, s1, vs2, i, env);                   \
3095     }                                                     \
3096     env->vstart = 0;                                      \
3097     /* set tail elements to 1s */                         \
3098     vext_set_elems_1s(vd, vta, vl * ESZ,                  \
3099                       total_elems * ESZ);                 \
3100 }
3101 
3102 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3103 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3104 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3105 GEN_VEXT_VF(vfadd_vf_h, 2)
3106 GEN_VEXT_VF(vfadd_vf_w, 4)
3107 GEN_VEXT_VF(vfadd_vf_d, 8)
3108 
3109 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3110 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3111 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3112 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3113 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3114 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3115 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3116 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3117 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3118 GEN_VEXT_VF(vfsub_vf_h, 2)
3119 GEN_VEXT_VF(vfsub_vf_w, 4)
3120 GEN_VEXT_VF(vfsub_vf_d, 8)
3121 
3122 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3123 {
3124     return float16_sub(b, a, s);
3125 }
3126 
3127 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3128 {
3129     return float32_sub(b, a, s);
3130 }
3131 
3132 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3133 {
3134     return float64_sub(b, a, s);
3135 }
3136 
3137 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3138 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3139 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3140 GEN_VEXT_VF(vfrsub_vf_h, 2)
3141 GEN_VEXT_VF(vfrsub_vf_w, 4)
3142 GEN_VEXT_VF(vfrsub_vf_d, 8)
3143 
3144 /* Vector Widening Floating-Point Add/Subtract Instructions */
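/*
 * The widening forms promote each SEW operand to 2*SEW with
 * float16_to_float32()/float32_to_float64() (exact conversions) and then
 * use the 2*SEW operation, so the result is rounded only once, at the
 * wider precision.  The 'true' argument selects the IEEE half-precision
 * encoding rather than the ARM alternative format.
 */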
3145 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3146 {
3147     return float32_add(float16_to_float32(a, true, s),
3148                        float16_to_float32(b, true, s), s);
3149 }
3150 
3151 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3152 {
3153     return float64_add(float32_to_float64(a, s),
3154                        float32_to_float64(b, s), s);
3155 
3156 }
3157 
3158 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3159 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3160 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3161 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3162 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3163 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3164 GEN_VEXT_VF(vfwadd_vf_h, 4)
3165 GEN_VEXT_VF(vfwadd_vf_w, 8)
3166 
3167 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3168 {
3169     return float32_sub(float16_to_float32(a, true, s),
3170                        float16_to_float32(b, true, s), s);
3171 }
3172 
3173 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3174 {
3175     return float64_sub(float32_to_float64(a, s),
3176                        float32_to_float64(b, s), s);
3177 
3178 }
3179 
3180 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3181 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3182 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3183 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3184 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3185 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3186 GEN_VEXT_VF(vfwsub_vf_h, 4)
3187 GEN_VEXT_VF(vfwsub_vf_w, 8)
3188 
3189 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3190 {
3191     return float32_add(a, float16_to_float32(b, true, s), s);
3192 }
3193 
3194 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3195 {
3196     return float64_add(a, float32_to_float64(b, s), s);
3197 }
3198 
3199 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3200 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3201 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3202 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3203 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3204 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3205 GEN_VEXT_VF(vfwadd_wf_h, 4)
3206 GEN_VEXT_VF(vfwadd_wf_w, 8)
3207 
3208 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3209 {
3210     return float32_sub(a, float16_to_float32(b, true, s), s);
3211 }
3212 
3213 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3214 {
3215     return float64_sub(a, float32_to_float64(b, s), s);
3216 }
3217 
3218 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3219 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3220 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3221 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3222 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3223 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3224 GEN_VEXT_VF(vfwsub_wf_h, 4)
3225 GEN_VEXT_VF(vfwsub_wf_w, 8)
3226 
3227 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3228 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3229 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3230 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3231 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3232 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3233 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3234 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3235 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3236 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3237 GEN_VEXT_VF(vfmul_vf_h, 2)
3238 GEN_VEXT_VF(vfmul_vf_w, 4)
3239 GEN_VEXT_VF(vfmul_vf_d, 8)
3240 
3241 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3242 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3243 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3244 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3245 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3246 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3247 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3248 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3249 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3250 GEN_VEXT_VF(vfdiv_vf_h, 2)
3251 GEN_VEXT_VF(vfdiv_vf_w, 4)
3252 GEN_VEXT_VF(vfdiv_vf_d, 8)
3253 
3254 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3255 {
3256     return float16_div(b, a, s);
3257 }
3258 
3259 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3260 {
3261     return float32_div(b, a, s);
3262 }
3263 
3264 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3265 {
3266     return float64_div(b, a, s);
3267 }
3268 
3269 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3270 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3271 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3272 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3273 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3274 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3275 
3276 /* Vector Widening Floating-Point Multiply */
3277 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3278 {
3279     return float32_mul(float16_to_float32(a, true, s),
3280                        float16_to_float32(b, true, s), s);
3281 }
3282 
3283 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3284 {
3285     return float64_mul(float32_to_float64(a, s),
3286                        float32_to_float64(b, s), s);
3287 
3288 }
3289 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3290 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3291 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3292 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3293 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3294 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3295 GEN_VEXT_VF(vfwmul_vf_h, 4)
3296 GEN_VEXT_VF(vfwmul_vf_w, 8)
3297 
3298 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3299 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
3300 static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
3301                       CPURISCVState *env)                          \
3302 {                                                                  \
3303     TX1 s1 = *((T1 *)vs1 + HS1(i));                                \
3304     TX2 s2 = *((T2 *)vs2 + HS2(i));                                \
3305     TD d = *((TD *)vd + HD(i));                                    \
3306     *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status);          \
3307 }
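
/*
 * In the fused multiply-add helpers, d is the current destination
 * element (the accumulator).  With a = s2 and b = s1, the fmacc-style
 * ops compute vs1 * vs2 + vd, while the fmadd-style ops below pass d as
 * the first multiplicand to compute vs1 * vd + vs2.
 */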
3308 
3309 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3310 {
3311     return float16_muladd(a, b, d, 0, s);
3312 }
3313 
3314 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3315 {
3316     return float32_muladd(a, b, d, 0, s);
3317 }
3318 
3319 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3320 {
3321     return float64_muladd(a, b, d, 0, s);
3322 }
3323 
3324 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3325 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3326 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3327 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3328 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3329 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3330 
3331 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
3332 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
3333                       CPURISCVState *env)                         \
3334 {                                                                 \
3335     TX2 s2 = *((T2 *)vs2 + HS2(i));                               \
3336     TD d = *((TD *)vd + HD(i));                                   \
3337     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3338 }
3339 
3340 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3341 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3342 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3343 GEN_VEXT_VF(vfmacc_vf_h, 2)
3344 GEN_VEXT_VF(vfmacc_vf_w, 4)
3345 GEN_VEXT_VF(vfmacc_vf_d, 8)
3346 
3347 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3348 {
3349     return float16_muladd(a, b, d, float_muladd_negate_c |
3350                                    float_muladd_negate_product, s);
3351 }
3352 
3353 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3354 {
3355     return float32_muladd(a, b, d, float_muladd_negate_c |
3356                                    float_muladd_negate_product, s);
3357 }
3358 
3359 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3360 {
3361     return float64_muladd(a, b, d, float_muladd_negate_c |
3362                                    float_muladd_negate_product, s);
3363 }
3364 
3365 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3366 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3367 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3368 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3369 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3370 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3371 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3372 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3373 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3374 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3375 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3376 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3377 
3378 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3379 {
3380     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3381 }
3382 
3383 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3384 {
3385     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3386 }
3387 
3388 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3389 {
3390     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3391 }
3392 
3393 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3394 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3395 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3396 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3397 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3398 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3399 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3400 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3401 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3402 GEN_VEXT_VF(vfmsac_vf_h, 2)
3403 GEN_VEXT_VF(vfmsac_vf_w, 4)
3404 GEN_VEXT_VF(vfmsac_vf_d, 8)
3405 
3406 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3407 {
3408     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3409 }
3410 
3411 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3412 {
3413     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3414 }
3415 
3416 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3417 {
3418     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3419 }
3420 
3421 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3422 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3423 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3424 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3425 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3426 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3427 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3428 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3429 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3430 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3431 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3432 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3433 
3434 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3435 {
3436     return float16_muladd(d, b, a, 0, s);
3437 }
3438 
3439 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3440 {
3441     return float32_muladd(d, b, a, 0, s);
3442 }
3443 
3444 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3445 {
3446     return float64_muladd(d, b, a, 0, s);
3447 }
3448 
3449 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3450 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3451 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3452 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3453 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3454 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3455 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3456 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3457 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3458 GEN_VEXT_VF(vfmadd_vf_h, 2)
3459 GEN_VEXT_VF(vfmadd_vf_w, 4)
3460 GEN_VEXT_VF(vfmadd_vf_d, 8)
3461 
3462 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3463 {
3464     return float16_muladd(d, b, a, float_muladd_negate_c |
3465                                    float_muladd_negate_product, s);
3466 }
3467 
3468 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3469 {
3470     return float32_muladd(d, b, a, float_muladd_negate_c |
3471                                    float_muladd_negate_product, s);
3472 }
3473 
3474 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3475 {
3476     return float64_muladd(d, b, a, float_muladd_negate_c |
3477                                    float_muladd_negate_product, s);
3478 }
3479 
3480 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3481 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3482 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3483 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3484 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3485 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3486 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3487 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3488 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3489 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3490 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3491 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3492 
3493 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3494 {
3495     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3496 }
3497 
3498 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3499 {
3500     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3501 }
3502 
3503 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3504 {
3505     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3506 }
3507 
3508 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3509 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3510 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3511 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3512 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3513 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3514 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3515 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3516 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3517 GEN_VEXT_VF(vfmsub_vf_h, 2)
3518 GEN_VEXT_VF(vfmsub_vf_w, 4)
3519 GEN_VEXT_VF(vfmsub_vf_d, 8)
3520 
3521 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3522 {
3523     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3524 }
3525 
3526 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3527 {
3528     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3529 }
3530 
3531 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3532 {
3533     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3534 }
3535 
3536 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3537 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3538 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3539 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3540 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3541 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3542 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3543 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3544 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3545 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3546 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3547 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3548 
3549 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3550 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3551 {
3552     return float32_muladd(float16_to_float32(a, true, s),
3553                           float16_to_float32(b, true, s), d, 0, s);
3554 }
3555 
3556 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3557 {
3558     return float64_muladd(float32_to_float64(a, s),
3559                           float32_to_float64(b, s), d, 0, s);
3560 }
3561 
3562 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3563 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3564 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3565 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3566 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3567 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3568 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3569 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3570 
3571 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3572 {
3573     return float32_muladd(float16_to_float32(a, true, s),
3574                           float16_to_float32(b, true, s), d,
3575                           float_muladd_negate_c | float_muladd_negate_product,
3576                           s);
3577 }
3578 
3579 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3580 {
3581     return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s),
3582                           d, float_muladd_negate_c |
3583                              float_muladd_negate_product, s);
3584 }
3585 
3586 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3587 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3588 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3589 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3590 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3591 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3592 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3593 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3594 
3595 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3596 {
3597     return float32_muladd(float16_to_float32(a, true, s),
3598                           float16_to_float32(b, true, s), d,
3599                           float_muladd_negate_c, s);
3600 }
3601 
3602 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3603 {
3604     return float64_muladd(float32_to_float64(a, s),
3605                           float32_to_float64(b, s), d,
3606                           float_muladd_negate_c, s);
3607 }
3608 
3609 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3610 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3611 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3612 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3613 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3614 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3615 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3616 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3617 
3618 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3619 {
3620     return float32_muladd(float16_to_float32(a, true, s),
3621                           float16_to_float32(b, true, s), d,
3622                           float_muladd_negate_product, s);
3623 }
3624 
3625 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3626 {
3627     return float64_muladd(float32_to_float64(a, s),
3628                           float32_to_float64(b, s), d,
3629                           float_muladd_negate_product, s);
3630 }
3631 
3632 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3633 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3634 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3635 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3636 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3637 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3638 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3639 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3640 
3641 /* Vector Floating-Point Square-Root Instruction */
3642 /* (TD, T2, TX2) */
3643 #define OP_UU_H uint16_t, uint16_t, uint16_t
3644 #define OP_UU_W uint32_t, uint32_t, uint32_t
3645 #define OP_UU_D uint64_t, uint64_t, uint64_t
3646 
3647 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
3648 static void do_##NAME(void *vd, void *vs2, int i,      \
3649                       CPURISCVState *env)              \
3650 {                                                      \
3651     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
3652     *((TD *)vd + HD(i)) = OP(s2, &env->fp_status);     \
3653 }
3654 
3655 #define GEN_VEXT_V_ENV(NAME, ESZ)                      \
3656 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
3657                   CPURISCVState *env, uint32_t desc)   \
3658 {                                                      \
3659     uint32_t vm = vext_vm(desc);                       \
3660     uint32_t vl = env->vl;                             \
3661     uint32_t total_elems =                             \
3662         vext_get_total_elems(env, desc, ESZ);          \
3663     uint32_t vta = vext_vta(desc);                     \
3664     uint32_t vma = vext_vma(desc);                     \
3665     uint32_t i;                                        \
3666                                                        \
3667     if (vl == 0) {                                     \
3668         return;                                        \
3669     }                                                  \
3670     for (i = env->vstart; i < vl; i++) {               \
3671         if (!vm && !vext_elem_mask(v0, i)) {           \
3672             /* set masked-off elements to 1s */        \
3673             vext_set_elems_1s(vd, vma, i * ESZ,        \
3674                               (i + 1) * ESZ);          \
3675             continue;                                  \
3676         }                                              \
3677         do_##NAME(vd, vs2, i, env);                    \
3678     }                                                  \
3679     env->vstart = 0;                                   \
3680     vext_set_elems_1s(vd, vta, vl * ESZ,               \
3681                       total_elems * ESZ);              \
3682 }
3683 
3684 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3685 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3686 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3687 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3688 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3689 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3690 
3691 /*
3692  * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3693  *
3694  * Adapted from riscv-v-spec recip.c:
3695  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3696  */
3697 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3698 {
3699     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3700     uint64_t exp = extract64(f, frac_size, exp_size);
3701     uint64_t frac = extract64(f, 0, frac_size);
3702 
3703     const uint8_t lookup_table[] = {
3704         52, 51, 50, 48, 47, 46, 44, 43,
3705         42, 41, 40, 39, 38, 36, 35, 34,
3706         33, 32, 31, 30, 30, 29, 28, 27,
3707         26, 25, 24, 23, 23, 22, 21, 20,
3708         19, 19, 18, 17, 16, 16, 15, 14,
3709         14, 13, 12, 12, 11, 10, 10, 9,
3710         9, 8, 7, 7, 6, 6, 5, 4,
3711         4, 3, 3, 2, 2, 1, 1, 0,
3712         127, 125, 123, 121, 119, 118, 116, 114,
3713         113, 111, 109, 108, 106, 105, 103, 102,
3714         100, 99, 97, 96, 95, 93, 92, 91,
3715         90, 88, 87, 86, 85, 84, 83, 82,
3716         80, 79, 78, 77, 76, 75, 74, 73,
3717         72, 71, 70, 70, 69, 68, 67, 66,
3718         65, 64, 63, 63, 62, 61, 60, 59,
3719         59, 58, 57, 56, 56, 55, 54, 53
3720     };
3721     const int precision = 7;
3722 
3723     if (exp == 0 && frac != 0) { /* subnormal */
3724         /* Normalize the subnormal. */
3725         while (extract64(frac, frac_size - 1, 1) == 0) {
3726             exp--;
3727             frac <<= 1;
3728         }
3729 
3730         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3731     }
3732 
3733     int idx = ((exp & 1) << (precision - 1)) |
3734               (frac >> (frac_size - precision + 1));
3735     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3736                         (frac_size - precision);
3737     uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3738 
3739     uint64_t val = 0;
3740     val = deposit64(val, 0, frac_size, out_frac);
3741     val = deposit64(val, frac_size, exp_size, out_exp);
3742     val = deposit64(val, frac_size + exp_size, 1, sign);
3743     return val;
3744 }
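
/*
 * The 7-bit table index above is the least-significant exponent bit
 * followed by the top six fraction bits, and the table supplies the top
 * seven bits of the estimate's significand.  The output exponent works
 * out to (3 * bias - 1 - exp) / 2, i.e. roughly minus half of the
 * unbiased input exponent, as expected for 1/sqrt(x).
 */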
3745 
3746 static float16 frsqrt7_h(float16 f, float_status *s)
3747 {
3748     int exp_size = 5, frac_size = 10;
3749     bool sign = float16_is_neg(f);
3750 
3751     /*
3752      * frsqrt7(sNaN) = canonical NaN
3753      * frsqrt7(-inf) = canonical NaN
3754      * frsqrt7(-normal) = canonical NaN
3755      * frsqrt7(-subnormal) = canonical NaN
3756      */
3757     if (float16_is_signaling_nan(f, s) ||
3758         (float16_is_infinity(f) && sign) ||
3759         (float16_is_normal(f) && sign) ||
3760         (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3761         s->float_exception_flags |= float_flag_invalid;
3762         return float16_default_nan(s);
3763     }
3764 
3765     /* frsqrt7(qNaN) = canonical NaN */
3766     if (float16_is_quiet_nan(f, s)) {
3767         return float16_default_nan(s);
3768     }
3769 
3770     /* frsqrt7(+-0) = +-inf */
3771     if (float16_is_zero(f)) {
3772         s->float_exception_flags |= float_flag_divbyzero;
3773         return float16_set_sign(float16_infinity, sign);
3774     }
3775 
3776     /* frsqrt7(+inf) = +0 */
3777     if (float16_is_infinity(f) && !sign) {
3778         return float16_set_sign(float16_zero, sign);
3779     }
3780 
3781     /* +normal, +subnormal */
3782     uint64_t val = frsqrt7(f, exp_size, frac_size);
3783     return make_float16(val);
3784 }
3785 
3786 static float32 frsqrt7_s(float32 f, float_status *s)
3787 {
3788     int exp_size = 8, frac_size = 23;
3789     bool sign = float32_is_neg(f);
3790 
3791     /*
3792      * frsqrt7(sNaN) = canonical NaN
3793      * frsqrt7(-inf) = canonical NaN
3794      * frsqrt7(-normal) = canonical NaN
3795      * frsqrt7(-subnormal) = canonical NaN
3796      */
3797     if (float32_is_signaling_nan(f, s) ||
3798         (float32_is_infinity(f) && sign) ||
3799         (float32_is_normal(f) && sign) ||
3800         (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3801         s->float_exception_flags |= float_flag_invalid;
3802         return float32_default_nan(s);
3803     }
3804 
3805     /* frsqrt7(qNaN) = canonical NaN */
3806     if (float32_is_quiet_nan(f, s)) {
3807         return float32_default_nan(s);
3808     }
3809 
3810     /* frsqrt7(+-0) = +-inf */
3811     if (float32_is_zero(f)) {
3812         s->float_exception_flags |= float_flag_divbyzero;
3813         return float32_set_sign(float32_infinity, sign);
3814     }
3815 
3816     /* frsqrt7(+inf) = +0 */
3817     if (float32_is_infinity(f) && !sign) {
3818         return float32_set_sign(float32_zero, sign);
3819     }
3820 
3821     /* +normal, +subnormal */
3822     uint64_t val = frsqrt7(f, exp_size, frac_size);
3823     return make_float32(val);
3824 }
3825 
3826 static float64 frsqrt7_d(float64 f, float_status *s)
3827 {
3828     int exp_size = 11, frac_size = 52;
3829     bool sign = float64_is_neg(f);
3830 
3831     /*
3832      * frsqrt7(sNaN) = canonical NaN
3833      * frsqrt7(-inf) = canonical NaN
3834      * frsqrt7(-normal) = canonical NaN
3835      * frsqrt7(-subnormal) = canonical NaN
3836      */
3837     if (float64_is_signaling_nan(f, s) ||
3838         (float64_is_infinity(f) && sign) ||
3839         (float64_is_normal(f) && sign) ||
3840         (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3841         s->float_exception_flags |= float_flag_invalid;
3842         return float64_default_nan(s);
3843     }
3844 
3845     /* frsqrt7(qNaN) = canonical NaN */
3846     if (float64_is_quiet_nan(f, s)) {
3847         return float64_default_nan(s);
3848     }
3849 
3850     /* frsqrt7(+-0) = +-inf */
3851     if (float64_is_zero(f)) {
3852         s->float_exception_flags |= float_flag_divbyzero;
3853         return float64_set_sign(float64_infinity, sign);
3854     }
3855 
3856     /* frsqrt7(+inf) = +0 */
3857     if (float64_is_infinity(f) && !sign) {
3858         return float64_set_sign(float64_zero, sign);
3859     }
3860 
3861     /* +normal, +subnormal */
3862     uint64_t val = frsqrt7(f, exp_size, frac_size);
3863     return make_float64(val);
3864 }
3865 
3866 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3867 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3868 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3869 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3870 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3871 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3872 
3873 /*
3874  * Vector Floating-Point Reciprocal Estimate Instruction
3875  *
3876  * Adapted from riscv-v-spec recip.c:
3877  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3878  */
3879 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3880                       float_status *s)
3881 {
3882     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3883     uint64_t exp = extract64(f, frac_size, exp_size);
3884     uint64_t frac = extract64(f, 0, frac_size);
3885 
3886     const uint8_t lookup_table[] = {
3887         127, 125, 123, 121, 119, 117, 116, 114,
3888         112, 110, 109, 107, 105, 104, 102, 100,
3889         99, 97, 96, 94, 93, 91, 90, 88,
3890         87, 85, 84, 83, 81, 80, 79, 77,
3891         76, 75, 74, 72, 71, 70, 69, 68,
3892         66, 65, 64, 63, 62, 61, 60, 59,
3893         58, 57, 56, 55, 54, 53, 52, 51,
3894         50, 49, 48, 47, 46, 45, 44, 43,
3895         42, 41, 40, 40, 39, 38, 37, 36,
3896         35, 35, 34, 33, 32, 31, 31, 30,
3897         29, 28, 28, 27, 26, 25, 25, 24,
3898         23, 23, 22, 21, 21, 20, 19, 19,
3899         18, 17, 17, 16, 15, 15, 14, 14,
3900         13, 12, 12, 11, 11, 10, 9, 9,
3901         8, 8, 7, 7, 6, 5, 5, 4,
3902         4, 3, 3, 2, 2, 1, 1, 0
3903     };
3904     const int precision = 7;
3905 
3906     if (exp == 0 && frac != 0) { /* subnormal */
3907         /* Normalize the subnormal. */
3908         while (extract64(frac, frac_size - 1, 1) == 0) {
3909             exp--;
3910             frac <<= 1;
3911         }
3912 
3913         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3914 
3915         if (exp != 0 && exp != UINT64_MAX) {
3916             /*
3917              * Overflow to inf or max value of same sign,
3918              * depending on sign and rounding mode.
3919              */
3920             s->float_exception_flags |= (float_flag_inexact |
3921                                          float_flag_overflow);
3922 
3923             if ((s->float_rounding_mode == float_round_to_zero) ||
3924                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3925                 ((s->float_rounding_mode == float_round_up) && sign)) {
3926                 /* Return greatest/negative finite value. */
3927                 return (sign << (exp_size + frac_size)) |
3928                        (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3929             } else {
3930                 /* Return +-inf. */
3931                 return (sign << (exp_size + frac_size)) |
3932                        MAKE_64BIT_MASK(frac_size, exp_size);
3933             }
3934         }
3935     }
3936 
3937     int idx = frac >> (frac_size - precision);
3938     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3939                         (frac_size - precision);
3940     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3941 
3942     if (out_exp == 0 || out_exp == UINT64_MAX) {
3943         /*
3944          * The result is subnormal, but don't raise the underflow exception,
3945          * because there's no additional loss of precision.
3946          */
3947         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3948         if (out_exp == UINT64_MAX) {
3949             out_frac >>= 1;
3950             out_exp = 0;
3951         }
3952     }
3953 
3954     uint64_t val = 0;
3955     val = deposit64(val, 0, frac_size, out_frac);
3956     val = deposit64(val, frac_size, exp_size, out_exp);
3957     val = deposit64(val, frac_size + exp_size, 1, sign);
3958     return val;
3959 }
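
/*
 * Here the table index is simply the top seven fraction bits and the
 * output exponent is 2 * bias - 1 - exp, i.e. roughly the negated
 * unbiased input exponent minus one, as expected for 1/x.  When that
 * exponent computes to 0 or -1, the estimate is returned as a subnormal
 * by shifting the significand right by one or two places and
 * re-inserting the leading 1 bit.
 */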
3960 
3961 static float16 frec7_h(float16 f, float_status *s)
3962 {
3963     int exp_size = 5, frac_size = 10;
3964     bool sign = float16_is_neg(f);
3965 
3966     /* frec7(+-inf) = +-0 */
3967     if (float16_is_infinity(f)) {
3968         return float16_set_sign(float16_zero, sign);
3969     }
3970 
3971     /* frec7(+-0) = +-inf */
3972     if (float16_is_zero(f)) {
3973         s->float_exception_flags |= float_flag_divbyzero;
3974         return float16_set_sign(float16_infinity, sign);
3975     }
3976 
3977     /* frec7(sNaN) = canonical NaN */
3978     if (float16_is_signaling_nan(f, s)) {
3979         s->float_exception_flags |= float_flag_invalid;
3980         return float16_default_nan(s);
3981     }
3982 
3983     /* frec7(qNaN) = canonical NaN */
3984     if (float16_is_quiet_nan(f, s)) {
3985         return float16_default_nan(s);
3986     }
3987 
3988     /* +-normal, +-subnormal */
3989     uint64_t val = frec7(f, exp_size, frac_size, s);
3990     return make_float16(val);
3991 }
3992 
3993 static float32 frec7_s(float32 f, float_status *s)
3994 {
3995     int exp_size = 8, frac_size = 23;
3996     bool sign = float32_is_neg(f);
3997 
3998     /* frec7(+-inf) = +-0 */
3999     if (float32_is_infinity(f)) {
4000         return float32_set_sign(float32_zero, sign);
4001     }
4002 
4003     /* frec7(+-0) = +-inf */
4004     if (float32_is_zero(f)) {
4005         s->float_exception_flags |= float_flag_divbyzero;
4006         return float32_set_sign(float32_infinity, sign);
4007     }
4008 
4009     /* frec7(sNaN) = canonical NaN */
4010     if (float32_is_signaling_nan(f, s)) {
4011         s->float_exception_flags |= float_flag_invalid;
4012         return float32_default_nan(s);
4013     }
4014 
4015     /* frec7(qNaN) = canonical NaN */
4016     if (float32_is_quiet_nan(f, s)) {
4017         return float32_default_nan(s);
4018     }
4019 
4020     /* +-normal, +-subnormal */
4021     uint64_t val = frec7(f, exp_size, frac_size, s);
4022     return make_float32(val);
4023 }
4024 
4025 static float64 frec7_d(float64 f, float_status *s)
4026 {
4027     int exp_size = 11, frac_size = 52;
4028     bool sign = float64_is_neg(f);
4029 
4030     /* frec7(+-inf) = +-0 */
4031     if (float64_is_infinity(f)) {
4032         return float64_set_sign(float64_zero, sign);
4033     }
4034 
4035     /* frec7(+-0) = +-inf */
4036     if (float64_is_zero(f)) {
4037         s->float_exception_flags |= float_flag_divbyzero;
4038         return float64_set_sign(float64_infinity, sign);
4039     }
4040 
4041     /* frec7(sNaN) = canonical NaN */
4042     if (float64_is_signaling_nan(f, s)) {
4043         s->float_exception_flags |= float_flag_invalid;
4044         return float64_default_nan(s);
4045     }
4046 
4047     /* frec7(qNaN) = canonical NaN */
4048     if (float64_is_quiet_nan(f, s)) {
4049         return float64_default_nan(s);
4050     }
4051 
4052     /* +-normal, +-subnormal */
4053     uint64_t val = frec7(f, exp_size, frac_size, s);
4054     return make_float64(val);
4055 }
4056 
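/*
 * vfrec7.v vd, vs2, vm # vd[i] = estimate of 1/vs2[i], accurate to 7 bits.
 *
 * Illustrative example (assuming the default RNE rounding mode): for a
 * normal input such as float32 2.0 (0x40000000), frec7_s() returns a value
 * close to 0.5f; infinities, zeros and NaNs are handled by the explicit
 * special cases in the per-width wrappers above.
 */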
4057 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4058 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4059 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4060 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4061 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4062 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4063 
4064 /* Vector Floating-Point MIN/MAX Instructions */
4065 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4066 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4067 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4068 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4069 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4070 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4071 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4072 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4073 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4074 GEN_VEXT_VF(vfmin_vf_h, 2)
4075 GEN_VEXT_VF(vfmin_vf_w, 4)
4076 GEN_VEXT_VF(vfmin_vf_d, 8)
4077 
4078 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4079 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4080 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4081 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4082 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4083 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4084 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4085 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4086 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4087 GEN_VEXT_VF(vfmax_vf_h, 2)
4088 GEN_VEXT_VF(vfmax_vf_w, 4)
4089 GEN_VEXT_VF(vfmax_vf_d, 8)
4090 
4091 /* Vector Floating-Point Sign-Injection Instructions */
4092 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4093 {
4094     return deposit64(b, 0, 15, a);
4095 }
4096 
4097 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4098 {
4099     return deposit64(b, 0, 31, a);
4100 }
4101 
4102 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4103 {
4104     return deposit64(b, 0, 63, a);
4105 }
4106 
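/*
 * fsgnjNN(a, b) keeps a's exponent and fraction and takes the sign from b.
 * For example, fsgnj32(0x40000000, 0xbf800000) is 0xc0000000, i.e.
 * fsgnj(2.0, -1.0) = -2.0.
 */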
4107 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4108 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4109 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4110 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4111 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4112 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4113 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4114 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4115 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4116 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4117 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4118 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4119 
4120 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122     return deposit64(~b, 0, 15, a);
4123 }
4124 
4125 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4126 {
4127     return deposit64(~b, 0, 31, a);
4128 }
4129 
4130 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132     return deposit64(~b, 0, 63, a);
4133 }
4134 
4135 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4136 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4137 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4138 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4139 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4140 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4141 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4142 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4143 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4144 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4145 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4146 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4147 
4148 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4149 {
4150     return deposit64(b ^ a, 0, 15, a);
4151 }
4152 
4153 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4154 {
4155     return deposit64(b ^ a, 0, 31, a);
4156 }
4157 
4158 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4159 {
4160     return deposit64(b ^ a, 0, 63, a);
4161 }
4162 
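/*
 * fsgnjxNN(a, b) keeps a's magnitude and sets the sign to sign(a) ^ sign(b).
 * For example, fsgnjx32(0xc0000000, 0xbf800000) is 0x40000000, i.e.
 * fsgnjx(-2.0, -1.0) = +2.0.
 */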
4163 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4164 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4165 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4166 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4167 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4168 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4169 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4170 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4171 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4172 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4173 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4174 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4175 
4176 /* Vector Floating-Point Compare Instructions */
4177 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
4178 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
4179                   CPURISCVState *env, uint32_t desc)          \
4180 {                                                             \
4181     uint32_t vm = vext_vm(desc);                              \
4182     uint32_t vl = env->vl;                                    \
4183     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
4184     uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
4185     uint32_t vma = vext_vma(desc);                            \
4186     uint32_t i;                                               \
4187                                                               \
4188     for (i = env->vstart; i < vl; i++) {                      \
4189         ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
4190         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4191         if (!vm && !vext_elem_mask(v0, i)) {                  \
4192             /* set masked-off elements to 1s */               \
4193             if (vma) {                                        \
4194                 vext_set_elem_mask(vd, i, 1);                 \
4195             }                                                 \
4196             continue;                                         \
4197         }                                                     \
4198         vext_set_elem_mask(vd, i,                             \
4199                            DO_OP(s2, s1, &env->fp_status));   \
4200     }                                                         \
4201     env->vstart = 0;                                          \
4202     /*
4203      * mask destination register is always tail-agnostic
4204      * set tail elements to 1s
4205      */                                                       \
4206     if (vta_all_1s) {                                         \
4207         for (; i < total_elems; i++) {                        \
4208             vext_set_elem_mask(vd, i, 1);                     \
4209         }                                                     \
4210     }                                                         \
4211 }
4212 
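/*
 * vmfeq.vv vd, vs2, vs1, vm # vd.mask[i] = (vs2[i] == vs1[i])
 * The *_eq_quiet comparisons only raise the invalid flag for signaling NaNs.
 */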
4213 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4214 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4215 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4216 
4217 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
4218 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
4219                   CPURISCVState *env, uint32_t desc)                \
4220 {                                                                   \
4221     uint32_t vm = vext_vm(desc);                                    \
4222     uint32_t vl = env->vl;                                          \
4223     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
4224     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
4225     uint32_t vma = vext_vma(desc);                                  \
4226     uint32_t i;                                                     \
4227                                                                     \
4228     for (i = env->vstart; i < vl; i++) {                            \
4229         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
4230         if (!vm && !vext_elem_mask(v0, i)) {                        \
4231             /* set masked-off elements to 1s */                     \
4232             if (vma) {                                              \
4233                 vext_set_elem_mask(vd, i, 1);                       \
4234             }                                                       \
4235             continue;                                               \
4236         }                                                           \
4237         vext_set_elem_mask(vd, i,                                   \
4238                            DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
4239     }                                                               \
4240     env->vstart = 0;                                                \
4241     /*
4242      * mask destination register is always tail-agnostic
4243      * set tail elements to 1s
4244      */                                                             \
4245     if (vta_all_1s) {                                               \
4246         for (; i < total_elems; i++) {                              \
4247             vext_set_elem_mask(vd, i, 1);                           \
4248         }                                                           \
4249     }                                                               \
4250 }
4251 
4252 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4253 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4254 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4255 
4256 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4257 {
4258     FloatRelation compare = float16_compare_quiet(a, b, s);
4259     return compare != float_relation_equal;
4260 }
4261 
4262 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4263 {
4264     FloatRelation compare = float32_compare_quiet(a, b, s);
4265     return compare != float_relation_equal;
4266 }
4267 
4268 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4269 {
4270     FloatRelation compare = float64_compare_quiet(a, b, s);
4271     return compare != float_relation_equal;
4272 }
4273 
4274 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4275 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4276 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4277 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4278 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4279 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4280 
4281 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4282 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4283 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4284 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4285 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4286 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4287 
4288 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4289 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4290 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4291 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4292 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4293 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4294 
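/*
 * Only the .vf forms of vmfgt/vmfge are provided; the spec has no
 * vmfgt.vv/vmfge.vv encodings, as vmflt.vv/vmfle.vv with swapped operands
 * cover those cases.
 */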
4295 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4296 {
4297     FloatRelation compare = float16_compare(a, b, s);
4298     return compare == float_relation_greater;
4299 }
4300 
4301 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4302 {
4303     FloatRelation compare = float32_compare(a, b, s);
4304     return compare == float_relation_greater;
4305 }
4306 
4307 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4308 {
4309     FloatRelation compare = float64_compare(a, b, s);
4310     return compare == float_relation_greater;
4311 }
4312 
4313 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4314 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4315 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4316 
4317 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4318 {
4319     FloatRelation compare = float16_compare(a, b, s);
4320     return compare == float_relation_greater ||
4321            compare == float_relation_equal;
4322 }
4323 
4324 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4325 {
4326     FloatRelation compare = float32_compare(a, b, s);
4327     return compare == float_relation_greater ||
4328            compare == float_relation_equal;
4329 }
4330 
4331 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4332 {
4333     FloatRelation compare = float64_compare(a, b, s);
4334     return compare == float_relation_greater ||
4335            compare == float_relation_equal;
4336 }
4337 
4338 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4339 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4340 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4341 
4342 /* Vector Floating-Point Classify Instruction */
4343 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
4344 static void do_##NAME(void *vd, void *vs2, int i)      \
4345 {                                                      \
4346     TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
4347     *((TD *)vd + HD(i)) = OP(s2);                      \
4348 }
4349 
4350 #define GEN_VEXT_V(NAME, ESZ)                          \
4351 void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
4352                   CPURISCVState *env, uint32_t desc)   \
4353 {                                                      \
4354     uint32_t vm = vext_vm(desc);                       \
4355     uint32_t vl = env->vl;                             \
4356     uint32_t total_elems =                             \
4357         vext_get_total_elems(env, desc, ESZ);          \
4358     uint32_t vta = vext_vta(desc);                     \
4359     uint32_t vma = vext_vma(desc);                     \
4360     uint32_t i;                                        \
4361                                                        \
4362     for (i = env->vstart; i < vl; i++) {               \
4363         if (!vm && !vext_elem_mask(v0, i)) {           \
4364             /* set masked-off elements to 1s */        \
4365             vext_set_elems_1s(vd, vma, i * ESZ,        \
4366                               (i + 1) * ESZ);          \
4367             continue;                                  \
4368         }                                              \
4369         do_##NAME(vd, vs2, i);                         \
4370     }                                                  \
4371     env->vstart = 0;                                   \
4372     /* set tail elements to 1s */                      \
4373     vext_set_elems_1s(vd, vta, vl * ESZ,               \
4374                       total_elems * ESZ);              \
4375 }
4376 
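/*
 * The fclass_* helpers return the standard FCLASS 10-bit result:
 *   bit 0: -infinity      bit 5: +subnormal
 *   bit 1: -normal        bit 6: +normal
 *   bit 2: -subnormal     bit 7: +infinity
 *   bit 3: -0             bit 8: signaling NaN
 *   bit 4: +0             bit 9: quiet NaN
 * e.g. fclass_s(0xff800000) == (1 << 0) for a float32 -infinity input.
 */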
4377 target_ulong fclass_h(uint64_t frs1)
4378 {
4379     float16 f = frs1;
4380     bool sign = float16_is_neg(f);
4381 
4382     if (float16_is_infinity(f)) {
4383         return sign ? 1 << 0 : 1 << 7;
4384     } else if (float16_is_zero(f)) {
4385         return sign ? 1 << 3 : 1 << 4;
4386     } else if (float16_is_zero_or_denormal(f)) {
4387         return sign ? 1 << 2 : 1 << 5;
4388     } else if (float16_is_any_nan(f)) {
4389         float_status s = { }; /* for snan_bit_is_one */
4390         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4391     } else {
4392         return sign ? 1 << 1 : 1 << 6;
4393     }
4394 }
4395 
4396 target_ulong fclass_s(uint64_t frs1)
4397 {
4398     float32 f = frs1;
4399     bool sign = float32_is_neg(f);
4400 
4401     if (float32_is_infinity(f)) {
4402         return sign ? 1 << 0 : 1 << 7;
4403     } else if (float32_is_zero(f)) {
4404         return sign ? 1 << 3 : 1 << 4;
4405     } else if (float32_is_zero_or_denormal(f)) {
4406         return sign ? 1 << 2 : 1 << 5;
4407     } else if (float32_is_any_nan(f)) {
4408         float_status s = { }; /* for snan_bit_is_one */
4409         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4410     } else {
4411         return sign ? 1 << 1 : 1 << 6;
4412     }
4413 }
4414 
4415 target_ulong fclass_d(uint64_t frs1)
4416 {
4417     float64 f = frs1;
4418     bool sign = float64_is_neg(f);
4419 
4420     if (float64_is_infinity(f)) {
4421         return sign ? 1 << 0 : 1 << 7;
4422     } else if (float64_is_zero(f)) {
4423         return sign ? 1 << 3 : 1 << 4;
4424     } else if (float64_is_zero_or_denormal(f)) {
4425         return sign ? 1 << 2 : 1 << 5;
4426     } else if (float64_is_any_nan(f)) {
4427         float_status s = { }; /* for snan_bit_is_one */
4428         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4429     } else {
4430         return sign ? 1 << 1 : 1 << 6;
4431     }
4432 }
4433 
4434 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4435 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4436 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4437 GEN_VEXT_V(vfclass_v_h, 2)
4438 GEN_VEXT_V(vfclass_v_w, 4)
4439 GEN_VEXT_V(vfclass_v_d, 8)
4440 
4441 /* Vector Floating-Point Merge Instruction */
4442 
4443 #define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
4444 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4445                   CPURISCVState *env, uint32_t desc)          \
4446 {                                                             \
4447     uint32_t vm = vext_vm(desc);                              \
4448     uint32_t vl = env->vl;                                    \
4449     uint32_t esz = sizeof(ETYPE);                             \
4450     uint32_t total_elems =                                    \
4451         vext_get_total_elems(env, desc, esz);                 \
4452     uint32_t vta = vext_vta(desc);                            \
4453     uint32_t i;                                               \
4454                                                               \
4455     for (i = env->vstart; i < vl; i++) {                      \
4456         ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
4457         *((ETYPE *)vd + H(i)) =                               \
4458             (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
4459     }                                                         \
4460     env->vstart = 0;                                          \
4461     /* set tail elements to 1s */                             \
4462     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
4463 }
4464 
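/* vfmerge.vfm vd, vs2, rs1, v0 # vd[i] = v0.mask[i] ? f[rs1] : vs2[i] */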
4465 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4466 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4467 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4468 
4469 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4470 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4471 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4472 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4473 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4474 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4475 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4476 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4477 
4478 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4479 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4480 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4481 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4482 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4483 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4484 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4485 
4486 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4487 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4488 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4489 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4490 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4491 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4492 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4493 
4494 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4495 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4496 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4497 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4498 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4499 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4500 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4501 
4502 /* Widening Floating-Point/Integer Type-Convert Instructions */
4503 /* (TD, T2, TX2) */
4504 #define WOP_UU_B uint16_t, uint8_t,  uint8_t
4505 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4506 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4507 /*
4508  * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.
4509  */
4510 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4511 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4512 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4513 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4514 
4515 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4516 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4517 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4518 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4519 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4520 
4521 /*
4522  * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float.
4523  */
4524 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4525 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4526 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4527 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4528 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4529 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4530 
4531 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4532 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4533 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4534 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4535 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4536 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4537 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4538 
4539 /*
4540  * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float.
4541  */
4542 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4543 {
4544     return float16_to_float32(a, true, s);
4545 }
4546 
4547 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4548 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4549 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4550 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4551 
4552 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4553 /* (TD, T2, TX2) */
4554 #define NOP_UU_B uint8_t,  uint16_t, uint32_t
4555 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4556 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4557 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4558 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4559 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4560 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4561 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4562 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4563 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4564 
4565 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4566 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4567 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4568 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4569 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4570 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4571 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4572 
4573 /*
4574  * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float.
4575  */
4576 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4577 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4578 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4579 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4580 
4581 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4582 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4583 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4584 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4585 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4586 
4587 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4588 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4589 {
4590     return float32_to_float16(a, true, s);
4591 }
4592 
4593 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4594 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4595 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4596 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4597 
4598 /*
4599  * Vector Reduction Operations
4600  */
4601 /* Vector Single-Width Integer Reduction Instructions */
4602 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
4603 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4604                   void *vs2, CPURISCVState *env,          \
4605                   uint32_t desc)                          \
4606 {                                                         \
4607     uint32_t vm = vext_vm(desc);                          \
4608     uint32_t vl = env->vl;                                \
4609     uint32_t esz = sizeof(TD);                            \
4610     uint32_t vlenb = simd_maxsz(desc);                    \
4611     uint32_t vta = vext_vta(desc);                        \
4612     uint32_t i;                                           \
4613     TD s1 =  *((TD *)vs1 + HD(0));                        \
4614                                                           \
4615     for (i = env->vstart; i < vl; i++) {                  \
4616         TS2 s2 = *((TS2 *)vs2 + HS2(i));                  \
4617         if (!vm && !vext_elem_mask(v0, i)) {              \
4618             continue;                                     \
4619         }                                                 \
4620         s1 = OP(s1, (TD)s2);                              \
4621     }                                                     \
4622     *((TD *)vd + HD(0)) = s1;                             \
4623     env->vstart = 0;                                      \
4624     /* set tail elements to 1s */                         \
4625     vext_set_elems_1s(vd, vta, esz, vlenb);               \
4626 }
4627 
4628 /* vd[0] = sum(vs1[0], vs2[*]) */
4629 GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
4630 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4631 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4632 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4633 
4634 /* vd[0] = maxu(vs1[0], vs2[*]) */
4635 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
4636 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4637 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4638 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4639 
4640 /* vd[0] = max(vs1[0], vs2[*]) */
4641 GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
4642 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4643 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4644 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4645 
4646 /* vd[0] = minu(vs1[0], vs2[*]) */
4647 GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
4648 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4649 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4650 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4651 
4652 /* vd[0] = min(vs1[0], vs2[*]) */
4653 GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
4654 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4655 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4656 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4657 
4658 /* vd[0] = and(vs1[0], vs2[*]) */
4659 GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
4660 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4661 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4662 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4663 
4664 /* vd[0] = or(vs1[0], vs2[*]) */
4665 GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
4666 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4667 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4668 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4669 
4670 /* vd[0] = xor(vs1[0], vs2[*]) */
4671 GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
4672 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4673 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4674 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4675 
4676 /* Vector Widening Integer Reduction Instructions */
4677 /* Signed sum reduction into double-width accumulator */
4678 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
4679 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4680 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4681 
4682 /* Unsigned sum reduction into double-width accumulator */
4683 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
4684 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4685 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4686 
4687 /* Vector Single-Width Floating-Point Reduction Instructions */
4688 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
4689 void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
4690                   void *vs2, CPURISCVState *env,           \
4691                   uint32_t desc)                           \
4692 {                                                          \
4693     uint32_t vm = vext_vm(desc);                           \
4694     uint32_t vl = env->vl;                                 \
4695     uint32_t esz = sizeof(TD);                             \
4696     uint32_t vlenb = simd_maxsz(desc);                     \
4697     uint32_t vta = vext_vta(desc);                         \
4698     uint32_t i;                                            \
4699     TD s1 =  *((TD *)vs1 + HD(0));                         \
4700                                                            \
4701     for (i = env->vstart; i < vl; i++) {                   \
4702         TS2 s2 = *((TS2 *)vs2 + HS2(i));                   \
4703         if (!vm && !vext_elem_mask(v0, i)) {               \
4704             continue;                                      \
4705         }                                                  \
4706         s1 = OP(s1, (TD)s2, &env->fp_status);              \
4707     }                                                      \
4708     *((TD *)vd + HD(0)) = s1;                              \
4709     env->vstart = 0;                                       \
4710     /* set tail elements to 1s */                          \
4711     vext_set_elems_1s(vd, vta, esz, vlenb);                \
4712 }
4713 
4714 /* Unordered sum */
4715 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4716 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4717 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4718 
4719 /* Ordered sum */
4720 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4721 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4722 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4723 
4724 /* Maximum value */
4725 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2,
4726               float16_maximum_number)
4727 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4,
4728               float32_maximum_number)
4729 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8,
4730               float64_maximum_number)
4731 
4732 /* Minimum value */
4733 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2,
4734               float16_minimum_number)
4735 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4,
4736               float32_minimum_number)
4737 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8,
4738               float64_minimum_number)
4739 
4740 /* Widening floating-point add helpers for the widening reductions below */
4741 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4742 {
4743     return float32_add(a, float16_to_float32(b, true, s), s);
4744 }
4745 
4746 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4747 {
4748     return float64_add(a, float32_to_float64(b, s), s);
4749 }
4750 
4751 /* Vector Widening Floating-Point Reduction Instructions */
4752 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4753 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4754 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4755 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4756 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4757 
4758 /*
4759  * Vector Mask Operations
4760  */
4761 /* Vector Mask-Register Logical Instructions */
4762 #define GEN_VEXT_MASK_VV(NAME, OP)                        \
4763 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
4764                   void *vs2, CPURISCVState *env,          \
4765                   uint32_t desc)                          \
4766 {                                                         \
4767     uint32_t vl = env->vl;                                \
4768     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;      \
4769     uint32_t vta_all_1s = vext_vta_all_1s(desc);          \
4770     uint32_t i;                                           \
4771     int a, b;                                             \
4772                                                           \
4773     for (i = env->vstart; i < vl; i++) {                  \
4774         a = vext_elem_mask(vs1, i);                       \
4775         b = vext_elem_mask(vs2, i);                       \
4776         vext_set_elem_mask(vd, i, OP(b, a));              \
4777     }                                                     \
4778     env->vstart = 0;                                      \
4779     /*
4780      * mask destination register is always tail-agnostic
4781      * set tail elements to 1s
4782      */                                                   \
4783     if (vta_all_1s) {                                     \
4784         for (; i < total_elems; i++) {                    \
4785             vext_set_elem_mask(vd, i, 1);                 \
4786         }                                                 \
4787     }                                                     \
4788 }
4789 
4790 #define DO_NAND(N, M)  (!(N & M))
4791 #define DO_ANDNOT(N, M)  (N & !M)
4792 #define DO_NOR(N, M)  (!(N | M))
4793 #define DO_ORNOT(N, M)  (N | !M)
4794 #define DO_XNOR(N, M)  (!(N ^ M))
4795 
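/*
 * With a = vs1.mask[i] and b = vs2.mask[i], OP(b, a) gives e.g.
 *   vmandn.mm vd, vs2, vs1 # vd.mask[i] = vs2.mask[i] && !vs1.mask[i]
 *   vmxnor.mm vd, vs2, vs1 # vd.mask[i] = !(vs2.mask[i] ^ vs1.mask[i])
 */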
4796 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4797 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4798 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4799 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4800 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4801 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4802 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4803 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4804 
4805 /* Vector count population in mask vcpop.m */
4806 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4807                              uint32_t desc)
4808 {
4809     target_ulong cnt = 0;
4810     uint32_t vm = vext_vm(desc);
4811     uint32_t vl = env->vl;
4812     int i;
4813 
4814     for (i = env->vstart; i < vl; i++) {
4815         if (vm || vext_elem_mask(v0, i)) {
4816             if (vext_elem_mask(vs2, i)) {
4817                 cnt++;
4818             }
4819         }
4820     }
4821     env->vstart = 0;
4822     return cnt;
4823 }
4824 
4825 /* vfirst find-first-set mask bit */
4826 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4827                               uint32_t desc)
4828 {
4829     uint32_t vm = vext_vm(desc);
4830     uint32_t vl = env->vl;
4831     int i;
4832 
4833     for (i = env->vstart; i < vl; i++) {
4834         if (vm || vext_elem_mask(v0, i)) {
4835             if (vext_elem_mask(vs2, i)) {
4836                 return i;
4837             }
4838         }
4839     }
4840     env->vstart = 0;
4841     return -1LL;
4842 }
4843 
4844 enum set_mask_type {
4845     ONLY_FIRST = 1,
4846     INCLUDE_FIRST,
4847     BEFORE_FIRST,
4848 };
4849 
4850 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4851                    uint32_t desc, enum set_mask_type type)
4852 {
4853     uint32_t vm = vext_vm(desc);
4854     uint32_t vl = env->vl;
4855     uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4856     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4857     uint32_t vma = vext_vma(desc);
4858     int i;
4859     bool first_mask_bit = false;
4860 
4861     for (i = env->vstart; i < vl; i++) {
4862         if (!vm && !vext_elem_mask(v0, i)) {
4863             /* set masked-off elements to 1s */
4864             if (vma) {
4865                 vext_set_elem_mask(vd, i, 1);
4866             }
4867             continue;
4868         }
4869         /* write a zero to all following active elements */
4870         if (first_mask_bit) {
4871             vext_set_elem_mask(vd, i, 0);
4872             continue;
4873         }
4874         if (vext_elem_mask(vs2, i)) {
4875             first_mask_bit = true;
4876             if (type == BEFORE_FIRST) {
4877                 vext_set_elem_mask(vd, i, 0);
4878             } else {
4879                 vext_set_elem_mask(vd, i, 1);
4880             }
4881         } else {
4882             if (type == ONLY_FIRST) {
4883                 vext_set_elem_mask(vd, i, 0);
4884             } else {
4885                 vext_set_elem_mask(vd, i, 1);
4886             }
4887         }
4888     }
4889     env->vstart = 0;
4890     /*
4891      * mask destination register is always tail-agnostic
4892      * set tail elements to 1s
4893      */
4894     if (vta_all_1s) {
4895         for (; i < total_elems; i++) {
4896             vext_set_elem_mask(vd, i, 1);
4897         }
4898     }
4899 }
4900 
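/*
 * vmsbf/vmsif/vmsof set-before/-including/-only-first helpers. For an
 * unmasked vs2 = 0b00010100 (first set bit at i = 2) and vl >= 8:
 *   vmsbf.m -> 0b00000011
 *   vmsif.m -> 0b00000111
 *   vmsof.m -> 0b00000100
 */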
4901 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4902                      uint32_t desc)
4903 {
4904     vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4905 }
4906 
4907 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4908                      uint32_t desc)
4909 {
4910     vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4911 }
4912 
4913 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4914                      uint32_t desc)
4915 {
4916     vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4917 }
4918 
4919 /* Vector Iota Instruction */
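/*
 * vd[i] = number of set bits in vs2.mask[0 .. i-1] among active elements.
 * For an unmasked vs2.mask = 0, 1, 1, 0, 1 (i = 0..4) the result is
 * vd = { 0, 0, 1, 2, 2 }.
 */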
4920 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
4921 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
4922                   uint32_t desc)                                          \
4923 {                                                                         \
4924     uint32_t vm = vext_vm(desc);                                          \
4925     uint32_t vl = env->vl;                                                \
4926     uint32_t esz = sizeof(ETYPE);                                         \
4927     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4928     uint32_t vta = vext_vta(desc);                                        \
4929     uint32_t vma = vext_vma(desc);                                        \
4930     uint32_t sum = 0;                                                     \
4931     int i;                                                                \
4932                                                                           \
4933     for (i = env->vstart; i < vl; i++) {                                  \
4934         if (!vm && !vext_elem_mask(v0, i)) {                              \
4935             /* set masked-off elements to 1s */                           \
4936             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4937             continue;                                                     \
4938         }                                                                 \
4939         *((ETYPE *)vd + H(i)) = sum;                                      \
4940         if (vext_elem_mask(vs2, i)) {                                     \
4941             sum++;                                                        \
4942         }                                                                 \
4943     }                                                                     \
4944     env->vstart = 0;                                                      \
4945     /* set tail elements to 1s */                                         \
4946     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4947 }
4948 
4949 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
4950 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4951 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4952 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4953 
4954 /* Vector Element Index Instruction */
4955 #define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
4956 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
4957 {                                                                         \
4958     uint32_t vm = vext_vm(desc);                                          \
4959     uint32_t vl = env->vl;                                                \
4960     uint32_t esz = sizeof(ETYPE);                                         \
4961     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4962     uint32_t vta = vext_vta(desc);                                        \
4963     uint32_t vma = vext_vma(desc);                                        \
4964     int i;                                                                \
4965                                                                           \
4966     for (i = env->vstart; i < vl; i++) {                                  \
4967         if (!vm && !vext_elem_mask(v0, i)) {                              \
4968             /* set masked-off elements to 1s */                           \
4969             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
4970             continue;                                                     \
4971         }                                                                 \
4972         *((ETYPE *)vd + H(i)) = i;                                        \
4973     }                                                                     \
4974     env->vstart = 0;                                                      \
4975     /* set tail elements to 1s */                                         \
4976     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
4977 }
4978 
4979 GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
4980 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4981 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4982 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4983 
4984 /*
4985  * Vector Permutation Instructions
4986  */
4987 
4988 /* Vector Slide Instructions */
4989 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
4990 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
4991                   CPURISCVState *env, uint32_t desc)                      \
4992 {                                                                         \
4993     uint32_t vm = vext_vm(desc);                                          \
4994     uint32_t vl = env->vl;                                                \
4995     uint32_t esz = sizeof(ETYPE);                                         \
4996     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
4997     uint32_t vta = vext_vta(desc);                                        \
4998     uint32_t vma = vext_vma(desc);                                        \
4999     target_ulong offset = s1, i_min, i;                                   \
5000                                                                           \
5001     i_min = MAX(env->vstart, offset);                                     \
5002     for (i = i_min; i < vl; i++) {                                        \
5003         if (!vm && !vext_elem_mask(v0, i)) {                              \
5004             /* set masked-off elements to 1s */                           \
5005             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5006             continue;                                                     \
5007         }                                                                 \
5008         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
5009     }                                                                     \
5010     /* set tail elements to 1s */                                         \
5011     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5012 }
5013 
5014 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
5015 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
5016 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
5017 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
5018 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
5019 
5020 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
5021 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5022                   CPURISCVState *env, uint32_t desc)                      \
5023 {                                                                         \
5024     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5025     uint32_t vm = vext_vm(desc);                                          \
5026     uint32_t vl = env->vl;                                                \
5027     uint32_t esz = sizeof(ETYPE);                                         \
5028     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5029     uint32_t vta = vext_vta(desc);                                        \
5030     uint32_t vma = vext_vma(desc);                                        \
5031     target_ulong i_max, i;                                                \
5032                                                                           \
5033     i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart);       \
5034     for (i = env->vstart; i < i_max; ++i) {                               \
5035         if (!vm && !vext_elem_mask(v0, i)) {                              \
5036             /* set masked-off elements to 1s */                           \
5037             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5038             continue;                                                     \
5039         }                                                                 \
5040         *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1));              \
5041     }                                                                     \
5042                                                                           \
5043     for (i = i_max; i < vl; ++i) {                                        \
5044         if (vm || vext_elem_mask(v0, i)) {                                \
5045             *((ETYPE *)vd + H(i)) = 0;                                    \
5046         }                                                                 \
5047     }                                                                     \
5048                                                                           \
5049     env->vstart = 0;                                                      \
5050     /* set tail elements to 1s */                                         \
5051     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5052 }
5053 
5054 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5055 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
5056 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5057 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5058 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5059 
5060 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H)                                     \
5061 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5062                                  void *vs2, CPURISCVState *env,             \
5063                                  uint32_t desc)                             \
5064 {                                                                           \
5065     typedef uint##BITWIDTH##_t ETYPE;                                       \
5066     uint32_t vm = vext_vm(desc);                                            \
5067     uint32_t vl = env->vl;                                                  \
5068     uint32_t esz = sizeof(ETYPE);                                           \
5069     uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
5070     uint32_t vta = vext_vta(desc);                                          \
5071     uint32_t vma = vext_vma(desc);                                          \
5072     uint32_t i;                                                             \
5073                                                                             \
5074     for (i = env->vstart; i < vl; i++) {                                    \
5075         if (!vm && !vext_elem_mask(v0, i)) {                                \
5076             /* set masked-off elements to 1s */                             \
5077             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);             \
5078             continue;                                                       \
5079         }                                                                   \
5080         if (i == 0) {                                                       \
5081             *((ETYPE *)vd + H(i)) = s1;                                     \
5082         } else {                                                            \
5083             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
5084         }                                                                   \
5085     }                                                                       \
5086     env->vstart = 0;                                                        \
5087     /* set tail elements to 1s */                                           \
5088     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
5089 }
5090 
5091 GEN_VEXT_VSLIDE1UP(8,  H1)
5092 GEN_VEXT_VSLIDE1UP(16, H2)
5093 GEN_VEXT_VSLIDE1UP(32, H4)
5094 GEN_VEXT_VSLIDE1UP(64, H8)
5095 
5096 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
5097 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5098                   CPURISCVState *env, uint32_t desc)              \
5099 {                                                                 \
5100     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
5101 }
5102 
5103 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5104 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5105 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5106 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5107 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5108 
5109 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
5110 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
5111                                    void *vs2, CPURISCVState *env,             \
5112                                    uint32_t desc)                             \
5113 {                                                                             \
5114     typedef uint##BITWIDTH##_t ETYPE;                                         \
5115     uint32_t vm = vext_vm(desc);                                              \
5116     uint32_t vl = env->vl;                                                    \
5117     uint32_t esz = sizeof(ETYPE);                                             \
5118     uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
5119     uint32_t vta = vext_vta(desc);                                            \
5120     uint32_t vma = vext_vma(desc);                                            \
5121     uint32_t i;                                                               \
5122                                                                               \
5123     for (i = env->vstart; i < vl; i++) {                                      \
5124         if (!vm && !vext_elem_mask(v0, i)) {                                  \
5125             /* set masked-off elements to 1s */                               \
5126             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);               \
5127             continue;                                                         \
5128         }                                                                     \
5129         if (i == vl - 1) {                                                    \
5130             *((ETYPE *)vd + H(i)) = s1;                                       \
5131         } else {                                                              \
5132             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
5133         }                                                                     \
5134     }                                                                         \
5135     env->vstart = 0;                                                          \
5136     /* set tail elements to 1s */                                             \
5137     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
5138 }
5139 
5140 GEN_VEXT_VSLIDE1DOWN(8,  H1)
5141 GEN_VEXT_VSLIDE1DOWN(16, H2)
5142 GEN_VEXT_VSLIDE1DOWN(32, H4)
5143 GEN_VEXT_VSLIDE1DOWN(64, H8)
5144 
5145 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
5146 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5147                   CPURISCVState *env, uint32_t desc)              \
5148 {                                                                 \
5149     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
5150 }
5151 
5152 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5153 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5154 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5155 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5156 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5157 
5158 /* Vector Floating-Point Slide Instructions */
5159 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
5160 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5161                   CPURISCVState *env, uint32_t desc)          \
5162 {                                                             \
5163     vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
5164 }
5165 
5166 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5167 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5168 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5169 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5170 
5171 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
5172 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5173                   CPURISCVState *env, uint32_t desc)          \
5174 {                                                             \
5175     vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
5176 }
5177 
5178 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5179 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5180 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5181 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5182 
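/*
 * The vfslide1up.vf/vfslide1down.vf helpers above simply reuse the integer
 * slide1 helpers: the scalar f[rs1] arrives in s1 as a raw 64-bit bit
 * pattern and is stored verbatim, so no floating-point arithmetic is
 * involved.  (Any NaN-boxing of a narrower FP scalar is assumed to have
 * been dealt with before the helper is invoked.)
 */
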
5183 /* Vector Register Gather Instructions */
5184 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
5185 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5186                   CPURISCVState *env, uint32_t desc)                      \
5187 {                                                                         \
5188     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));             \
5189     uint32_t vm = vext_vm(desc);                                          \
5190     uint32_t vl = env->vl;                                                \
5191     uint32_t esz = sizeof(TS2);                                           \
5192     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5193     uint32_t vta = vext_vta(desc);                                        \
5194     uint32_t vma = vext_vma(desc);                                        \
5195     uint64_t index;                                                       \
5196     uint32_t i;                                                           \
5197                                                                           \
5198     for (i = env->vstart; i < vl; i++) {                                  \
5199         if (!vm && !vext_elem_mask(v0, i)) {                              \
5200             /* set masked-off elements to 1s */                           \
5201             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5202             continue;                                                     \
5203         }                                                                 \
5204         index = *((TS1 *)vs1 + HS1(i));                                   \
5205         if (index >= vlmax) {                                             \
5206             *((TS2 *)vd + HS2(i)) = 0;                                    \
5207         } else {                                                          \
5208             *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index));           \
5209         }                                                                 \
5210     }                                                                     \
5211     env->vstart = 0;                                                      \
5212     /* set tail elements to 1s */                                         \
5213     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5214 }
5215 
5216 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5217 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5218 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5219 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5220 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5221 
5222 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5223 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5224 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5225 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5226 
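/*
 * Illustrative example (values are for exposition only): with vlmax = 4,
 * vl = 4, vs1 = {2, 0, 5, 1} and vs2 = {10, 20, 30, 40}, an unmasked
 * vrgather.vv yields vd = {30, 10, 0, 20}; the out-of-range index 5
 * selects 0.  The vrgatherei16 variants differ only in that the index
 * elements (TS1) are always 16 bits wide, independent of the data element
 * width (TS2), which is why their H-macro pairs differ above.
 */
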
5227 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5228 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5229                   CPURISCVState *env, uint32_t desc)                      \
5230 {                                                                         \
5231     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5232     uint32_t vm = vext_vm(desc);                                          \
5233     uint32_t vl = env->vl;                                                \
5234     uint32_t esz = sizeof(ETYPE);                                         \
5235     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5236     uint32_t vta = vext_vta(desc);                                        \
5237     uint32_t vma = vext_vma(desc);                                        \
5238     uint64_t index = s1;                                                  \
5239     uint32_t i;                                                           \
5240                                                                           \
5241     for (i = env->vstart; i < vl; i++) {                                  \
5242         if (!vm && !vext_elem_mask(v0, i)) {                              \
5243             /* set masked-off elements to 1s */                           \
5244             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
5245             continue;                                                     \
5246         }                                                                 \
5247         if (index >= vlmax) {                                             \
5248             *((ETYPE *)vd + H(i)) = 0;                                    \
5249         } else {                                                          \
5250             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5251         }                                                                 \
5252     }                                                                     \
5253     env->vstart = 0;                                                      \
5254     /* set tail elements to 1s */                                         \
5255     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5256 }
5257 
5258 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5259 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5260 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5261 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5262 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5263 
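/*
 * For vrgather.vx every active destination element receives the same
 * value, vs2[x[rs1]], or 0 when that index is >= vlmax; e.g. with
 * x[rs1] = 2 and vs2 = {10, 20, 30, 40} (illustrative values) all active
 * elements of vd become 30.
 */
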
5264 /* Vector Compress Instruction */
5265 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5266 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5267                   CPURISCVState *env, uint32_t desc)                      \
5268 {                                                                         \
5269     uint32_t vl = env->vl;                                                \
5270     uint32_t esz = sizeof(ETYPE);                                         \
5271     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5272     uint32_t vta = vext_vta(desc);                                        \
5273     uint32_t num = 0, i;                                                  \
5274                                                                           \
5275     for (i = env->vstart; i < vl; i++) {                                  \
5276         if (!vext_elem_mask(vs1, i)) {                                    \
5277             continue;                                                     \
5278         }                                                                 \
5279         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5280         num++;                                                            \
5281     }                                                                     \
5282     env->vstart = 0;                                                      \
5283     /* set tail elements to 1s */                                         \
5284     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5285 }
5286 
5287 /* Compress into vd the elements of vs2 whose vs1 mask bit is set */
5288 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5289 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5290 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5291 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5292 
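/*
 * Illustrative example (values are for exposition only): with vl = 4,
 * mask bits in vs1 = {1, 0, 1, 1} and vs2 = {10, 20, 30, 40}, the loop
 * above packs vd = {10, 30, 40, ...}: three elements are written, the
 * elements between num and vl are left untouched by this helper, and the
 * tail from vl upward is filled with 1s only when vta is set.
 */
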
5293 /* Vector Whole Register Move */
5294 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5295 {
5296     /* EEW = SEW */
5297     uint32_t maxsz = simd_maxsz(desc);
5298     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5299     uint32_t startb = env->vstart * sewb;
5300     uint32_t i = startb;
5301 
5302     memcpy((uint8_t *)vd + H1(i),
5303            (uint8_t *)vs2 + H1(i),
5304            maxsz - startb);
5305 
5306     env->vstart = 0;
5307 }
5308 
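/*
 * simd_maxsz(desc) is expected to carry the byte size of the whole
 * register group being moved (NF * VLEN/8, as set up at translation
 * time), so with vstart = 0 the helper is a single memcpy of that many
 * bytes; e.g. vmv2r.v with VLEN = 128 copies 32 bytes (illustrative
 * figures).  A non-zero vstart, expressed in SEW-wide elements, merely
 * offsets the copy by startb bytes.
 */
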
5309 /* Vector Integer Extension */
5310 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5311 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5312                   CPURISCVState *env, uint32_t desc)             \
5313 {                                                                \
5314     uint32_t vl = env->vl;                                       \
5315     uint32_t vm = vext_vm(desc);                                 \
5316     uint32_t esz = sizeof(ETYPE);                                \
5317     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5318     uint32_t vta = vext_vta(desc);                               \
5319     uint32_t vma = vext_vma(desc);                               \
5320     uint32_t i;                                                  \
5321                                                                  \
5322     for (i = env->vstart; i < vl; i++) {                         \
5323         if (!vm && !vext_elem_mask(v0, i)) {                     \
5324             /* set masked-off elements to 1s */                  \
5325             vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);  \
5326             continue;                                            \
5327         }                                                        \
5328         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5329     }                                                            \
5330     env->vstart = 0;                                             \
5331     /* set tail elements to 1s */                                \
5332     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5333 }
5334 
5335 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5336 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5337 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5338 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5339 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5340 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5341 
5342 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5343 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5344 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5345 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5346 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5347 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)
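
/*
 * Illustrative examples (values are for exposition only): vzext.vf4 with
 * SEW = 32 turns a source byte 0x80 into 0x00000080, while vsext.vf2 with
 * SEW = 16 sign-extends the same byte to 0xff80.  The vfN suffix gives
 * the ratio between destination and source element widths, which is why
 * each instantiation above pairs a wide ETYPE with a DTYPE 1/N its size.
 */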
5348