/*
 * RISC-V Vector Extension Helpers for QEMU.
 *
 * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2 or later, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/bitops.h"
#include "cpu.h"
#include "exec/memop.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
#include "tcg/tcg-gvec-desc.h"
#include "internals.h"
#include <math.h>

target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
                            target_ulong s2)
{
    int vlmax, vl;
    RISCVCPU *cpu = env_archcpu(env);
    uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
    uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
    uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
    int xlen = riscv_cpu_xlen(env);
    bool vill = (s2 >> (xlen - 1)) & 0x1;
    target_ulong reserved = s2 &
                            MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
                                            xlen - 1 - R_VTYPE_RESERVED_SHIFT);

    if (lmul & 4) {
        /* Fractional LMUL. */
        if (lmul == 4 ||
            cpu->cfg.elen >> (8 - lmul) < sew) {
            vill = true;
        }
    }

    if ((sew > cpu->cfg.elen) || vill || (ediv != 0) || (reserved != 0)) {
        /* only set vill bit. */
        env->vill = 1;
        env->vtype = 0;
        env->vl = 0;
        env->vstart = 0;
        return 0;
    }

    vlmax = vext_get_vlmax(cpu, s2);
    if (s1 <= vlmax) {
        vl = s1;
    } else {
        vl = vlmax;
    }
    env->vl = vl;
    env->vtype = s2;
    env->vstart = 0;
    env->vill = 0;
    return vl;
}
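/*
 * Worked example for the clamp above (illustrative numbers only, not taken
 * from the spec text): with VLEN = 128 bits, SEW = 32 and LMUL = 2,
 * VLMAX = LMUL * VLEN / SEW = 8, so a requested AVL of s1 = 10 yields vl = 8.
 */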
/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that need a host-endian fixup.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#define H8(x)   ((x))
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#define H8(x)   (x)
#endif

static inline uint32_t vext_nf(uint32_t desc)
{
    return FIELD_EX32(simd_data(desc), VDATA, NF);
}

static inline uint32_t vext_vm(uint32_t desc)
{
    return FIELD_EX32(simd_data(desc), VDATA, VM);
}

/*
 * Encode LMUL to lmul as follows:
 *     LMUL    vlmul    lmul
 *      1       000       0
 *      2       001       1
 *      4       010       2
 *      8       011       3
 *      -       100       -
 *     1/8      101      -3
 *     1/4      110      -2
 *     1/2      111      -1
 */
static inline int32_t vext_lmul(uint32_t desc)
{
    return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
}

static inline uint32_t vext_vta(uint32_t desc)
{
    return FIELD_EX32(simd_data(desc), VDATA, VTA);
}

static inline uint32_t vext_vma(uint32_t desc)
{
    return FIELD_EX32(simd_data(desc), VDATA, VMA);
}

static inline uint32_t vext_vta_all_1s(uint32_t desc)
{
    return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
}

/*
 * Get the maximum number of elements that can be operated on.
 *
 * log2_esz: log2 of element size in bytes.
 */
static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
{
    /*
     * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
     * so vlen in bytes (vlenb) is encoded as maxsz.
     */
    uint32_t vlenb = simd_maxsz(desc);

    /* Return VLMAX */
    int scale = vext_lmul(desc) - log2_esz;
    return scale < 0 ? vlenb >> -scale : vlenb << scale;
}

/*
 * Get the total number of elements, including prestart, body and tail
 * elements.  Note that when LMUL < 1, the tail includes the elements past
 * VLMAX that are held in the same vector register.
 */
static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
                                            uint32_t esz)
{
    uint32_t vlenb = simd_maxsz(desc);
    uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
    int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
                  ctzl(esz) - ctzl(sew) + vext_lmul(desc);
    return (vlenb << emul) / esz;
}

static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
{
    return (addr & env->cur_pmmask) | env->cur_pmbase;
}
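/*
 * Worked example for vext_max_elems() above (illustrative numbers only):
 * with vlenb = 16 (VLEN = 128), LMUL = 1/2 (lmul = -1) and 16-bit elements
 * (log2_esz = 1), scale = -2 and VLMAX = 16 >> 2 = 4.
 */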
/*
 * This function checks watchpoints before the real load operation.
 *
 * In softmmu mode, the TLB API probe_access is enough for the watchpoint
 * check.  In user mode, there is no watchpoint support now.
 *
 * It will trigger an exception if there is no mapping in the TLB
 * and the page table walk can't fill the TLB entry.  The guest software
 * can then return here after processing the exception, or never return.
 */
static void probe_pages(CPURISCVState *env, target_ulong addr,
                        target_ulong len, uintptr_t ra,
                        MMUAccessType access_type)
{
    target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
    target_ulong curlen = MIN(pagelen, len);

    probe_access(env, adjust_addr(env, addr), curlen, access_type,
                 cpu_mmu_index(env, false), ra);
    if (len > curlen) {
        addr += curlen;
        curlen = len - curlen;
        probe_access(env, adjust_addr(env, addr), curlen, access_type,
                     cpu_mmu_index(env, false), ra);
    }
}

/* set agnostic elements to 1s */
static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
                              uint32_t tot)
{
    if (is_agnostic == 0) {
        /* policy undisturbed */
        return;
    }
    if (tot - cnt == 0) {
        return;
    }
    memset(base + cnt, -1, tot - cnt);
}

static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    int idx = index / 64;
    int pos = index % 64;
    uint64_t old = ((uint64_t *)v0)[idx];
    ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
}

/*
 * Earlier designs (pre-0.9) had a varying number of bits
 * per mask value (MLEN).  In the 0.9 design, MLEN=1.
 * (Section 4.5)
 */
static inline int vext_elem_mask(void *v0, int index)
{
    int idx = index / 64;
    int pos = index % 64;
    return (((uint64_t *)v0)[idx] >> pos) & 1;
}

/* elements operations for load and store */
typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);

#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
}

GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)

#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
static void NAME(CPURISCVState *env, abi_ptr addr,         \
                 uint32_t idx, void *vd, uintptr_t retaddr)\
{                                                          \
    ETYPE data = *((ETYPE *)vd + H(idx));                  \
    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
}

GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)

static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
                                   void *vd, uint32_t desc, uint32_t nf,
                                   uint32_t esz, uint32_t max_elems)
{
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
    uint32_t vta = vext_vta(desc);
    uint32_t registers_used;
    int k;

    for (k = 0; k < nf; ++k) {
        vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
                          (k * max_elems + max_elems) * esz);
    }

    if (nf * max_elems % total_elems != 0) {
        registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
                          registers_used * vlenb);
    }
}

/*
 * stride: access vector element from strided memory
 */
static void
vext_ldst_stride(void *vd, void *v0, target_ulong base,
                 target_ulong stride, CPURISCVState *env,
                 uint32_t desc, uint32_t vm,
                 vext_ldst_elem_fn *ldst_elem,
                 uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
}

#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)

#define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  target_ulong stride, CPURISCVState *env,              \
                  uint32_t desc)                                        \
{                                                                       \
    uint32_t vm = vext_vm(desc);                                        \
    vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}

GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)

/*
 * unit-stride: access elements stored contiguously in memory
 */

/* unmasked unit-stride load and store operation */
static void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
             uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;

    /* load bytes from guest memory */
    for (i = env->vstart; i < evl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            target_ulong addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
}

/*
 * A masked unit-stride load or store operation is a special case of a
 * strided access with stride = NF * sizeof(ETYPE).
 */

#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
                         CPURISCVState *env, uint32_t desc)             \
{                                                                       \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}                                                                       \
                                                                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  CPURISCVState *env, uint32_t desc)                    \
{                                                                       \
    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
}

GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                           \
void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
                         CPURISCVState *env, uint32_t desc)             \
{                                                                       \
    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,  \
                     ctzl(sizeof(ETYPE)), GETPC());                     \
}                                                                       \
                                                                        \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  CPURISCVState *env, uint32_t desc)                    \
{                                                                       \
    vext_ldst_us(vd, base, env, desc, STORE_FN,                         \
                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
}

GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)

/*
 * unit stride mask load and store, EEW = 1
 */
void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
                   CPURISCVState *env, uint32_t desc)
{
    /* evl = ceil(vl/8) */
    uint8_t evl = (env->vl + 7) >> 3;
    vext_ldst_us(vd, base, env, desc, lde_b,
                 0, evl, GETPC());
}

void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
                   CPURISCVState *env, uint32_t desc)
{
    /* evl = ceil(vl/8) */
    uint8_t evl = (env->vl + 7) >> 3;
    vext_ldst_us(vd, base, env, desc, ste_b,
                 0, evl, GETPC());
}

/*
 * index: access vector element from indexed memory
 */
typedef target_ulong vext_get_index_addr(target_ulong base,
                                         uint32_t idx, void *vs2);

#define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
static target_ulong NAME(target_ulong base,            \
                         uint32_t idx, void *vs2)      \
{                                                      \
    return (base + *((ETYPE *)vs2 + H(idx)));          \
}

GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)

static inline void
vext_ldst_index(void *vd, void *v0, target_ulong base,
                void *vs2, CPURISCVState *env, uint32_t desc,
                vext_get_index_addr get_index_addr,
                vext_ldst_elem_fn *ldst_elem,
                uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);

    /* load bytes from guest memory */
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
}

#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  void *vs2, CPURISCVState *env, uint32_t desc)         \
{                                                                       \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,             \
                    LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());             \
}

GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)

#define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)              \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                  void *vs2, CPURISCVState *env, uint32_t desc)         \
{                                                                       \
    vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,             \
                    STORE_FN, ctzl(sizeof(ETYPE)),                      \
                    GETPC());                                           \
}

GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)

/*
 * unit-stride fault-only-first load instructions
 */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base,
          CPURISCVState *env, uint32_t desc,
          vext_ldst_elem_fn *ldst_elem,
          uint32_t log2_esz, uintptr_t ra)
{
    void *host;
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t vma = vext_vma(desc);
    target_ulong addr, offset, remain;

    /* probe every access */
    for (i = env->vstart; i < env->vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* if it triggers an exception, no need to check watchpoint */
            remain = nf << log2_esz;
            while (remain > 0) {
                offset = -(addr | TARGET_PAGE_MASK);
                host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
                                         cpu_mmu_index(env, false));
                if (host) {
#ifdef CONFIG_USER_ONLY
                    if (page_check_range(addr, offset, PAGE_READ) < 0) {
                        vl = i;
                        goto ProbeSuccess;
                    }
#else
                    probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
#endif
                } else {
                    vl = i;
                    goto ProbeSuccess;
                }
                if (remain <= offset) {
                    break;
                }
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        env->vl = vl;
    }
    for (i = env->vstart; i < env->vl; i++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;

    vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
}

#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
                  CPURISCVState *env, uint32_t desc)      \
{                                                         \
    vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
              ctzl(sizeof(ETYPE)), GETPC());              \
}

GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)

#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  (N & M)
#define DO_XOR(N, M)  (N ^ M)
#define DO_OR(N, M)   (N | M)
#define DO_ADD(N, M)  (N + M)

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/* Unsigned min/max */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
#define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)

/*
 * load and store whole register instructions
 */
static void
vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
{
    uint32_t i, k, off, pos;
    uint32_t nf = vext_nf(desc);
    uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
    uint32_t max_elems = vlenb >> log2_esz;

    k = env->vstart / max_elems;
    off = env->vstart % max_elems;

    if (off) {
        /*
         * load/store the rest of the elements of the current segment
         * pointed to by vstart
         */
        for (pos = off; pos < max_elems; pos++, env->vstart++) {
            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
                      ra);
        }
        k++;
    }

    /* load/store elements for the rest of the segments */
    for (; k < nf; k++) {
        for (i = 0; i < max_elems; i++, env->vstart++) {
            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
        }
    }

    env->vstart = 0;
}

#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
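/*
 * Note (informal): for the whole-register forms above and below, the
 * register count (1/2/4/8) is presumed to be carried in the NF field of
 * desc by the translator, which is how vl1re*..vl8re* and vs1r..vs8r can
 * all share vext_ldst_whole().
 */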
#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
void HELPER(NAME)(void *vd, target_ulong base,       \
                  CPURISCVState *env, uint32_t desc) \
{                                                    \
    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
                    ctzl(sizeof(ETYPE)), GETPC());   \
}

GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)

/*
 * Vector Integer Arithmetic Instructions
 */

/* expand macro args before macro */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)

/* (TD, T1, T2, TX1, TX2) */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
#define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t

/* operation of two vector elements */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
{                                                               \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
    *((TD *)vd + HD(i)) = OP(s2, s1);                           \
}
#define DO_SUB(N, M) (N - M)
#define DO_RSUB(N, M) (M - N)

RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
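/*
 * For illustration (hand-expanded, not part of the build): the first
 * RVVCALL above generates roughly
 *
 *   static void do_vadd_vv_b(void *vd, void *vs1, void *vs2, int i)
 *   {
 *       int8_t s1 = *((int8_t *)vs1 + H1(i));
 *       int8_t s2 = *((int8_t *)vs2 + H1(i));
 *       *((int8_t *)vd + H1(i)) = s2 + s1;
 *   }
 */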
static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
                       CPURISCVState *env, uint32_t desc,
                       opivv2_fn *fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);
    uint32_t i;

    for (i = env->vstart; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* set masked-off elements to 1s */
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
            continue;
        }
        fn(vd, vs1, vs2, i);
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}

/* generate the helpers for OPIVV */
#define GEN_VEXT_VV(NAME, ESZ)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1,  \
                  void *vs2, CPURISCVState *env,  \
                  uint32_t desc)                  \
{                                                 \
    do_vext_vv(vd, v0, vs1, vs2, env, desc,       \
               do_##NAME, ESZ);                   \
}

GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)

typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);

/*
 * (T1)s1 gives the real operand type.
 * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
 */
#define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
    *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
}

RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)

static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
                       CPURISCVState *env, uint32_t desc,
                       opivx2_fn fn, uint32_t esz)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t vma = vext_vma(desc);
    uint32_t i;

    for (i = env->vstart; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* set masked-off elements to 1s */
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
            continue;
        }
        fn(vd, s1, vs2, i);
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
}

/* generate the helpers for OPIVX */
#define GEN_VEXT_VX(NAME, ESZ)                          \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,  \
                  void *vs2, CPURISCVState *env,        \
                  uint32_t desc)                        \
{                                                       \
    do_vext_vx(vd, v0, s1, vs2, env, desc,              \
               do_##NAME, ESZ);                         \
}

GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)

void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
    }
}

void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
    }
}

void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
    }
}

void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
    }
}

/* Vector Widening Integer Add/Subtract */
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_WUUU_B uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W int64_t, int32_t, int64_t, int64_t, int64_t
RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
GEN_VEXT_VV(vwaddu_vv_b, 2)
GEN_VEXT_VV(vwaddu_vv_h, 4)
GEN_VEXT_VV(vwaddu_vv_w, 8)
GEN_VEXT_VV(vwsubu_vv_b, 2)
GEN_VEXT_VV(vwsubu_vv_h, 4)
GEN_VEXT_VV(vwsubu_vv_w, 8)
GEN_VEXT_VV(vwadd_vv_b, 2)
GEN_VEXT_VV(vwadd_vv_h, 4)
GEN_VEXT_VV(vwadd_vv_w, 8)
GEN_VEXT_VV(vwsub_vv_b, 2)
GEN_VEXT_VV(vwsub_vv_h, 4)
GEN_VEXT_VV(vwsub_vv_w, 8)
GEN_VEXT_VV(vwaddu_wv_b, 2)
GEN_VEXT_VV(vwaddu_wv_h, 4)
GEN_VEXT_VV(vwaddu_wv_w, 8)
GEN_VEXT_VV(vwsubu_wv_b, 2)
GEN_VEXT_VV(vwsubu_wv_h, 4)
GEN_VEXT_VV(vwsubu_wv_w, 8)
GEN_VEXT_VV(vwadd_wv_b, 2)
GEN_VEXT_VV(vwadd_wv_h, 4)
GEN_VEXT_VV(vwadd_wv_w, 8)
GEN_VEXT_VV(vwsub_wv_b, 2)
GEN_VEXT_VV(vwsub_wv_h, 4)
GEN_VEXT_VV(vwsub_wv_w, 8)

RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
GEN_VEXT_VX(vwaddu_vx_b, 2)
GEN_VEXT_VX(vwaddu_vx_h, 4)
GEN_VEXT_VX(vwaddu_vx_w, 8)
GEN_VEXT_VX(vwsubu_vx_b, 2)
GEN_VEXT_VX(vwsubu_vx_h, 4)
GEN_VEXT_VX(vwsubu_vx_w, 8)
GEN_VEXT_VX(vwadd_vx_b, 2)
GEN_VEXT_VX(vwadd_vx_h, 4)
GEN_VEXT_VX(vwadd_vx_w, 8)
GEN_VEXT_VX(vwsub_vx_b, 2)
GEN_VEXT_VX(vwsub_vx_h, 4)
GEN_VEXT_VX(vwsub_vx_w, 8)
GEN_VEXT_VX(vwaddu_wx_b, 2)
GEN_VEXT_VX(vwaddu_wx_h, 4)
GEN_VEXT_VX(vwaddu_wx_w, 8)
GEN_VEXT_VX(vwsubu_wx_b, 2)
GEN_VEXT_VX(vwsubu_wx_h, 4)
GEN_VEXT_VX(vwsubu_wx_w, 8)
GEN_VEXT_VX(vwadd_wx_b, 2)
GEN_VEXT_VX(vwadd_wx_h, 4)
GEN_VEXT_VX(vwadd_wx_w, 8)
GEN_VEXT_VX(vwsub_wx_b, 2)
GEN_VEXT_VX(vwsub_wx_h, 4)
GEN_VEXT_VX(vwsub_wx_w, 8)

/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
#define DO_VADC(N, M, C) (N + M + C)
#define DO_VSBC(N, M, C) (N - M - C)
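/*
 * Note (informal reminder, not from the original comments): the vadc/vsbc
 * helpers below always read the carry/borrow-in from mask register v0,
 * which is why the loop takes vext_elem_mask(v0, i) unconditionally rather
 * than testing vm first.
 */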
#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = vext_elem_mask(v0, i);                  \
                                                              \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry);         \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}

GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)

#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    uint32_t vl = env->vl;                                               \
    uint32_t esz = sizeof(ETYPE);                                        \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);         \
    uint32_t vta = vext_vta(desc);                                       \
    uint32_t i;                                                          \
                                                                         \
    for (i = env->vstart; i < vl; i++) {                                 \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                               \
        ETYPE carry = vext_elem_mask(v0, i);                             \
                                                                         \
        *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);             \
}

GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)

#define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N : \
                          (__typeof(N))(N + M) < N)
#define DO_MSBC(N, M, C) (C ? N <= M : N < M)
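/*
 * Rationale (informal): DO_MADC detects the carry-out of N + M + C by
 * checking for unsigned wrap-around, e.g. for 8-bit operands
 * DO_MADC(0xff, 0x01, 0) is true because (uint8_t)(0xff + 0x01) < 0xff.
 * DO_MSBC likewise reports the borrow-out of N - M - C.
 */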
#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vl = env->vl;                                    \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        ETYPE carry = !vm && vext_elem_mask(v0, i);           \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry));      \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * the mask destination register is always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)

#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env, uint32_t desc) \
{                                                               \
    uint32_t vl = env->vl;                                      \
    uint32_t vm = vext_vm(desc);                                \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;            \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                \
    uint32_t i;                                                 \
                                                                \
    for (i = env->vstart; i < vl; i++) {                        \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                      \
        ETYPE carry = !vm && vext_elem_mask(v0, i);             \
        vext_set_elem_mask(vd, i,                               \
                DO_OP(s2, (ETYPE)(target_long)s1, carry));      \
    }                                                           \
    env->vstart = 0;                                            \
    /*
     * the mask destination register is always tail-agnostic
     * set tail elements to 1s
     */                                                         \
    if (vta_all_1s) {                                           \
        for (; i < total_elems; i++) {                          \
            vext_set_elem_mask(vd, i, 1);                       \
        }                                                       \
    }                                                           \
}

GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)

/* Vector Bitwise Logical Instructions */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)

RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)

/* Vector Single-Width Bit Shift Instructions */
#define DO_SLL(N, M)  (N << (M))
#define DO_SRL(N, M)  (N >> (M))

/* generate the helpers for shift instructions with two vector operands */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(TS1);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    uint32_t vma = vext_vma(desc);                                        \
    uint32_t i;                                                           \
                                                                          \
    for (i = env->vstart; i < vl; i++) {                                  \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            /* set masked-off elements to 1s */                           \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);           \
            continue;                                                     \
        }                                                                 \
        TS1 s1 = *((TS1 *)vs1 + HS1(i));                                  \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                                  \
        *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK);                        \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}

GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
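/*
 * Aside (not from the original comments): the MASK argument keeps only the
 * low log2(SEW) bits of the shift amount (0x7 for e8 ... 0x3f for e64), and
 * the arithmetic-shift variants reuse DO_SRL with a signed TS2, relying on
 * the host compiler's signed right shift being arithmetic.
 */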
/*
 * generate the helpers for shift instructions with one vector and one scalar
 */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
                  void *vs2, CPURISCVState *env,            \
                  uint32_t desc)                            \
{                                                           \
    uint32_t vm = vext_vm(desc);                            \
    uint32_t vl = env->vl;                                  \
    uint32_t esz = sizeof(TD);                              \
    uint32_t total_elems =                                  \
        vext_get_total_elems(env, desc, esz);               \
    uint32_t vta = vext_vta(desc);                          \
    uint32_t vma = vext_vma(desc);                          \
    uint32_t i;                                             \
                                                            \
    for (i = env->vstart; i < vl; i++) {                    \
        if (!vm && !vext_elem_mask(v0, i)) {                \
            /* set masked-off elements to 1s */             \
            vext_set_elems_1s(vd, vma, i * esz,             \
                              (i + 1) * esz);               \
            continue;                                       \
        }                                                   \
        TS2 s2 = *((TS2 *)vs2 + HS2(i));                    \
        *((TD *)vd + HD(i)) = OP(s2, s1 & MASK);            \
    }                                                       \
    env->vstart = 0;                                        \
    /* set tail elements to 1s */                           \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
}

GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)

/* Vector Narrowing Integer Right Shift Instructions */
GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)

/* Vector Integer Comparison Instructions */
#define DO_MSEQ(N, M) (N == M)
#define DO_MSNE(N, M) (N != M)
#define DO_MSLT(N, M) (N < M)
#define DO_MSLE(N, M) (N <= M)
#define DO_MSGT(N, M) (N > M)

#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;          \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t vma = vext_vma(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            /* set masked-off elements to 1s */               \
            if (vma) {                                        \
                vext_set_elem_mask(vd, i, 1);                 \
            }                                                 \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i, DO_OP(s2, s1));             \
    }                                                         \
    env->vstart = 0;                                          \
    /*
     * the mask destination register is always tail-agnostic
     * set tail elements to 1s
     */                                                       \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}

GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)

#define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            /* set masked-off elements to 1s */                     \
            if (vma) {                                              \
                vext_set_elem_mask(vd, i, 1);                       \
            }                                                       \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                DO_OP(s2, (ETYPE)(target_long)s1));                 \
    }                                                               \
    env->vstart = 0;                                                \
    /*
     * the mask destination register is always tail-agnostic
     * set tail elements to 1s
     */                                                             \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}

GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)

GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)

GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)

GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)

GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)

GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)

/* Vector Integer Min/Max Instructions */
RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
GEN_VEXT_VV(vminu_vv_b, 1)
GEN_VEXT_VV(vminu_vv_h, 2)
GEN_VEXT_VV(vminu_vv_w, 4)
GEN_VEXT_VV(vminu_vv_d, 8)
GEN_VEXT_VV(vmin_vv_b, 1)
GEN_VEXT_VV(vmin_vv_h, 2)
GEN_VEXT_VV(vmin_vv_w, 4)
GEN_VEXT_VV(vmin_vv_d, 8)
GEN_VEXT_VV(vmaxu_vv_b, 1)
GEN_VEXT_VV(vmaxu_vv_h, 2)
GEN_VEXT_VV(vmaxu_vv_w, 4)
GEN_VEXT_VV(vmaxu_vv_d, 8)
GEN_VEXT_VV(vmax_vv_b, 1)
GEN_VEXT_VV(vmax_vv_h, 2)
GEN_VEXT_VV(vmax_vv_w, 4)
GEN_VEXT_VV(vmax_vv_d, 8)

RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
H1, H1, DO_MAX) 1565 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX) 1566 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX) 1567 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX) 1568 GEN_VEXT_VX(vminu_vx_b, 1) 1569 GEN_VEXT_VX(vminu_vx_h, 2) 1570 GEN_VEXT_VX(vminu_vx_w, 4) 1571 GEN_VEXT_VX(vminu_vx_d, 8) 1572 GEN_VEXT_VX(vmin_vx_b, 1) 1573 GEN_VEXT_VX(vmin_vx_h, 2) 1574 GEN_VEXT_VX(vmin_vx_w, 4) 1575 GEN_VEXT_VX(vmin_vx_d, 8) 1576 GEN_VEXT_VX(vmaxu_vx_b, 1) 1577 GEN_VEXT_VX(vmaxu_vx_h, 2) 1578 GEN_VEXT_VX(vmaxu_vx_w, 4) 1579 GEN_VEXT_VX(vmaxu_vx_d, 8) 1580 GEN_VEXT_VX(vmax_vx_b, 1) 1581 GEN_VEXT_VX(vmax_vx_h, 2) 1582 GEN_VEXT_VX(vmax_vx_w, 4) 1583 GEN_VEXT_VX(vmax_vx_d, 8) 1584 1585 /* Vector Single-Width Integer Multiply Instructions */ 1586 #define DO_MUL(N, M) (N * M) 1587 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL) 1588 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL) 1589 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL) 1590 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL) 1591 GEN_VEXT_VV(vmul_vv_b, 1) 1592 GEN_VEXT_VV(vmul_vv_h, 2) 1593 GEN_VEXT_VV(vmul_vv_w, 4) 1594 GEN_VEXT_VV(vmul_vv_d, 8) 1595 1596 static int8_t do_mulh_b(int8_t s2, int8_t s1) 1597 { 1598 return (int16_t)s2 * (int16_t)s1 >> 8; 1599 } 1600 1601 static int16_t do_mulh_h(int16_t s2, int16_t s1) 1602 { 1603 return (int32_t)s2 * (int32_t)s1 >> 16; 1604 } 1605 1606 static int32_t do_mulh_w(int32_t s2, int32_t s1) 1607 { 1608 return (int64_t)s2 * (int64_t)s1 >> 32; 1609 } 1610 1611 static int64_t do_mulh_d(int64_t s2, int64_t s1) 1612 { 1613 uint64_t hi_64, lo_64; 1614 1615 muls64(&lo_64, &hi_64, s1, s2); 1616 return hi_64; 1617 } 1618 1619 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1) 1620 { 1621 return (uint16_t)s2 * (uint16_t)s1 >> 8; 1622 } 1623 1624 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1) 1625 { 1626 return (uint32_t)s2 * (uint32_t)s1 >> 16; 1627 } 1628 1629 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1) 1630 { 1631 return (uint64_t)s2 * (uint64_t)s1 >> 32; 1632 } 1633 1634 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1) 1635 { 1636 uint64_t hi_64, lo_64; 1637 1638 mulu64(&lo_64, &hi_64, s2, s1); 1639 return hi_64; 1640 } 1641 1642 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1) 1643 { 1644 return (int16_t)s2 * (uint16_t)s1 >> 8; 1645 } 1646 1647 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1) 1648 { 1649 return (int32_t)s2 * (uint32_t)s1 >> 16; 1650 } 1651 1652 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1) 1653 { 1654 return (int64_t)s2 * (uint64_t)s1 >> 32; 1655 } 1656 1657 /* 1658 * Let A = signed operand, 1659 * B = unsigned operand 1660 * P = mulu64(A, B), unsigned product 1661 * 1662 * LET X = 2 ** 64 - A, 2's complement of A 1663 * SP = signed product 1664 * THEN 1665 * IF A < 0 1666 * SP = -X * B 1667 * = -(2 ** 64 - A) * B 1668 * = A * B - 2 ** 64 * B 1669 * = P - 2 ** 64 * B 1670 * ELSE 1671 * SP = P 1672 * THEN 1673 * HI_P -= (A < 0 ? B : 0) 1674 */ 1675 1676 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1) 1677 { 1678 uint64_t hi_64, lo_64; 1679 1680 mulu64(&lo_64, &hi_64, s2, s1); 1681 1682 hi_64 -= s2 < 0 ? 
s1 : 0; 1683 return hi_64; 1684 } 1685 1686 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b) 1687 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h) 1688 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w) 1689 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d) 1690 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b) 1691 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h) 1692 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w) 1693 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d) 1694 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b) 1695 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h) 1696 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w) 1697 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d) 1698 GEN_VEXT_VV(vmulh_vv_b, 1) 1699 GEN_VEXT_VV(vmulh_vv_h, 2) 1700 GEN_VEXT_VV(vmulh_vv_w, 4) 1701 GEN_VEXT_VV(vmulh_vv_d, 8) 1702 GEN_VEXT_VV(vmulhu_vv_b, 1) 1703 GEN_VEXT_VV(vmulhu_vv_h, 2) 1704 GEN_VEXT_VV(vmulhu_vv_w, 4) 1705 GEN_VEXT_VV(vmulhu_vv_d, 8) 1706 GEN_VEXT_VV(vmulhsu_vv_b, 1) 1707 GEN_VEXT_VV(vmulhsu_vv_h, 2) 1708 GEN_VEXT_VV(vmulhsu_vv_w, 4) 1709 GEN_VEXT_VV(vmulhsu_vv_d, 8) 1710 1711 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL) 1712 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL) 1713 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL) 1714 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL) 1715 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b) 1716 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h) 1717 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w) 1718 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d) 1719 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b) 1720 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h) 1721 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w) 1722 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d) 1723 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b) 1724 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h) 1725 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w) 1726 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d) 1727 GEN_VEXT_VX(vmul_vx_b, 1) 1728 GEN_VEXT_VX(vmul_vx_h, 2) 1729 GEN_VEXT_VX(vmul_vx_w, 4) 1730 GEN_VEXT_VX(vmul_vx_d, 8) 1731 GEN_VEXT_VX(vmulh_vx_b, 1) 1732 GEN_VEXT_VX(vmulh_vx_h, 2) 1733 GEN_VEXT_VX(vmulh_vx_w, 4) 1734 GEN_VEXT_VX(vmulh_vx_d, 8) 1735 GEN_VEXT_VX(vmulhu_vx_b, 1) 1736 GEN_VEXT_VX(vmulhu_vx_h, 2) 1737 GEN_VEXT_VX(vmulhu_vx_w, 4) 1738 GEN_VEXT_VX(vmulhu_vx_d, 8) 1739 GEN_VEXT_VX(vmulhsu_vx_b, 1) 1740 GEN_VEXT_VX(vmulhsu_vx_h, 2) 1741 GEN_VEXT_VX(vmulhsu_vx_w, 4) 1742 GEN_VEXT_VX(vmulhsu_vx_d, 8) 1743 1744 /* Vector Integer Divide Instructions */ 1745 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M) 1746 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M) 1747 #define DO_DIV(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : \ 1748 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M) 1749 #define DO_REM(N, M) (unlikely(M == 0) ? N : \ 1750 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 
0 : N % M) 1751 1752 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU) 1753 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU) 1754 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU) 1755 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU) 1756 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV) 1757 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV) 1758 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV) 1759 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV) 1760 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU) 1761 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU) 1762 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU) 1763 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU) 1764 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM) 1765 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM) 1766 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM) 1767 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM) 1768 GEN_VEXT_VV(vdivu_vv_b, 1) 1769 GEN_VEXT_VV(vdivu_vv_h, 2) 1770 GEN_VEXT_VV(vdivu_vv_w, 4) 1771 GEN_VEXT_VV(vdivu_vv_d, 8) 1772 GEN_VEXT_VV(vdiv_vv_b, 1) 1773 GEN_VEXT_VV(vdiv_vv_h, 2) 1774 GEN_VEXT_VV(vdiv_vv_w, 4) 1775 GEN_VEXT_VV(vdiv_vv_d, 8) 1776 GEN_VEXT_VV(vremu_vv_b, 1) 1777 GEN_VEXT_VV(vremu_vv_h, 2) 1778 GEN_VEXT_VV(vremu_vv_w, 4) 1779 GEN_VEXT_VV(vremu_vv_d, 8) 1780 GEN_VEXT_VV(vrem_vv_b, 1) 1781 GEN_VEXT_VV(vrem_vv_h, 2) 1782 GEN_VEXT_VV(vrem_vv_w, 4) 1783 GEN_VEXT_VV(vrem_vv_d, 8) 1784 1785 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU) 1786 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU) 1787 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU) 1788 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU) 1789 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV) 1790 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV) 1791 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV) 1792 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV) 1793 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU) 1794 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU) 1795 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU) 1796 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU) 1797 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM) 1798 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM) 1799 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM) 1800 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM) 1801 GEN_VEXT_VX(vdivu_vx_b, 1) 1802 GEN_VEXT_VX(vdivu_vx_h, 2) 1803 GEN_VEXT_VX(vdivu_vx_w, 4) 1804 GEN_VEXT_VX(vdivu_vx_d, 8) 1805 GEN_VEXT_VX(vdiv_vx_b, 1) 1806 GEN_VEXT_VX(vdiv_vx_h, 2) 1807 GEN_VEXT_VX(vdiv_vx_w, 4) 1808 GEN_VEXT_VX(vdiv_vx_d, 8) 1809 GEN_VEXT_VX(vremu_vx_b, 1) 1810 GEN_VEXT_VX(vremu_vx_h, 2) 1811 GEN_VEXT_VX(vremu_vx_w, 4) 1812 GEN_VEXT_VX(vremu_vx_d, 8) 1813 GEN_VEXT_VX(vrem_vx_b, 1) 1814 GEN_VEXT_VX(vrem_vx_h, 2) 1815 GEN_VEXT_VX(vrem_vx_w, 4) 1816 GEN_VEXT_VX(vrem_vx_d, 8) 1817 1818 /* Vector Widening Integer Multiply Instructions */ 1819 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL) 1820 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL) 1821 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL) 1822 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL) 1823 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL) 1824 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL) 1825 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL) 1826 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, 
DO_MUL) 1827 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL) 1828 GEN_VEXT_VV(vwmul_vv_b, 2) 1829 GEN_VEXT_VV(vwmul_vv_h, 4) 1830 GEN_VEXT_VV(vwmul_vv_w, 8) 1831 GEN_VEXT_VV(vwmulu_vv_b, 2) 1832 GEN_VEXT_VV(vwmulu_vv_h, 4) 1833 GEN_VEXT_VV(vwmulu_vv_w, 8) 1834 GEN_VEXT_VV(vwmulsu_vv_b, 2) 1835 GEN_VEXT_VV(vwmulsu_vv_h, 4) 1836 GEN_VEXT_VV(vwmulsu_vv_w, 8) 1837 1838 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL) 1839 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL) 1840 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL) 1841 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL) 1842 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL) 1843 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL) 1844 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL) 1845 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL) 1846 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL) 1847 GEN_VEXT_VX(vwmul_vx_b, 2) 1848 GEN_VEXT_VX(vwmul_vx_h, 4) 1849 GEN_VEXT_VX(vwmul_vx_w, 8) 1850 GEN_VEXT_VX(vwmulu_vx_b, 2) 1851 GEN_VEXT_VX(vwmulu_vx_h, 4) 1852 GEN_VEXT_VX(vwmulu_vx_w, 8) 1853 GEN_VEXT_VX(vwmulsu_vx_b, 2) 1854 GEN_VEXT_VX(vwmulsu_vx_h, 4) 1855 GEN_VEXT_VX(vwmulsu_vx_w, 8) 1856 1857 /* Vector Single-Width Integer Multiply-Add Instructions */ 1858 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ 1859 static void do_##NAME(void *vd, void *vs1, void *vs2, int i) \ 1860 { \ 1861 TX1 s1 = *((T1 *)vs1 + HS1(i)); \ 1862 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 1863 TD d = *((TD *)vd + HD(i)); \ 1864 *((TD *)vd + HD(i)) = OP(s2, s1, d); \ 1865 } 1866 1867 #define DO_MACC(N, M, D) (M * N + D) 1868 #define DO_NMSAC(N, M, D) (-(M * N) + D) 1869 #define DO_MADD(N, M, D) (M * D + N) 1870 #define DO_NMSUB(N, M, D) (-(M * D) + N) 1871 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC) 1872 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC) 1873 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC) 1874 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC) 1875 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC) 1876 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC) 1877 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC) 1878 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC) 1879 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD) 1880 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD) 1881 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD) 1882 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD) 1883 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB) 1884 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB) 1885 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB) 1886 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB) 1887 GEN_VEXT_VV(vmacc_vv_b, 1) 1888 GEN_VEXT_VV(vmacc_vv_h, 2) 1889 GEN_VEXT_VV(vmacc_vv_w, 4) 1890 GEN_VEXT_VV(vmacc_vv_d, 8) 1891 GEN_VEXT_VV(vnmsac_vv_b, 1) 1892 GEN_VEXT_VV(vnmsac_vv_h, 2) 1893 GEN_VEXT_VV(vnmsac_vv_w, 4) 1894 GEN_VEXT_VV(vnmsac_vv_d, 8) 1895 GEN_VEXT_VV(vmadd_vv_b, 1) 1896 GEN_VEXT_VV(vmadd_vv_h, 2) 1897 GEN_VEXT_VV(vmadd_vv_w, 4) 1898 GEN_VEXT_VV(vmadd_vv_d, 8) 1899 GEN_VEXT_VV(vnmsub_vv_b, 1) 1900 GEN_VEXT_VV(vnmsub_vv_h, 2) 1901 GEN_VEXT_VV(vnmsub_vv_w, 4) 1902 GEN_VEXT_VV(vnmsub_vv_d, 8) 1903 1904 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ 1905 static void do_##NAME(void *vd, target_long s1, void *vs2, int i) \ 1906 { \ 1907 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 1908 TD d = *((TD *)vd 
+ HD(i)); \ 1909 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d); \ 1910 } 1911 1912 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC) 1913 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC) 1914 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC) 1915 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC) 1916 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC) 1917 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC) 1918 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC) 1919 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC) 1920 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD) 1921 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD) 1922 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD) 1923 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD) 1924 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB) 1925 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB) 1926 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB) 1927 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB) 1928 GEN_VEXT_VX(vmacc_vx_b, 1) 1929 GEN_VEXT_VX(vmacc_vx_h, 2) 1930 GEN_VEXT_VX(vmacc_vx_w, 4) 1931 GEN_VEXT_VX(vmacc_vx_d, 8) 1932 GEN_VEXT_VX(vnmsac_vx_b, 1) 1933 GEN_VEXT_VX(vnmsac_vx_h, 2) 1934 GEN_VEXT_VX(vnmsac_vx_w, 4) 1935 GEN_VEXT_VX(vnmsac_vx_d, 8) 1936 GEN_VEXT_VX(vmadd_vx_b, 1) 1937 GEN_VEXT_VX(vmadd_vx_h, 2) 1938 GEN_VEXT_VX(vmadd_vx_w, 4) 1939 GEN_VEXT_VX(vmadd_vx_d, 8) 1940 GEN_VEXT_VX(vnmsub_vx_b, 1) 1941 GEN_VEXT_VX(vnmsub_vx_h, 2) 1942 GEN_VEXT_VX(vnmsub_vx_w, 4) 1943 GEN_VEXT_VX(vnmsub_vx_d, 8) 1944 1945 /* Vector Widening Integer Multiply-Add Instructions */ 1946 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC) 1947 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC) 1948 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC) 1949 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC) 1950 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC) 1951 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC) 1952 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC) 1953 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC) 1954 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC) 1955 GEN_VEXT_VV(vwmaccu_vv_b, 2) 1956 GEN_VEXT_VV(vwmaccu_vv_h, 4) 1957 GEN_VEXT_VV(vwmaccu_vv_w, 8) 1958 GEN_VEXT_VV(vwmacc_vv_b, 2) 1959 GEN_VEXT_VV(vwmacc_vv_h, 4) 1960 GEN_VEXT_VV(vwmacc_vv_w, 8) 1961 GEN_VEXT_VV(vwmaccsu_vv_b, 2) 1962 GEN_VEXT_VV(vwmaccsu_vv_h, 4) 1963 GEN_VEXT_VV(vwmaccsu_vv_w, 8) 1964 1965 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC) 1966 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC) 1967 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC) 1968 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC) 1969 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC) 1970 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC) 1971 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC) 1972 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC) 1973 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC) 1974 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC) 1975 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC) 1976 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC) 1977 GEN_VEXT_VX(vwmaccu_vx_b, 2) 1978 GEN_VEXT_VX(vwmaccu_vx_h, 4) 1979 GEN_VEXT_VX(vwmaccu_vx_w, 8) 1980 GEN_VEXT_VX(vwmacc_vx_b, 2) 1981 GEN_VEXT_VX(vwmacc_vx_h, 4) 1982 GEN_VEXT_VX(vwmacc_vx_w, 8) 1983 GEN_VEXT_VX(vwmaccsu_vx_b, 2) 1984 
GEN_VEXT_VX(vwmaccsu_vx_h, 4) 1985 GEN_VEXT_VX(vwmaccsu_vx_w, 8) 1986 GEN_VEXT_VX(vwmaccus_vx_b, 2) 1987 GEN_VEXT_VX(vwmaccus_vx_h, 4) 1988 GEN_VEXT_VX(vwmaccus_vx_w, 8) 1989 1990 /* Vector Integer Merge and Move Instructions */ 1991 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H) \ 1992 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \ 1993 uint32_t desc) \ 1994 { \ 1995 uint32_t vl = env->vl; \ 1996 uint32_t esz = sizeof(ETYPE); \ 1997 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ 1998 uint32_t vta = vext_vta(desc); \ 1999 uint32_t i; \ 2000 \ 2001 for (i = env->vstart; i < vl; i++) { \ 2002 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \ 2003 *((ETYPE *)vd + H(i)) = s1; \ 2004 } \ 2005 env->vstart = 0; \ 2006 /* set tail elements to 1s */ \ 2007 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ 2008 } 2009 2010 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t, H1) 2011 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2) 2012 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4) 2013 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8) 2014 2015 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H) \ 2016 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \ 2017 uint32_t desc) \ 2018 { \ 2019 uint32_t vl = env->vl; \ 2020 uint32_t esz = sizeof(ETYPE); \ 2021 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ 2022 uint32_t vta = vext_vta(desc); \ 2023 uint32_t i; \ 2024 \ 2025 for (i = env->vstart; i < vl; i++) { \ 2026 *((ETYPE *)vd + H(i)) = (ETYPE)s1; \ 2027 } \ 2028 env->vstart = 0; \ 2029 /* set tail elements to 1s */ \ 2030 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ 2031 } 2032 2033 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t, H1) 2034 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2) 2035 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4) 2036 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8) 2037 2038 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \ 2039 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ 2040 CPURISCVState *env, uint32_t desc) \ 2041 { \ 2042 uint32_t vl = env->vl; \ 2043 uint32_t esz = sizeof(ETYPE); \ 2044 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ 2045 uint32_t vta = vext_vta(desc); \ 2046 uint32_t i; \ 2047 \ 2048 for (i = env->vstart; i < vl; i++) { \ 2049 ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1); \ 2050 *((ETYPE *)vd + H(i)) = *(vt + H(i)); \ 2051 } \ 2052 env->vstart = 0; \ 2053 /* set tail elements to 1s */ \ 2054 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ 2055 } 2056 2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t, H1) 2058 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2) 2059 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4) 2060 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8) 2061 2062 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H) \ 2063 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ 2064 void *vs2, CPURISCVState *env, uint32_t desc) \ 2065 { \ 2066 uint32_t vl = env->vl; \ 2067 uint32_t esz = sizeof(ETYPE); \ 2068 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \ 2069 uint32_t vta = vext_vta(desc); \ 2070 uint32_t i; \ 2071 \ 2072 for (i = env->vstart; i < vl; i++) { \ 2073 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \ 2074 ETYPE d = (!vext_elem_mask(v0, i) ? 
s2 : \ 2075 (ETYPE)(target_long)s1); \ 2076 *((ETYPE *)vd + H(i)) = d; \ 2077 } \ 2078 env->vstart = 0; \ 2079 /* set tail elements to 1s */ \ 2080 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ 2081 } 2082 2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t, H1) 2084 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2) 2085 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4) 2086 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8) 2087 2088 /* 2089 * Vector Fixed-Point Arithmetic Instructions 2090 */ 2091 2092 /* Vector Single-Width Saturating Add and Subtract */ 2093 2094 /* 2095 * As fixed point instructions probably have round mode and saturation, 2096 * define common macros for fixed point here. 2097 */ 2098 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i, 2099 CPURISCVState *env, int vxrm); 2100 2101 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ 2102 static inline void \ 2103 do_##NAME(void *vd, void *vs1, void *vs2, int i, \ 2104 CPURISCVState *env, int vxrm) \ 2105 { \ 2106 TX1 s1 = *((T1 *)vs1 + HS1(i)); \ 2107 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 2108 *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1); \ 2109 } 2110 2111 static inline void 2112 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2, 2113 CPURISCVState *env, 2114 uint32_t vl, uint32_t vm, int vxrm, 2115 opivv2_rm_fn *fn, uint32_t vma, uint32_t esz) 2116 { 2117 for (uint32_t i = env->vstart; i < vl; i++) { 2118 if (!vm && !vext_elem_mask(v0, i)) { 2119 /* set masked-off elements to 1s */ 2120 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); 2121 continue; 2122 } 2123 fn(vd, vs1, vs2, i, env, vxrm); 2124 } 2125 env->vstart = 0; 2126 } 2127 2128 static inline void 2129 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2, 2130 CPURISCVState *env, 2131 uint32_t desc, 2132 opivv2_rm_fn *fn, uint32_t esz) 2133 { 2134 uint32_t vm = vext_vm(desc); 2135 uint32_t vl = env->vl; 2136 uint32_t total_elems = vext_get_total_elems(env, desc, esz); 2137 uint32_t vta = vext_vta(desc); 2138 uint32_t vma = vext_vma(desc); 2139 2140 switch (env->vxrm) { 2141 case 0: /* rnu */ 2142 vext_vv_rm_1(vd, v0, vs1, vs2, 2143 env, vl, vm, 0, fn, vma, esz); 2144 break; 2145 case 1: /* rne */ 2146 vext_vv_rm_1(vd, v0, vs1, vs2, 2147 env, vl, vm, 1, fn, vma, esz); 2148 break; 2149 case 2: /* rdn */ 2150 vext_vv_rm_1(vd, v0, vs1, vs2, 2151 env, vl, vm, 2, fn, vma, esz); 2152 break; 2153 default: /* rod */ 2154 vext_vv_rm_1(vd, v0, vs1, vs2, 2155 env, vl, vm, 3, fn, vma, esz); 2156 break; 2157 } 2158 /* set tail elements to 1s */ 2159 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); 2160 } 2161 2162 /* generate helpers for fixed point instructions with OPIVV format */ 2163 #define GEN_VEXT_VV_RM(NAME, ESZ) \ 2164 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ 2165 CPURISCVState *env, uint32_t desc) \ 2166 { \ 2167 vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, \ 2168 do_##NAME, ESZ); \ 2169 } 2170 2171 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, 2172 uint8_t b) 2173 { 2174 uint8_t res = a + b; 2175 if (res < a) { 2176 res = UINT8_MAX; 2177 env->vxsat = 0x1; 2178 } 2179 return res; 2180 } 2181 2182 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a, 2183 uint16_t b) 2184 { 2185 uint16_t res = a + b; 2186 if (res < a) { 2187 res = UINT16_MAX; 2188 env->vxsat = 0x1; 2189 } 2190 return res; 2191 } 2192 2193 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a, 2194 uint32_t b) 2195 { 2196 uint32_t res = a + b; 2197 if (res < a) { 2198 res = UINT32_MAX; 
2199 env->vxsat = 0x1; 2200 } 2201 return res; 2202 } 2203 2204 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a, 2205 uint64_t b) 2206 { 2207 uint64_t res = a + b; 2208 if (res < a) { 2209 res = UINT64_MAX; 2210 env->vxsat = 0x1; 2211 } 2212 return res; 2213 } 2214 2215 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8) 2216 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16) 2217 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32) 2218 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64) 2219 GEN_VEXT_VV_RM(vsaddu_vv_b, 1) 2220 GEN_VEXT_VV_RM(vsaddu_vv_h, 2) 2221 GEN_VEXT_VV_RM(vsaddu_vv_w, 4) 2222 GEN_VEXT_VV_RM(vsaddu_vv_d, 8) 2223 2224 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i, 2225 CPURISCVState *env, int vxrm); 2226 2227 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ 2228 static inline void \ 2229 do_##NAME(void *vd, target_long s1, void *vs2, int i, \ 2230 CPURISCVState *env, int vxrm) \ 2231 { \ 2232 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 2233 *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1); \ 2234 } 2235 2236 static inline void 2237 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2, 2238 CPURISCVState *env, 2239 uint32_t vl, uint32_t vm, int vxrm, 2240 opivx2_rm_fn *fn, uint32_t vma, uint32_t esz) 2241 { 2242 for (uint32_t i = env->vstart; i < vl; i++) { 2243 if (!vm && !vext_elem_mask(v0, i)) { 2244 /* set masked-off elements to 1s */ 2245 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); 2246 continue; 2247 } 2248 fn(vd, s1, vs2, i, env, vxrm); 2249 } 2250 env->vstart = 0; 2251 } 2252 2253 static inline void 2254 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2, 2255 CPURISCVState *env, 2256 uint32_t desc, 2257 opivx2_rm_fn *fn, uint32_t esz) 2258 { 2259 uint32_t vm = vext_vm(desc); 2260 uint32_t vl = env->vl; 2261 uint32_t total_elems = vext_get_total_elems(env, desc, esz); 2262 uint32_t vta = vext_vta(desc); 2263 uint32_t vma = vext_vma(desc); 2264 2265 switch (env->vxrm) { 2266 case 0: /* rnu */ 2267 vext_vx_rm_1(vd, v0, s1, vs2, 2268 env, vl, vm, 0, fn, vma, esz); 2269 break; 2270 case 1: /* rne */ 2271 vext_vx_rm_1(vd, v0, s1, vs2, 2272 env, vl, vm, 1, fn, vma, esz); 2273 break; 2274 case 2: /* rdn */ 2275 vext_vx_rm_1(vd, v0, s1, vs2, 2276 env, vl, vm, 2, fn, vma, esz); 2277 break; 2278 default: /* rod */ 2279 vext_vx_rm_1(vd, v0, s1, vs2, 2280 env, vl, vm, 3, fn, vma, esz); 2281 break; 2282 } 2283 /* set tail elements to 1s */ 2284 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); 2285 } 2286 2287 /* generate helpers for fixed point instructions with OPIVX format */ 2288 #define GEN_VEXT_VX_RM(NAME, ESZ) \ 2289 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \ 2290 void *vs2, CPURISCVState *env, \ 2291 uint32_t desc) \ 2292 { \ 2293 vext_vx_rm_2(vd, v0, s1, vs2, env, desc, \ 2294 do_##NAME, ESZ); \ 2295 } 2296 2297 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8) 2298 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16) 2299 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32) 2300 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64) 2301 GEN_VEXT_VX_RM(vsaddu_vx_b, 1) 2302 GEN_VEXT_VX_RM(vsaddu_vx_h, 2) 2303 GEN_VEXT_VX_RM(vsaddu_vx_w, 4) 2304 GEN_VEXT_VX_RM(vsaddu_vx_d, 8) 2305 2306 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) 2307 { 2308 int8_t res = a + b; 2309 if ((res ^ a) & (res ^ b) & INT8_MIN) { 2310 res = a > 0 ? 
INT8_MAX : INT8_MIN; 2311 env->vxsat = 0x1; 2312 } 2313 return res; 2314 } 2315 2316 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, 2317 int16_t b) 2318 { 2319 int16_t res = a + b; 2320 if ((res ^ a) & (res ^ b) & INT16_MIN) { 2321 res = a > 0 ? INT16_MAX : INT16_MIN; 2322 env->vxsat = 0x1; 2323 } 2324 return res; 2325 } 2326 2327 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, 2328 int32_t b) 2329 { 2330 int32_t res = a + b; 2331 if ((res ^ a) & (res ^ b) & INT32_MIN) { 2332 res = a > 0 ? INT32_MAX : INT32_MIN; 2333 env->vxsat = 0x1; 2334 } 2335 return res; 2336 } 2337 2338 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, 2339 int64_t b) 2340 { 2341 int64_t res = a + b; 2342 if ((res ^ a) & (res ^ b) & INT64_MIN) { 2343 res = a > 0 ? INT64_MAX : INT64_MIN; 2344 env->vxsat = 0x1; 2345 } 2346 return res; 2347 } 2348 2349 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8) 2350 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16) 2351 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32) 2352 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64) 2353 GEN_VEXT_VV_RM(vsadd_vv_b, 1) 2354 GEN_VEXT_VV_RM(vsadd_vv_h, 2) 2355 GEN_VEXT_VV_RM(vsadd_vv_w, 4) 2356 GEN_VEXT_VV_RM(vsadd_vv_d, 8) 2357 2358 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8) 2359 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16) 2360 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32) 2361 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64) 2362 GEN_VEXT_VX_RM(vsadd_vx_b, 1) 2363 GEN_VEXT_VX_RM(vsadd_vx_h, 2) 2364 GEN_VEXT_VX_RM(vsadd_vx_w, 4) 2365 GEN_VEXT_VX_RM(vsadd_vx_d, 8) 2366 2367 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, 2368 uint8_t b) 2369 { 2370 uint8_t res = a - b; 2371 if (res > a) { 2372 res = 0; 2373 env->vxsat = 0x1; 2374 } 2375 return res; 2376 } 2377 2378 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a, 2379 uint16_t b) 2380 { 2381 uint16_t res = a - b; 2382 if (res > a) { 2383 res = 0; 2384 env->vxsat = 0x1; 2385 } 2386 return res; 2387 } 2388 2389 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a, 2390 uint32_t b) 2391 { 2392 uint32_t res = a - b; 2393 if (res > a) { 2394 res = 0; 2395 env->vxsat = 0x1; 2396 } 2397 return res; 2398 } 2399 2400 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a, 2401 uint64_t b) 2402 { 2403 uint64_t res = a - b; 2404 if (res > a) { 2405 res = 0; 2406 env->vxsat = 0x1; 2407 } 2408 return res; 2409 } 2410 2411 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8) 2412 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16) 2413 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32) 2414 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64) 2415 GEN_VEXT_VV_RM(vssubu_vv_b, 1) 2416 GEN_VEXT_VV_RM(vssubu_vv_h, 2) 2417 GEN_VEXT_VV_RM(vssubu_vv_w, 4) 2418 GEN_VEXT_VV_RM(vssubu_vv_d, 8) 2419 2420 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8) 2421 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16) 2422 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32) 2423 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64) 2424 GEN_VEXT_VX_RM(vssubu_vx_b, 1) 2425 GEN_VEXT_VX_RM(vssubu_vx_h, 2) 2426 GEN_VEXT_VX_RM(vssubu_vx_w, 4) 2427 GEN_VEXT_VX_RM(vssubu_vx_d, 8) 2428 2429 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) 2430 { 2431 int8_t res = a - b; 2432 if ((res ^ a) & (a ^ b) & INT8_MIN) 
{ 2433 res = a >= 0 ? INT8_MAX : INT8_MIN; 2434 env->vxsat = 0x1; 2435 } 2436 return res; 2437 } 2438 2439 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, 2440 int16_t b) 2441 { 2442 int16_t res = a - b; 2443 if ((res ^ a) & (a ^ b) & INT16_MIN) { 2444 res = a >= 0 ? INT16_MAX : INT16_MIN; 2445 env->vxsat = 0x1; 2446 } 2447 return res; 2448 } 2449 2450 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, 2451 int32_t b) 2452 { 2453 int32_t res = a - b; 2454 if ((res ^ a) & (a ^ b) & INT32_MIN) { 2455 res = a >= 0 ? INT32_MAX : INT32_MIN; 2456 env->vxsat = 0x1; 2457 } 2458 return res; 2459 } 2460 2461 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, 2462 int64_t b) 2463 { 2464 int64_t res = a - b; 2465 if ((res ^ a) & (a ^ b) & INT64_MIN) { 2466 res = a >= 0 ? INT64_MAX : INT64_MIN; 2467 env->vxsat = 0x1; 2468 } 2469 return res; 2470 } 2471 2472 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8) 2473 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16) 2474 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32) 2475 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64) 2476 GEN_VEXT_VV_RM(vssub_vv_b, 1) 2477 GEN_VEXT_VV_RM(vssub_vv_h, 2) 2478 GEN_VEXT_VV_RM(vssub_vv_w, 4) 2479 GEN_VEXT_VV_RM(vssub_vv_d, 8) 2480 2481 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8) 2482 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16) 2483 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32) 2484 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64) 2485 GEN_VEXT_VX_RM(vssub_vx_b, 1) 2486 GEN_VEXT_VX_RM(vssub_vx_h, 2) 2487 GEN_VEXT_VX_RM(vssub_vx_w, 4) 2488 GEN_VEXT_VX_RM(vssub_vx_d, 8) 2489 2490 /* Vector Single-Width Averaging Add and Subtract */ 2491 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift) 2492 { 2493 uint8_t d = extract64(v, shift, 1); 2494 uint8_t d1; 2495 uint64_t D1, D2; 2496 2497 if (shift == 0 || shift > 64) { 2498 return 0; 2499 } 2500 2501 d1 = extract64(v, shift - 1, 1); 2502 D1 = extract64(v, 0, shift); 2503 if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */ 2504 return d1; 2505 } else if (vxrm == 1) { /* round-to-nearest-even */ 2506 if (shift > 1) { 2507 D2 = extract64(v, 0, shift - 1); 2508 return d1 & ((D2 != 0) | d); 2509 } else { 2510 return d1 & d; 2511 } 2512 } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */ 2513 return !d & (D1 != 0); 2514 } 2515 return 0; /* round-down (truncate) */ 2516 } 2517 2518 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, 2519 int32_t b) 2520 { 2521 int64_t res = (int64_t)a + b; 2522 uint8_t round = get_round(vxrm, res, 1); 2523 2524 return (res >> 1) + round; 2525 } 2526 2527 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, 2528 int64_t b) 2529 { 2530 int64_t res = a + b; 2531 uint8_t round = get_round(vxrm, res, 1); 2532 int64_t over = (res ^ a) & (res ^ b) & INT64_MIN; 2533 2534 /* With signed overflow, bit 64 is inverse of bit 63. 
*/ 2535 return ((res >> 1) ^ over) + round; 2536 } 2537 2538 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32) 2539 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32) 2540 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32) 2541 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64) 2542 GEN_VEXT_VV_RM(vaadd_vv_b, 1) 2543 GEN_VEXT_VV_RM(vaadd_vv_h, 2) 2544 GEN_VEXT_VV_RM(vaadd_vv_w, 4) 2545 GEN_VEXT_VV_RM(vaadd_vv_d, 8) 2546 2547 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32) 2548 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32) 2549 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32) 2550 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64) 2551 GEN_VEXT_VX_RM(vaadd_vx_b, 1) 2552 GEN_VEXT_VX_RM(vaadd_vx_h, 2) 2553 GEN_VEXT_VX_RM(vaadd_vx_w, 4) 2554 GEN_VEXT_VX_RM(vaadd_vx_d, 8) 2555 2556 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm, 2557 uint32_t a, uint32_t b) 2558 { 2559 uint64_t res = (uint64_t)a + b; 2560 uint8_t round = get_round(vxrm, res, 1); 2561 2562 return (res >> 1) + round; 2563 } 2564 2565 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm, 2566 uint64_t a, uint64_t b) 2567 { 2568 uint64_t res = a + b; 2569 uint8_t round = get_round(vxrm, res, 1); 2570 uint64_t over = (uint64_t)(res < a) << 63; 2571 2572 return ((res >> 1) | over) + round; 2573 } 2574 2575 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32) 2576 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32) 2577 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32) 2578 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64) 2579 GEN_VEXT_VV_RM(vaaddu_vv_b, 1) 2580 GEN_VEXT_VV_RM(vaaddu_vv_h, 2) 2581 GEN_VEXT_VV_RM(vaaddu_vv_w, 4) 2582 GEN_VEXT_VV_RM(vaaddu_vv_d, 8) 2583 2584 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32) 2585 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32) 2586 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32) 2587 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64) 2588 GEN_VEXT_VX_RM(vaaddu_vx_b, 1) 2589 GEN_VEXT_VX_RM(vaaddu_vx_h, 2) 2590 GEN_VEXT_VX_RM(vaaddu_vx_w, 4) 2591 GEN_VEXT_VX_RM(vaaddu_vx_d, 8) 2592 2593 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, 2594 int32_t b) 2595 { 2596 int64_t res = (int64_t)a - b; 2597 uint8_t round = get_round(vxrm, res, 1); 2598 2599 return (res >> 1) + round; 2600 } 2601 2602 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, 2603 int64_t b) 2604 { 2605 int64_t res = (int64_t)a - b; 2606 uint8_t round = get_round(vxrm, res, 1); 2607 int64_t over = (res ^ a) & (a ^ b) & INT64_MIN; 2608 2609 /* With signed overflow, bit 64 is inverse of bit 63. 
*/ 2610 return ((res >> 1) ^ over) + round; 2611 } 2612 2613 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32) 2614 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32) 2615 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32) 2616 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64) 2617 GEN_VEXT_VV_RM(vasub_vv_b, 1) 2618 GEN_VEXT_VV_RM(vasub_vv_h, 2) 2619 GEN_VEXT_VV_RM(vasub_vv_w, 4) 2620 GEN_VEXT_VV_RM(vasub_vv_d, 8) 2621 2622 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32) 2623 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32) 2624 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32) 2625 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64) 2626 GEN_VEXT_VX_RM(vasub_vx_b, 1) 2627 GEN_VEXT_VX_RM(vasub_vx_h, 2) 2628 GEN_VEXT_VX_RM(vasub_vx_w, 4) 2629 GEN_VEXT_VX_RM(vasub_vx_d, 8) 2630 2631 static inline uint32_t asubu32(CPURISCVState *env, int vxrm, 2632 uint32_t a, uint32_t b) 2633 { 2634 int64_t res = (int64_t)a - b; 2635 uint8_t round = get_round(vxrm, res, 1); 2636 2637 return (res >> 1) + round; 2638 } 2639 2640 static inline uint64_t asubu64(CPURISCVState *env, int vxrm, 2641 uint64_t a, uint64_t b) 2642 { 2643 uint64_t res = (uint64_t)a - b; 2644 uint8_t round = get_round(vxrm, res, 1); 2645 uint64_t over = (uint64_t)(res > a) << 63; 2646 2647 return ((res >> 1) | over) + round; 2648 } 2649 2650 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32) 2651 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32) 2652 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32) 2653 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64) 2654 GEN_VEXT_VV_RM(vasubu_vv_b, 1) 2655 GEN_VEXT_VV_RM(vasubu_vv_h, 2) 2656 GEN_VEXT_VV_RM(vasubu_vv_w, 4) 2657 GEN_VEXT_VV_RM(vasubu_vv_d, 8) 2658 2659 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32) 2660 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32) 2661 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32) 2662 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64) 2663 GEN_VEXT_VX_RM(vasubu_vx_b, 1) 2664 GEN_VEXT_VX_RM(vasubu_vx_h, 2) 2665 GEN_VEXT_VX_RM(vasubu_vx_w, 4) 2666 GEN_VEXT_VX_RM(vasubu_vx_d, 8) 2667 2668 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */ 2669 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) 2670 { 2671 uint8_t round; 2672 int16_t res; 2673 2674 res = (int16_t)a * (int16_t)b; 2675 round = get_round(vxrm, res, 7); 2676 res = (res >> 7) + round; 2677 2678 if (res > INT8_MAX) { 2679 env->vxsat = 0x1; 2680 return INT8_MAX; 2681 } else if (res < INT8_MIN) { 2682 env->vxsat = 0x1; 2683 return INT8_MIN; 2684 } else { 2685 return res; 2686 } 2687 } 2688 2689 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b) 2690 { 2691 uint8_t round; 2692 int32_t res; 2693 2694 res = (int32_t)a * (int32_t)b; 2695 round = get_round(vxrm, res, 15); 2696 res = (res >> 15) + round; 2697 2698 if (res > INT16_MAX) { 2699 env->vxsat = 0x1; 2700 return INT16_MAX; 2701 } else if (res < INT16_MIN) { 2702 env->vxsat = 0x1; 2703 return INT16_MIN; 2704 } else { 2705 return res; 2706 } 2707 } 2708 2709 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b) 2710 { 2711 uint8_t round; 2712 int64_t res; 2713 2714 res = (int64_t)a * (int64_t)b; 2715 round = get_round(vxrm, res, 31); 2716 res = (res >> 31) + round; 2717 2718 if (res > INT32_MAX) { 2719 env->vxsat = 0x1; 2720 return INT32_MAX; 2721 } else if (res < INT32_MIN) { 2722 env->vxsat = 0x1; 
2723 return INT32_MIN; 2724 } else { 2725 return res; 2726 } 2727 } 2728 2729 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b) 2730 { 2731 uint8_t round; 2732 uint64_t hi_64, lo_64; 2733 int64_t res; 2734 2735 if (a == INT64_MIN && b == INT64_MIN) { 2736 env->vxsat = 1; 2737 return INT64_MAX; 2738 } 2739 2740 muls64(&lo_64, &hi_64, a, b); 2741 round = get_round(vxrm, lo_64, 63); 2742 /* 2743 * Cannot overflow, as there are always 2744 * 2 sign bits after multiply. 2745 */ 2746 res = (hi_64 << 1) | (lo_64 >> 63); 2747 if (round) { 2748 if (res == INT64_MAX) { 2749 env->vxsat = 1; 2750 } else { 2751 res += 1; 2752 } 2753 } 2754 return res; 2755 } 2756 2757 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8) 2758 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16) 2759 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32) 2760 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64) 2761 GEN_VEXT_VV_RM(vsmul_vv_b, 1) 2762 GEN_VEXT_VV_RM(vsmul_vv_h, 2) 2763 GEN_VEXT_VV_RM(vsmul_vv_w, 4) 2764 GEN_VEXT_VV_RM(vsmul_vv_d, 8) 2765 2766 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8) 2767 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16) 2768 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32) 2769 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64) 2770 GEN_VEXT_VX_RM(vsmul_vx_b, 1) 2771 GEN_VEXT_VX_RM(vsmul_vx_h, 2) 2772 GEN_VEXT_VX_RM(vsmul_vx_w, 4) 2773 GEN_VEXT_VX_RM(vsmul_vx_d, 8) 2774 2775 /* Vector Single-Width Scaling Shift Instructions */ 2776 static inline uint8_t 2777 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b) 2778 { 2779 uint8_t round, shift = b & 0x7; 2780 uint8_t res; 2781 2782 round = get_round(vxrm, a, shift); 2783 res = (a >> shift) + round; 2784 return res; 2785 } 2786 static inline uint16_t 2787 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b) 2788 { 2789 uint8_t round, shift = b & 0xf; 2790 2791 round = get_round(vxrm, a, shift); 2792 return (a >> shift) + round; 2793 } 2794 static inline uint32_t 2795 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b) 2796 { 2797 uint8_t round, shift = b & 0x1f; 2798 2799 round = get_round(vxrm, a, shift); 2800 return (a >> shift) + round; 2801 } 2802 static inline uint64_t 2803 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b) 2804 { 2805 uint8_t round, shift = b & 0x3f; 2806 2807 round = get_round(vxrm, a, shift); 2808 return (a >> shift) + round; 2809 } 2810 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8) 2811 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16) 2812 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32) 2813 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64) 2814 GEN_VEXT_VV_RM(vssrl_vv_b, 1) 2815 GEN_VEXT_VV_RM(vssrl_vv_h, 2) 2816 GEN_VEXT_VV_RM(vssrl_vv_w, 4) 2817 GEN_VEXT_VV_RM(vssrl_vv_d, 8) 2818 2819 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8) 2820 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16) 2821 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32) 2822 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64) 2823 GEN_VEXT_VX_RM(vssrl_vx_b, 1) 2824 GEN_VEXT_VX_RM(vssrl_vx_h, 2) 2825 GEN_VEXT_VX_RM(vssrl_vx_w, 4) 2826 GEN_VEXT_VX_RM(vssrl_vx_d, 8) 2827 2828 static inline int8_t 2829 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b) 2830 { 2831 uint8_t round, shift = b & 0x7; 2832 2833 round = get_round(vxrm, a, shift); 2834 return (a >> shift) + round; 2835 } 2836 static inline int16_t 2837 
vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b) 2838 { 2839 uint8_t round, shift = b & 0xf; 2840 2841 round = get_round(vxrm, a, shift); 2842 return (a >> shift) + round; 2843 } 2844 static inline int32_t 2845 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b) 2846 { 2847 uint8_t round, shift = b & 0x1f; 2848 2849 round = get_round(vxrm, a, shift); 2850 return (a >> shift) + round; 2851 } 2852 static inline int64_t 2853 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b) 2854 { 2855 uint8_t round, shift = b & 0x3f; 2856 2857 round = get_round(vxrm, a, shift); 2858 return (a >> shift) + round; 2859 } 2860 2861 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8) 2862 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16) 2863 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32) 2864 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64) 2865 GEN_VEXT_VV_RM(vssra_vv_b, 1) 2866 GEN_VEXT_VV_RM(vssra_vv_h, 2) 2867 GEN_VEXT_VV_RM(vssra_vv_w, 4) 2868 GEN_VEXT_VV_RM(vssra_vv_d, 8) 2869 2870 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8) 2871 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16) 2872 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32) 2873 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64) 2874 GEN_VEXT_VX_RM(vssra_vx_b, 1) 2875 GEN_VEXT_VX_RM(vssra_vx_h, 2) 2876 GEN_VEXT_VX_RM(vssra_vx_w, 4) 2877 GEN_VEXT_VX_RM(vssra_vx_d, 8) 2878 2879 /* Vector Narrowing Fixed-Point Clip Instructions */ 2880 static inline int8_t 2881 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b) 2882 { 2883 uint8_t round, shift = b & 0xf; 2884 int16_t res; 2885 2886 round = get_round(vxrm, a, shift); 2887 res = (a >> shift) + round; 2888 if (res > INT8_MAX) { 2889 env->vxsat = 0x1; 2890 return INT8_MAX; 2891 } else if (res < INT8_MIN) { 2892 env->vxsat = 0x1; 2893 return INT8_MIN; 2894 } else { 2895 return res; 2896 } 2897 } 2898 2899 static inline int16_t 2900 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b) 2901 { 2902 uint8_t round, shift = b & 0x1f; 2903 int32_t res; 2904 2905 round = get_round(vxrm, a, shift); 2906 res = (a >> shift) + round; 2907 if (res > INT16_MAX) { 2908 env->vxsat = 0x1; 2909 return INT16_MAX; 2910 } else if (res < INT16_MIN) { 2911 env->vxsat = 0x1; 2912 return INT16_MIN; 2913 } else { 2914 return res; 2915 } 2916 } 2917 2918 static inline int32_t 2919 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b) 2920 { 2921 uint8_t round, shift = b & 0x3f; 2922 int64_t res; 2923 2924 round = get_round(vxrm, a, shift); 2925 res = (a >> shift) + round; 2926 if (res > INT32_MAX) { 2927 env->vxsat = 0x1; 2928 return INT32_MAX; 2929 } else if (res < INT32_MIN) { 2930 env->vxsat = 0x1; 2931 return INT32_MIN; 2932 } else { 2933 return res; 2934 } 2935 } 2936 2937 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8) 2938 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16) 2939 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32) 2940 GEN_VEXT_VV_RM(vnclip_wv_b, 1) 2941 GEN_VEXT_VV_RM(vnclip_wv_h, 2) 2942 GEN_VEXT_VV_RM(vnclip_wv_w, 4) 2943 2944 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8) 2945 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16) 2946 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32) 2947 GEN_VEXT_VX_RM(vnclip_wx_b, 1) 2948 GEN_VEXT_VX_RM(vnclip_wx_h, 2) 2949 GEN_VEXT_VX_RM(vnclip_wx_w, 4) 2950 2951 static inline uint8_t 2952 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b) 2953 { 2954 
uint8_t round, shift = b & 0xf; 2955 uint16_t res; 2956 2957 round = get_round(vxrm, a, shift); 2958 res = (a >> shift) + round; 2959 if (res > UINT8_MAX) { 2960 env->vxsat = 0x1; 2961 return UINT8_MAX; 2962 } else { 2963 return res; 2964 } 2965 } 2966 2967 static inline uint16_t 2968 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b) 2969 { 2970 uint8_t round, shift = b & 0x1f; 2971 uint32_t res; 2972 2973 round = get_round(vxrm, a, shift); 2974 res = (a >> shift) + round; 2975 if (res > UINT16_MAX) { 2976 env->vxsat = 0x1; 2977 return UINT16_MAX; 2978 } else { 2979 return res; 2980 } 2981 } 2982 2983 static inline uint32_t 2984 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b) 2985 { 2986 uint8_t round, shift = b & 0x3f; 2987 uint64_t res; 2988 2989 round = get_round(vxrm, a, shift); 2990 res = (a >> shift) + round; 2991 if (res > UINT32_MAX) { 2992 env->vxsat = 0x1; 2993 return UINT32_MAX; 2994 } else { 2995 return res; 2996 } 2997 } 2998 2999 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8) 3000 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16) 3001 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32) 3002 GEN_VEXT_VV_RM(vnclipu_wv_b, 1) 3003 GEN_VEXT_VV_RM(vnclipu_wv_h, 2) 3004 GEN_VEXT_VV_RM(vnclipu_wv_w, 4) 3005 3006 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8) 3007 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16) 3008 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32) 3009 GEN_VEXT_VX_RM(vnclipu_wx_b, 1) 3010 GEN_VEXT_VX_RM(vnclipu_wx_h, 2) 3011 GEN_VEXT_VX_RM(vnclipu_wx_w, 4) 3012 3013 /* 3014 * Vector Float Point Arithmetic Instructions 3015 */ 3016 /* Vector Single-Width Floating-Point Add/Subtract Instructions */ 3017 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ 3018 static void do_##NAME(void *vd, void *vs1, void *vs2, int i, \ 3019 CPURISCVState *env) \ 3020 { \ 3021 TX1 s1 = *((T1 *)vs1 + HS1(i)); \ 3022 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 3023 *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status); \ 3024 } 3025 3026 #define GEN_VEXT_VV_ENV(NAME, ESZ) \ 3027 void HELPER(NAME)(void *vd, void *v0, void *vs1, \ 3028 void *vs2, CPURISCVState *env, \ 3029 uint32_t desc) \ 3030 { \ 3031 uint32_t vm = vext_vm(desc); \ 3032 uint32_t vl = env->vl; \ 3033 uint32_t total_elems = \ 3034 vext_get_total_elems(env, desc, ESZ); \ 3035 uint32_t vta = vext_vta(desc); \ 3036 uint32_t vma = vext_vma(desc); \ 3037 uint32_t i; \ 3038 \ 3039 for (i = env->vstart; i < vl; i++) { \ 3040 if (!vm && !vext_elem_mask(v0, i)) { \ 3041 /* set masked-off elements to 1s */ \ 3042 vext_set_elems_1s(vd, vma, i * ESZ, \ 3043 (i + 1) * ESZ); \ 3044 continue; \ 3045 } \ 3046 do_##NAME(vd, vs1, vs2, i, env); \ 3047 } \ 3048 env->vstart = 0; \ 3049 /* set tail elements to 1s */ \ 3050 vext_set_elems_1s(vd, vta, vl * ESZ, \ 3051 total_elems * ESZ); \ 3052 } 3053 3054 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add) 3055 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add) 3056 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add) 3057 GEN_VEXT_VV_ENV(vfadd_vv_h, 2) 3058 GEN_VEXT_VV_ENV(vfadd_vv_w, 4) 3059 GEN_VEXT_VV_ENV(vfadd_vv_d, 8) 3060 3061 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ 3062 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ 3063 CPURISCVState *env) \ 3064 { \ 3065 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 3066 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\ 3067 } 3068 3069 #define GEN_VEXT_VF(NAME, 
ESZ) \ 3070 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, \ 3071 void *vs2, CPURISCVState *env, \ 3072 uint32_t desc) \ 3073 { \ 3074 uint32_t vm = vext_vm(desc); \ 3075 uint32_t vl = env->vl; \ 3076 uint32_t total_elems = \ 3077 vext_get_total_elems(env, desc, ESZ); \ 3078 uint32_t vta = vext_vta(desc); \ 3079 uint32_t vma = vext_vma(desc); \ 3080 uint32_t i; \ 3081 \ 3082 for (i = env->vstart; i < vl; i++) { \ 3083 if (!vm && !vext_elem_mask(v0, i)) { \ 3084 /* set masked-off elements to 1s */ \ 3085 vext_set_elems_1s(vd, vma, i * ESZ, \ 3086 (i + 1) * ESZ); \ 3087 continue; \ 3088 } \ 3089 do_##NAME(vd, s1, vs2, i, env); \ 3090 } \ 3091 env->vstart = 0; \ 3092 /* set tail elements to 1s */ \ 3093 vext_set_elems_1s(vd, vta, vl * ESZ, \ 3094 total_elems * ESZ); \ 3095 } 3096 3097 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add) 3098 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add) 3099 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add) 3100 GEN_VEXT_VF(vfadd_vf_h, 2) 3101 GEN_VEXT_VF(vfadd_vf_w, 4) 3102 GEN_VEXT_VF(vfadd_vf_d, 8) 3103 3104 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub) 3105 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub) 3106 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub) 3107 GEN_VEXT_VV_ENV(vfsub_vv_h, 2) 3108 GEN_VEXT_VV_ENV(vfsub_vv_w, 4) 3109 GEN_VEXT_VV_ENV(vfsub_vv_d, 8) 3110 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub) 3111 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub) 3112 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub) 3113 GEN_VEXT_VF(vfsub_vf_h, 2) 3114 GEN_VEXT_VF(vfsub_vf_w, 4) 3115 GEN_VEXT_VF(vfsub_vf_d, 8) 3116 3117 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s) 3118 { 3119 return float16_sub(b, a, s); 3120 } 3121 3122 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s) 3123 { 3124 return float32_sub(b, a, s); 3125 } 3126 3127 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s) 3128 { 3129 return float64_sub(b, a, s); 3130 } 3131 3132 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub) 3133 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub) 3134 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub) 3135 GEN_VEXT_VF(vfrsub_vf_h, 2) 3136 GEN_VEXT_VF(vfrsub_vf_w, 4) 3137 GEN_VEXT_VF(vfrsub_vf_d, 8) 3138 3139 /* Vector Widening Floating-Point Add/Subtract Instructions */ 3140 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s) 3141 { 3142 return float32_add(float16_to_float32(a, true, s), 3143 float16_to_float32(b, true, s), s); 3144 } 3145 3146 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s) 3147 { 3148 return float64_add(float32_to_float64(a, s), 3149 float32_to_float64(b, s), s); 3150 3151 } 3152 3153 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16) 3154 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32) 3155 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4) 3156 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8) 3157 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16) 3158 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32) 3159 GEN_VEXT_VF(vfwadd_vf_h, 4) 3160 GEN_VEXT_VF(vfwadd_vf_w, 8) 3161 3162 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s) 3163 { 3164 return float32_sub(float16_to_float32(a, true, s), 3165 float16_to_float32(b, true, s), s); 3166 } 3167 3168 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s) 3169 { 3170 return float64_sub(float32_to_float64(a, s), 3171 
float32_to_float64(b, s), s); 3172 3173 } 3174 3175 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16) 3176 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32) 3177 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4) 3178 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8) 3179 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16) 3180 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32) 3181 GEN_VEXT_VF(vfwsub_vf_h, 4) 3182 GEN_VEXT_VF(vfwsub_vf_w, 8) 3183 3184 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s) 3185 { 3186 return float32_add(a, float16_to_float32(b, true, s), s); 3187 } 3188 3189 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s) 3190 { 3191 return float64_add(a, float32_to_float64(b, s), s); 3192 } 3193 3194 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16) 3195 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32) 3196 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4) 3197 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8) 3198 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16) 3199 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32) 3200 GEN_VEXT_VF(vfwadd_wf_h, 4) 3201 GEN_VEXT_VF(vfwadd_wf_w, 8) 3202 3203 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s) 3204 { 3205 return float32_sub(a, float16_to_float32(b, true, s), s); 3206 } 3207 3208 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s) 3209 { 3210 return float64_sub(a, float32_to_float64(b, s), s); 3211 } 3212 3213 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16) 3214 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32) 3215 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4) 3216 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8) 3217 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16) 3218 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32) 3219 GEN_VEXT_VF(vfwsub_wf_h, 4) 3220 GEN_VEXT_VF(vfwsub_wf_w, 8) 3221 3222 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */ 3223 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul) 3224 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul) 3225 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul) 3226 GEN_VEXT_VV_ENV(vfmul_vv_h, 2) 3227 GEN_VEXT_VV_ENV(vfmul_vv_w, 4) 3228 GEN_VEXT_VV_ENV(vfmul_vv_d, 8) 3229 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul) 3230 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul) 3231 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul) 3232 GEN_VEXT_VF(vfmul_vf_h, 2) 3233 GEN_VEXT_VF(vfmul_vf_w, 4) 3234 GEN_VEXT_VF(vfmul_vf_d, 8) 3235 3236 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div) 3237 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div) 3238 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div) 3239 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2) 3240 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4) 3241 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8) 3242 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div) 3243 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div) 3244 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div) 3245 GEN_VEXT_VF(vfdiv_vf_h, 2) 3246 GEN_VEXT_VF(vfdiv_vf_w, 4) 3247 GEN_VEXT_VF(vfdiv_vf_d, 8) 3248 3249 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s) 3250 { 3251 return float16_div(b, a, s); 3252 } 3253 3254 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s) 3255 { 3256 return float32_div(b, a, s); 3257 } 3258 3259 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s) 3260 { 3261 return float64_div(b, a, s); 3262 } 3263 
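
/*
 * Illustrative sketch (the example helper below is hypothetical and is not
 * used by any helper in this file): OPFVF2 expands to
 * OP(s2, (TX1)(T1)s1, &env->fp_status), i.e. the vector element is passed
 * first and the scalar rs1 value second.  The plain wrappers therefore
 * compute "element op scalar" (vfdiv.vf: vs2[i] / f[rs1]), while the
 * float*_rsub/float*_rdiv wrappers above swap their arguments so the same
 * expansion yields "scalar op element" (vfrsub.vf: f[rs1] - vs2[i],
 * vfrdiv.vf: f[rs1] / vs2[i]).  The f32 divide case spelled out:
 */
static inline uint32_t vfrdiv32_example(uint32_t vs2_elem, uint32_t rs1_val,
                                        float_status *s)
{
    /* same result as float32_rdiv(vs2_elem, rs1_val, s) defined above */
    return float32_div(rs1_val, vs2_elem, s);
}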
3264 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv) 3265 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv) 3266 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv) 3267 GEN_VEXT_VF(vfrdiv_vf_h, 2) 3268 GEN_VEXT_VF(vfrdiv_vf_w, 4) 3269 GEN_VEXT_VF(vfrdiv_vf_d, 8) 3270 3271 /* Vector Widening Floating-Point Multiply */ 3272 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s) 3273 { 3274 return float32_mul(float16_to_float32(a, true, s), 3275 float16_to_float32(b, true, s), s); 3276 } 3277 3278 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s) 3279 { 3280 return float64_mul(float32_to_float64(a, s), 3281 float32_to_float64(b, s), s); 3282 3283 } 3284 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16) 3285 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32) 3286 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4) 3287 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8) 3288 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16) 3289 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32) 3290 GEN_VEXT_VF(vfwmul_vf_h, 4) 3291 GEN_VEXT_VF(vfwmul_vf_w, 8) 3292 3293 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */ 3294 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \ 3295 static void do_##NAME(void *vd, void *vs1, void *vs2, int i, \ 3296 CPURISCVState *env) \ 3297 { \ 3298 TX1 s1 = *((T1 *)vs1 + HS1(i)); \ 3299 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 3300 TD d = *((TD *)vd + HD(i)); \ 3301 *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status); \ 3302 } 3303 3304 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3305 { 3306 return float16_muladd(a, b, d, 0, s); 3307 } 3308 3309 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3310 { 3311 return float32_muladd(a, b, d, 0, s); 3312 } 3313 3314 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3315 { 3316 return float64_muladd(a, b, d, 0, s); 3317 } 3318 3319 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16) 3320 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32) 3321 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64) 3322 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2) 3323 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4) 3324 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8) 3325 3326 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \ 3327 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \ 3328 CPURISCVState *env) \ 3329 { \ 3330 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 3331 TD d = *((TD *)vd + HD(i)); \ 3332 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\ 3333 } 3334 3335 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16) 3336 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32) 3337 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64) 3338 GEN_VEXT_VF(vfmacc_vf_h, 2) 3339 GEN_VEXT_VF(vfmacc_vf_w, 4) 3340 GEN_VEXT_VF(vfmacc_vf_d, 8) 3341 3342 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3343 { 3344 return float16_muladd(a, b, d, float_muladd_negate_c | 3345 float_muladd_negate_product, s); 3346 } 3347 3348 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3349 { 3350 return float32_muladd(a, b, d, float_muladd_negate_c | 3351 float_muladd_negate_product, s); 3352 } 3353 3354 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3355 { 3356 return float64_muladd(a, b, d, float_muladd_negate_c | 3357 float_muladd_negate_product, s); 3358 } 3359 3360 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, 
H2, H2, H2, fnmacc16) 3361 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32) 3362 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64) 3363 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2) 3364 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4) 3365 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8) 3366 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16) 3367 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32) 3368 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64) 3369 GEN_VEXT_VF(vfnmacc_vf_h, 2) 3370 GEN_VEXT_VF(vfnmacc_vf_w, 4) 3371 GEN_VEXT_VF(vfnmacc_vf_d, 8) 3372 3373 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3374 { 3375 return float16_muladd(a, b, d, float_muladd_negate_c, s); 3376 } 3377 3378 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3379 { 3380 return float32_muladd(a, b, d, float_muladd_negate_c, s); 3381 } 3382 3383 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3384 { 3385 return float64_muladd(a, b, d, float_muladd_negate_c, s); 3386 } 3387 3388 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16) 3389 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32) 3390 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64) 3391 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2) 3392 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4) 3393 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8) 3394 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16) 3395 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32) 3396 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64) 3397 GEN_VEXT_VF(vfmsac_vf_h, 2) 3398 GEN_VEXT_VF(vfmsac_vf_w, 4) 3399 GEN_VEXT_VF(vfmsac_vf_d, 8) 3400 3401 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3402 { 3403 return float16_muladd(a, b, d, float_muladd_negate_product, s); 3404 } 3405 3406 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3407 { 3408 return float32_muladd(a, b, d, float_muladd_negate_product, s); 3409 } 3410 3411 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3412 { 3413 return float64_muladd(a, b, d, float_muladd_negate_product, s); 3414 } 3415 3416 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16) 3417 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32) 3418 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64) 3419 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2) 3420 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4) 3421 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8) 3422 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16) 3423 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32) 3424 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64) 3425 GEN_VEXT_VF(vfnmsac_vf_h, 2) 3426 GEN_VEXT_VF(vfnmsac_vf_w, 4) 3427 GEN_VEXT_VF(vfnmsac_vf_d, 8) 3428 3429 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3430 { 3431 return float16_muladd(d, b, a, 0, s); 3432 } 3433 3434 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3435 { 3436 return float32_muladd(d, b, a, 0, s); 3437 } 3438 3439 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3440 { 3441 return float64_muladd(d, b, a, 0, s); 3442 } 3443 3444 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16) 3445 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32) 3446 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64) 3447 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2) 3448 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4) 3449 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8) 3450 RVVCALL(OPFVF3, vfmadd_vf_h, 
OP_UUU_H, H2, H2, fmadd16) 3451 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32) 3452 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64) 3453 GEN_VEXT_VF(vfmadd_vf_h, 2) 3454 GEN_VEXT_VF(vfmadd_vf_w, 4) 3455 GEN_VEXT_VF(vfmadd_vf_d, 8) 3456 3457 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3458 { 3459 return float16_muladd(d, b, a, float_muladd_negate_c | 3460 float_muladd_negate_product, s); 3461 } 3462 3463 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3464 { 3465 return float32_muladd(d, b, a, float_muladd_negate_c | 3466 float_muladd_negate_product, s); 3467 } 3468 3469 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3470 { 3471 return float64_muladd(d, b, a, float_muladd_negate_c | 3472 float_muladd_negate_product, s); 3473 } 3474 3475 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16) 3476 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32) 3477 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64) 3478 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2) 3479 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4) 3480 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8) 3481 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16) 3482 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32) 3483 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64) 3484 GEN_VEXT_VF(vfnmadd_vf_h, 2) 3485 GEN_VEXT_VF(vfnmadd_vf_w, 4) 3486 GEN_VEXT_VF(vfnmadd_vf_d, 8) 3487 3488 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3489 { 3490 return float16_muladd(d, b, a, float_muladd_negate_c, s); 3491 } 3492 3493 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3494 { 3495 return float32_muladd(d, b, a, float_muladd_negate_c, s); 3496 } 3497 3498 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3499 { 3500 return float64_muladd(d, b, a, float_muladd_negate_c, s); 3501 } 3502 3503 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16) 3504 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32) 3505 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64) 3506 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2) 3507 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4) 3508 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8) 3509 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16) 3510 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32) 3511 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64) 3512 GEN_VEXT_VF(vfmsub_vf_h, 2) 3513 GEN_VEXT_VF(vfmsub_vf_w, 4) 3514 GEN_VEXT_VF(vfmsub_vf_d, 8) 3515 3516 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s) 3517 { 3518 return float16_muladd(d, b, a, float_muladd_negate_product, s); 3519 } 3520 3521 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s) 3522 { 3523 return float32_muladd(d, b, a, float_muladd_negate_product, s); 3524 } 3525 3526 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s) 3527 { 3528 return float64_muladd(d, b, a, float_muladd_negate_product, s); 3529 } 3530 3531 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16) 3532 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32) 3533 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64) 3534 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2) 3535 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4) 3536 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8) 3537 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16) 3538 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32) 3539 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, 
H8, fnmsub64) 3540 GEN_VEXT_VF(vfnmsub_vf_h, 2) 3541 GEN_VEXT_VF(vfnmsub_vf_w, 4) 3542 GEN_VEXT_VF(vfnmsub_vf_d, 8) 3543 3544 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */ 3545 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s) 3546 { 3547 return float32_muladd(float16_to_float32(a, true, s), 3548 float16_to_float32(b, true, s), d, 0, s); 3549 } 3550 3551 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s) 3552 { 3553 return float64_muladd(float32_to_float64(a, s), 3554 float32_to_float64(b, s), d, 0, s); 3555 } 3556 3557 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16) 3558 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32) 3559 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4) 3560 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8) 3561 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16) 3562 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32) 3563 GEN_VEXT_VF(vfwmacc_vf_h, 4) 3564 GEN_VEXT_VF(vfwmacc_vf_w, 8) 3565 3566 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s) 3567 { 3568 return float32_muladd(float16_to_float32(a, true, s), 3569 float16_to_float32(b, true, s), d, 3570 float_muladd_negate_c | float_muladd_negate_product, 3571 s); 3572 } 3573 3574 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s) 3575 { 3576 return float64_muladd(float32_to_float64(a, s), float32_to_float64(b, s), 3577 d, float_muladd_negate_c | 3578 float_muladd_negate_product, s); 3579 } 3580 3581 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16) 3582 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32) 3583 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4) 3584 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8) 3585 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16) 3586 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32) 3587 GEN_VEXT_VF(vfwnmacc_vf_h, 4) 3588 GEN_VEXT_VF(vfwnmacc_vf_w, 8) 3589 3590 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s) 3591 { 3592 return float32_muladd(float16_to_float32(a, true, s), 3593 float16_to_float32(b, true, s), d, 3594 float_muladd_negate_c, s); 3595 } 3596 3597 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s) 3598 { 3599 return float64_muladd(float32_to_float64(a, s), 3600 float32_to_float64(b, s), d, 3601 float_muladd_negate_c, s); 3602 } 3603 3604 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16) 3605 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32) 3606 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4) 3607 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8) 3608 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16) 3609 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32) 3610 GEN_VEXT_VF(vfwmsac_vf_h, 4) 3611 GEN_VEXT_VF(vfwmsac_vf_w, 8) 3612 3613 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s) 3614 { 3615 return float32_muladd(float16_to_float32(a, true, s), 3616 float16_to_float32(b, true, s), d, 3617 float_muladd_negate_product, s); 3618 } 3619 3620 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s) 3621 { 3622 return float64_muladd(float32_to_float64(a, s), 3623 float32_to_float64(b, s), d, 3624 float_muladd_negate_product, s); 3625 } 3626 3627 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16) 3628 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32) 3629 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4) 3630 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8) 3631 RVVCALL(OPFVF3, vfwnmsac_vf_h, 
WOP_UUU_H, H4, H2, fwnmsac16) 3632 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32) 3633 GEN_VEXT_VF(vfwnmsac_vf_h, 4) 3634 GEN_VEXT_VF(vfwnmsac_vf_w, 8) 3635 3636 /* Vector Floating-Point Square-Root Instruction */ 3637 /* (TD, T2, TX2) */ 3638 #define OP_UU_H uint16_t, uint16_t, uint16_t 3639 #define OP_UU_W uint32_t, uint32_t, uint32_t 3640 #define OP_UU_D uint64_t, uint64_t, uint64_t 3641 3642 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP) \ 3643 static void do_##NAME(void *vd, void *vs2, int i, \ 3644 CPURISCVState *env) \ 3645 { \ 3646 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 3647 *((TD *)vd + HD(i)) = OP(s2, &env->fp_status); \ 3648 } 3649 3650 #define GEN_VEXT_V_ENV(NAME, ESZ) \ 3651 void HELPER(NAME)(void *vd, void *v0, void *vs2, \ 3652 CPURISCVState *env, uint32_t desc) \ 3653 { \ 3654 uint32_t vm = vext_vm(desc); \ 3655 uint32_t vl = env->vl; \ 3656 uint32_t total_elems = \ 3657 vext_get_total_elems(env, desc, ESZ); \ 3658 uint32_t vta = vext_vta(desc); \ 3659 uint32_t vma = vext_vma(desc); \ 3660 uint32_t i; \ 3661 \ 3662 if (vl == 0) { \ 3663 return; \ 3664 } \ 3665 for (i = env->vstart; i < vl; i++) { \ 3666 if (!vm && !vext_elem_mask(v0, i)) { \ 3667 /* set masked-off elements to 1s */ \ 3668 vext_set_elems_1s(vd, vma, i * ESZ, \ 3669 (i + 1) * ESZ); \ 3670 continue; \ 3671 } \ 3672 do_##NAME(vd, vs2, i, env); \ 3673 } \ 3674 env->vstart = 0; \ 3675 vext_set_elems_1s(vd, vta, vl * ESZ, \ 3676 total_elems * ESZ); \ 3677 } 3678 3679 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt) 3680 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt) 3681 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt) 3682 GEN_VEXT_V_ENV(vfsqrt_v_h, 2) 3683 GEN_VEXT_V_ENV(vfsqrt_v_w, 4) 3684 GEN_VEXT_V_ENV(vfsqrt_v_d, 8) 3685 3686 /* 3687 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction 3688 * 3689 * Adapted from riscv-v-spec recip.c: 3690 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c 3691 */ 3692 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size) 3693 { 3694 uint64_t sign = extract64(f, frac_size + exp_size, 1); 3695 uint64_t exp = extract64(f, frac_size, exp_size); 3696 uint64_t frac = extract64(f, 0, frac_size); 3697 3698 const uint8_t lookup_table[] = { 3699 52, 51, 50, 48, 47, 46, 44, 43, 3700 42, 41, 40, 39, 38, 36, 35, 34, 3701 33, 32, 31, 30, 30, 29, 28, 27, 3702 26, 25, 24, 23, 23, 22, 21, 20, 3703 19, 19, 18, 17, 16, 16, 15, 14, 3704 14, 13, 12, 12, 11, 10, 10, 9, 3705 9, 8, 7, 7, 6, 6, 5, 4, 3706 4, 3, 3, 2, 2, 1, 1, 0, 3707 127, 125, 123, 121, 119, 118, 116, 114, 3708 113, 111, 109, 108, 106, 105, 103, 102, 3709 100, 99, 97, 96, 95, 93, 92, 91, 3710 90, 88, 87, 86, 85, 84, 83, 82, 3711 80, 79, 78, 77, 76, 75, 74, 73, 3712 72, 71, 70, 70, 69, 68, 67, 66, 3713 65, 64, 63, 63, 62, 61, 60, 59, 3714 59, 58, 57, 56, 56, 55, 54, 53 3715 }; 3716 const int precision = 7; 3717 3718 if (exp == 0 && frac != 0) { /* subnormal */ 3719 /* Normalize the subnormal. 
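 * Shift the fraction left until its most-significant stored bit is set,
 * decrementing the biased exponent once per shift; the extra shift that
 * follows moves that bit into the hidden-bit position and masks it off,
 * so 'frac' ends up as a normalized fraction with the adjustment kept
 * in 'exp'.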
*/ 3720 while (extract64(frac, frac_size - 1, 1) == 0) { 3721 exp--; 3722 frac <<= 1; 3723 } 3724 3725 frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size); 3726 } 3727 3728 int idx = ((exp & 1) << (precision - 1)) | 3729 (frac >> (frac_size - precision + 1)); 3730 uint64_t out_frac = (uint64_t)(lookup_table[idx]) << 3731 (frac_size - precision); 3732 uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2; 3733 3734 uint64_t val = 0; 3735 val = deposit64(val, 0, frac_size, out_frac); 3736 val = deposit64(val, frac_size, exp_size, out_exp); 3737 val = deposit64(val, frac_size + exp_size, 1, sign); 3738 return val; 3739 } 3740 3741 static float16 frsqrt7_h(float16 f, float_status *s) 3742 { 3743 int exp_size = 5, frac_size = 10; 3744 bool sign = float16_is_neg(f); 3745 3746 /* 3747 * frsqrt7(sNaN) = canonical NaN 3748 * frsqrt7(-inf) = canonical NaN 3749 * frsqrt7(-normal) = canonical NaN 3750 * frsqrt7(-subnormal) = canonical NaN 3751 */ 3752 if (float16_is_signaling_nan(f, s) || 3753 (float16_is_infinity(f) && sign) || 3754 (float16_is_normal(f) && sign) || 3755 (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) { 3756 s->float_exception_flags |= float_flag_invalid; 3757 return float16_default_nan(s); 3758 } 3759 3760 /* frsqrt7(qNaN) = canonical NaN */ 3761 if (float16_is_quiet_nan(f, s)) { 3762 return float16_default_nan(s); 3763 } 3764 3765 /* frsqrt7(+-0) = +-inf */ 3766 if (float16_is_zero(f)) { 3767 s->float_exception_flags |= float_flag_divbyzero; 3768 return float16_set_sign(float16_infinity, sign); 3769 } 3770 3771 /* frsqrt7(+inf) = +0 */ 3772 if (float16_is_infinity(f) && !sign) { 3773 return float16_set_sign(float16_zero, sign); 3774 } 3775 3776 /* +normal, +subnormal */ 3777 uint64_t val = frsqrt7(f, exp_size, frac_size); 3778 return make_float16(val); 3779 } 3780 3781 static float32 frsqrt7_s(float32 f, float_status *s) 3782 { 3783 int exp_size = 8, frac_size = 23; 3784 bool sign = float32_is_neg(f); 3785 3786 /* 3787 * frsqrt7(sNaN) = canonical NaN 3788 * frsqrt7(-inf) = canonical NaN 3789 * frsqrt7(-normal) = canonical NaN 3790 * frsqrt7(-subnormal) = canonical NaN 3791 */ 3792 if (float32_is_signaling_nan(f, s) || 3793 (float32_is_infinity(f) && sign) || 3794 (float32_is_normal(f) && sign) || 3795 (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) { 3796 s->float_exception_flags |= float_flag_invalid; 3797 return float32_default_nan(s); 3798 } 3799 3800 /* frsqrt7(qNaN) = canonical NaN */ 3801 if (float32_is_quiet_nan(f, s)) { 3802 return float32_default_nan(s); 3803 } 3804 3805 /* frsqrt7(+-0) = +-inf */ 3806 if (float32_is_zero(f)) { 3807 s->float_exception_flags |= float_flag_divbyzero; 3808 return float32_set_sign(float32_infinity, sign); 3809 } 3810 3811 /* frsqrt7(+inf) = +0 */ 3812 if (float32_is_infinity(f) && !sign) { 3813 return float32_set_sign(float32_zero, sign); 3814 } 3815 3816 /* +normal, +subnormal */ 3817 uint64_t val = frsqrt7(f, exp_size, frac_size); 3818 return make_float32(val); 3819 } 3820 3821 static float64 frsqrt7_d(float64 f, float_status *s) 3822 { 3823 int exp_size = 11, frac_size = 52; 3824 bool sign = float64_is_neg(f); 3825 3826 /* 3827 * frsqrt7(sNaN) = canonical NaN 3828 * frsqrt7(-inf) = canonical NaN 3829 * frsqrt7(-normal) = canonical NaN 3830 * frsqrt7(-subnormal) = canonical NaN 3831 */ 3832 if (float64_is_signaling_nan(f, s) || 3833 (float64_is_infinity(f) && sign) || 3834 (float64_is_normal(f) && sign) || 3835 (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) { 3836 
s->float_exception_flags |= float_flag_invalid; 3837 return float64_default_nan(s); 3838 } 3839 3840 /* frsqrt7(qNaN) = canonical NaN */ 3841 if (float64_is_quiet_nan(f, s)) { 3842 return float64_default_nan(s); 3843 } 3844 3845 /* frsqrt7(+-0) = +-inf */ 3846 if (float64_is_zero(f)) { 3847 s->float_exception_flags |= float_flag_divbyzero; 3848 return float64_set_sign(float64_infinity, sign); 3849 } 3850 3851 /* frsqrt7(+inf) = +0 */ 3852 if (float64_is_infinity(f) && !sign) { 3853 return float64_set_sign(float64_zero, sign); 3854 } 3855 3856 /* +normal, +subnormal */ 3857 uint64_t val = frsqrt7(f, exp_size, frac_size); 3858 return make_float64(val); 3859 } 3860 3861 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h) 3862 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s) 3863 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d) 3864 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2) 3865 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4) 3866 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8) 3867 3868 /* 3869 * Vector Floating-Point Reciprocal Estimate Instruction 3870 * 3871 * Adapted from riscv-v-spec recip.c: 3872 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c 3873 */ 3874 static uint64_t frec7(uint64_t f, int exp_size, int frac_size, 3875 float_status *s) 3876 { 3877 uint64_t sign = extract64(f, frac_size + exp_size, 1); 3878 uint64_t exp = extract64(f, frac_size, exp_size); 3879 uint64_t frac = extract64(f, 0, frac_size); 3880 3881 const uint8_t lookup_table[] = { 3882 127, 125, 123, 121, 119, 117, 116, 114, 3883 112, 110, 109, 107, 105, 104, 102, 100, 3884 99, 97, 96, 94, 93, 91, 90, 88, 3885 87, 85, 84, 83, 81, 80, 79, 77, 3886 76, 75, 74, 72, 71, 70, 69, 68, 3887 66, 65, 64, 63, 62, 61, 60, 59, 3888 58, 57, 56, 55, 54, 53, 52, 51, 3889 50, 49, 48, 47, 46, 45, 44, 43, 3890 42, 41, 40, 40, 39, 38, 37, 36, 3891 35, 35, 34, 33, 32, 31, 31, 30, 3892 29, 28, 28, 27, 26, 25, 25, 24, 3893 23, 23, 22, 21, 21, 20, 19, 19, 3894 18, 17, 17, 16, 15, 15, 14, 14, 3895 13, 12, 12, 11, 11, 10, 9, 9, 3896 8, 8, 7, 7, 6, 5, 5, 4, 3897 4, 3, 3, 2, 2, 1, 1, 0 3898 }; 3899 const int precision = 7; 3900 3901 if (exp == 0 && frac != 0) { /* subnormal */ 3902 /* Normalize the subnormal. */ 3903 while (extract64(frac, frac_size - 1, 1) == 0) { 3904 exp--; 3905 frac <<= 1; 3906 } 3907 3908 frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size); 3909 3910 if (exp != 0 && exp != UINT64_MAX) { 3911 /* 3912 * Overflow to inf or max value of same sign, 3913 * depending on sign and rounding mode. 3914 */ 3915 s->float_exception_flags |= (float_flag_inexact | 3916 float_flag_overflow); 3917 3918 if ((s->float_rounding_mode == float_round_to_zero) || 3919 ((s->float_rounding_mode == float_round_down) && !sign) || 3920 ((s->float_rounding_mode == float_round_up) && sign)) { 3921 /* Return greatest/negative finite value. */ 3922 return (sign << (exp_size + frac_size)) | 3923 (MAKE_64BIT_MASK(frac_size, exp_size) - 1); 3924 } else { 3925 /* Return +-inf. */ 3926 return (sign << (exp_size + frac_size)) | 3927 MAKE_64BIT_MASK(frac_size, exp_size); 3928 } 3929 } 3930 } 3931 3932 int idx = frac >> (frac_size - precision); 3933 uint64_t out_frac = (uint64_t)(lookup_table[idx]) << 3934 (frac_size - precision); 3935 uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp; 3936 3937 if (out_exp == 0 || out_exp == UINT64_MAX) { 3938 /* 3939 * The result is subnormal, but don't raise the underflow exception, 3940 * because there's no additional loss of precision. 
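 * The subnormal result is encoded by making the hidden bit explicit:
 * the fraction is shifted right once with its top bit set, and when the
 * computed exponent is -1 (UINT64_MAX here) it is shifted right once
 * more with the stored exponent forced to zero.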
3941 */ 3942 out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1); 3943 if (out_exp == UINT64_MAX) { 3944 out_frac >>= 1; 3945 out_exp = 0; 3946 } 3947 } 3948 3949 uint64_t val = 0; 3950 val = deposit64(val, 0, frac_size, out_frac); 3951 val = deposit64(val, frac_size, exp_size, out_exp); 3952 val = deposit64(val, frac_size + exp_size, 1, sign); 3953 return val; 3954 } 3955 3956 static float16 frec7_h(float16 f, float_status *s) 3957 { 3958 int exp_size = 5, frac_size = 10; 3959 bool sign = float16_is_neg(f); 3960 3961 /* frec7(+-inf) = +-0 */ 3962 if (float16_is_infinity(f)) { 3963 return float16_set_sign(float16_zero, sign); 3964 } 3965 3966 /* frec7(+-0) = +-inf */ 3967 if (float16_is_zero(f)) { 3968 s->float_exception_flags |= float_flag_divbyzero; 3969 return float16_set_sign(float16_infinity, sign); 3970 } 3971 3972 /* frec7(sNaN) = canonical NaN */ 3973 if (float16_is_signaling_nan(f, s)) { 3974 s->float_exception_flags |= float_flag_invalid; 3975 return float16_default_nan(s); 3976 } 3977 3978 /* frec7(qNaN) = canonical NaN */ 3979 if (float16_is_quiet_nan(f, s)) { 3980 return float16_default_nan(s); 3981 } 3982 3983 /* +-normal, +-subnormal */ 3984 uint64_t val = frec7(f, exp_size, frac_size, s); 3985 return make_float16(val); 3986 } 3987 3988 static float32 frec7_s(float32 f, float_status *s) 3989 { 3990 int exp_size = 8, frac_size = 23; 3991 bool sign = float32_is_neg(f); 3992 3993 /* frec7(+-inf) = +-0 */ 3994 if (float32_is_infinity(f)) { 3995 return float32_set_sign(float32_zero, sign); 3996 } 3997 3998 /* frec7(+-0) = +-inf */ 3999 if (float32_is_zero(f)) { 4000 s->float_exception_flags |= float_flag_divbyzero; 4001 return float32_set_sign(float32_infinity, sign); 4002 } 4003 4004 /* frec7(sNaN) = canonical NaN */ 4005 if (float32_is_signaling_nan(f, s)) { 4006 s->float_exception_flags |= float_flag_invalid; 4007 return float32_default_nan(s); 4008 } 4009 4010 /* frec7(qNaN) = canonical NaN */ 4011 if (float32_is_quiet_nan(f, s)) { 4012 return float32_default_nan(s); 4013 } 4014 4015 /* +-normal, +-subnormal */ 4016 uint64_t val = frec7(f, exp_size, frac_size, s); 4017 return make_float32(val); 4018 } 4019 4020 static float64 frec7_d(float64 f, float_status *s) 4021 { 4022 int exp_size = 11, frac_size = 52; 4023 bool sign = float64_is_neg(f); 4024 4025 /* frec7(+-inf) = +-0 */ 4026 if (float64_is_infinity(f)) { 4027 return float64_set_sign(float64_zero, sign); 4028 } 4029 4030 /* frec7(+-0) = +-inf */ 4031 if (float64_is_zero(f)) { 4032 s->float_exception_flags |= float_flag_divbyzero; 4033 return float64_set_sign(float64_infinity, sign); 4034 } 4035 4036 /* frec7(sNaN) = canonical NaN */ 4037 if (float64_is_signaling_nan(f, s)) { 4038 s->float_exception_flags |= float_flag_invalid; 4039 return float64_default_nan(s); 4040 } 4041 4042 /* frec7(qNaN) = canonical NaN */ 4043 if (float64_is_quiet_nan(f, s)) { 4044 return float64_default_nan(s); 4045 } 4046 4047 /* +-normal, +-subnormal */ 4048 uint64_t val = frec7(f, exp_size, frac_size, s); 4049 return make_float64(val); 4050 } 4051 4052 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h) 4053 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s) 4054 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d) 4055 GEN_VEXT_V_ENV(vfrec7_v_h, 2) 4056 GEN_VEXT_V_ENV(vfrec7_v_w, 4) 4057 GEN_VEXT_V_ENV(vfrec7_v_d, 8) 4058 4059 /* Vector Floating-Point MIN/MAX Instructions */ 4060 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number) 4061 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, 
float32_minimum_number) 4062 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number) 4063 GEN_VEXT_VV_ENV(vfmin_vv_h, 2) 4064 GEN_VEXT_VV_ENV(vfmin_vv_w, 4) 4065 GEN_VEXT_VV_ENV(vfmin_vv_d, 8) 4066 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number) 4067 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number) 4068 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number) 4069 GEN_VEXT_VF(vfmin_vf_h, 2) 4070 GEN_VEXT_VF(vfmin_vf_w, 4) 4071 GEN_VEXT_VF(vfmin_vf_d, 8) 4072 4073 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number) 4074 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number) 4075 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number) 4076 GEN_VEXT_VV_ENV(vfmax_vv_h, 2) 4077 GEN_VEXT_VV_ENV(vfmax_vv_w, 4) 4078 GEN_VEXT_VV_ENV(vfmax_vv_d, 8) 4079 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number) 4080 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number) 4081 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number) 4082 GEN_VEXT_VF(vfmax_vf_h, 2) 4083 GEN_VEXT_VF(vfmax_vf_w, 4) 4084 GEN_VEXT_VF(vfmax_vf_d, 8) 4085 4086 /* Vector Floating-Point Sign-Injection Instructions */ 4087 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s) 4088 { 4089 return deposit64(b, 0, 15, a); 4090 } 4091 4092 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s) 4093 { 4094 return deposit64(b, 0, 31, a); 4095 } 4096 4097 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s) 4098 { 4099 return deposit64(b, 0, 63, a); 4100 } 4101 4102 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16) 4103 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32) 4104 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64) 4105 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2) 4106 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4) 4107 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8) 4108 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16) 4109 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32) 4110 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64) 4111 GEN_VEXT_VF(vfsgnj_vf_h, 2) 4112 GEN_VEXT_VF(vfsgnj_vf_w, 4) 4113 GEN_VEXT_VF(vfsgnj_vf_d, 8) 4114 4115 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s) 4116 { 4117 return deposit64(~b, 0, 15, a); 4118 } 4119 4120 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s) 4121 { 4122 return deposit64(~b, 0, 31, a); 4123 } 4124 4125 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s) 4126 { 4127 return deposit64(~b, 0, 63, a); 4128 } 4129 4130 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16) 4131 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32) 4132 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64) 4133 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2) 4134 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4) 4135 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8) 4136 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16) 4137 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32) 4138 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64) 4139 GEN_VEXT_VF(vfsgnjn_vf_h, 2) 4140 GEN_VEXT_VF(vfsgnjn_vf_w, 4) 4141 GEN_VEXT_VF(vfsgnjn_vf_d, 8) 4142 4143 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s) 4144 { 4145 return deposit64(b ^ a, 0, 15, a); 4146 } 4147 4148 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s) 4149 { 4150 return deposit64(b ^ a, 0, 31, a); 4151 } 4152 4153 static uint64_t fsgnjx64(uint64_t a, uint64_t b, 
float_status *s) 4154 { 4155 return deposit64(b ^ a, 0, 63, a); 4156 } 4157 4158 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16) 4159 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32) 4160 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64) 4161 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2) 4162 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4) 4163 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8) 4164 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16) 4165 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32) 4166 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64) 4167 GEN_VEXT_VF(vfsgnjx_vf_h, 2) 4168 GEN_VEXT_VF(vfsgnjx_vf_w, 4) 4169 GEN_VEXT_VF(vfsgnjx_vf_d, 8) 4170 4171 /* Vector Floating-Point Compare Instructions */ 4172 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP) \ 4173 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \ 4174 CPURISCVState *env, uint32_t desc) \ 4175 { \ 4176 uint32_t vm = vext_vm(desc); \ 4177 uint32_t vl = env->vl; \ 4178 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \ 4179 uint32_t vta_all_1s = vext_vta_all_1s(desc); \ 4180 uint32_t vma = vext_vma(desc); \ 4181 uint32_t i; \ 4182 \ 4183 for (i = env->vstart; i < vl; i++) { \ 4184 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \ 4185 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \ 4186 if (!vm && !vext_elem_mask(v0, i)) { \ 4187 /* set masked-off elements to 1s */ \ 4188 if (vma) { \ 4189 vext_set_elem_mask(vd, i, 1); \ 4190 } \ 4191 continue; \ 4192 } \ 4193 vext_set_elem_mask(vd, i, \ 4194 DO_OP(s2, s1, &env->fp_status)); \ 4195 } \ 4196 env->vstart = 0; \ 4197 /* 4198 * mask destination register are always tail-agnostic 4199 * set tail elements to 1s 4200 */ \ 4201 if (vta_all_1s) { \ 4202 for (; i < total_elems; i++) { \ 4203 vext_set_elem_mask(vd, i, 1); \ 4204 } \ 4205 } \ 4206 } 4207 4208 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet) 4209 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet) 4210 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet) 4211 4212 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP) \ 4213 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ 4214 CPURISCVState *env, uint32_t desc) \ 4215 { \ 4216 uint32_t vm = vext_vm(desc); \ 4217 uint32_t vl = env->vl; \ 4218 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \ 4219 uint32_t vta_all_1s = vext_vta_all_1s(desc); \ 4220 uint32_t vma = vext_vma(desc); \ 4221 uint32_t i; \ 4222 \ 4223 for (i = env->vstart; i < vl; i++) { \ 4224 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \ 4225 if (!vm && !vext_elem_mask(v0, i)) { \ 4226 /* set masked-off elements to 1s */ \ 4227 if (vma) { \ 4228 vext_set_elem_mask(vd, i, 1); \ 4229 } \ 4230 continue; \ 4231 } \ 4232 vext_set_elem_mask(vd, i, \ 4233 DO_OP(s2, (ETYPE)s1, &env->fp_status)); \ 4234 } \ 4235 env->vstart = 0; \ 4236 /* 4237 * mask destination register are always tail-agnostic 4238 * set tail elements to 1s 4239 */ \ 4240 if (vta_all_1s) { \ 4241 for (; i < total_elems; i++) { \ 4242 vext_set_elem_mask(vd, i, 1); \ 4243 } \ 4244 } \ 4245 } 4246 4247 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet) 4248 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet) 4249 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet) 4250 4251 static bool vmfne16(uint16_t a, uint16_t b, float_status *s) 4252 { 4253 FloatRelation compare = float16_compare_quiet(a, b, s); 4254 return compare != float_relation_equal; 4255 } 4256 4257 static bool vmfne32(uint32_t a, uint32_t b, float_status *s) 4258 { 4259 FloatRelation compare = 
float32_compare_quiet(a, b, s); 4260 return compare != float_relation_equal; 4261 } 4262 4263 static bool vmfne64(uint64_t a, uint64_t b, float_status *s) 4264 { 4265 FloatRelation compare = float64_compare_quiet(a, b, s); 4266 return compare != float_relation_equal; 4267 } 4268 4269 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16) 4270 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32) 4271 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64) 4272 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16) 4273 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32) 4274 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64) 4275 4276 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt) 4277 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt) 4278 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt) 4279 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt) 4280 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt) 4281 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt) 4282 4283 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le) 4284 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le) 4285 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le) 4286 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le) 4287 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le) 4288 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le) 4289 4290 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s) 4291 { 4292 FloatRelation compare = float16_compare(a, b, s); 4293 return compare == float_relation_greater; 4294 } 4295 4296 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s) 4297 { 4298 FloatRelation compare = float32_compare(a, b, s); 4299 return compare == float_relation_greater; 4300 } 4301 4302 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s) 4303 { 4304 FloatRelation compare = float64_compare(a, b, s); 4305 return compare == float_relation_greater; 4306 } 4307 4308 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16) 4309 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32) 4310 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64) 4311 4312 static bool vmfge16(uint16_t a, uint16_t b, float_status *s) 4313 { 4314 FloatRelation compare = float16_compare(a, b, s); 4315 return compare == float_relation_greater || 4316 compare == float_relation_equal; 4317 } 4318 4319 static bool vmfge32(uint32_t a, uint32_t b, float_status *s) 4320 { 4321 FloatRelation compare = float32_compare(a, b, s); 4322 return compare == float_relation_greater || 4323 compare == float_relation_equal; 4324 } 4325 4326 static bool vmfge64(uint64_t a, uint64_t b, float_status *s) 4327 { 4328 FloatRelation compare = float64_compare(a, b, s); 4329 return compare == float_relation_greater || 4330 compare == float_relation_equal; 4331 } 4332 4333 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16) 4334 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32) 4335 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64) 4336 4337 /* Vector Floating-Point Classify Instruction */ 4338 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP) \ 4339 static void do_##NAME(void *vd, void *vs2, int i) \ 4340 { \ 4341 TX2 s2 = *((T2 *)vs2 + HS2(i)); \ 4342 *((TD *)vd + HD(i)) = OP(s2); \ 4343 } 4344 4345 #define GEN_VEXT_V(NAME, ESZ) \ 4346 void HELPER(NAME)(void *vd, void *v0, void *vs2, \ 4347 CPURISCVState *env, uint32_t desc) \ 4348 { \ 4349 uint32_t vm = vext_vm(desc); \ 4350 uint32_t vl = env->vl; \ 4351 uint32_t total_elems = \ 4352 vext_get_total_elems(env, desc, ESZ); \ 4353 uint32_t vta = vext_vta(desc); \ 4354 
uint32_t vma = vext_vma(desc); \ 4355 uint32_t i; \ 4356 \ 4357 for (i = env->vstart; i < vl; i++) { \ 4358 if (!vm && !vext_elem_mask(v0, i)) { \ 4359 /* set masked-off elements to 1s */ \ 4360 vext_set_elems_1s(vd, vma, i * ESZ, \ 4361 (i + 1) * ESZ); \ 4362 continue; \ 4363 } \ 4364 do_##NAME(vd, vs2, i); \ 4365 } \ 4366 env->vstart = 0; \ 4367 /* set tail elements to 1s */ \ 4368 vext_set_elems_1s(vd, vta, vl * ESZ, \ 4369 total_elems * ESZ); \ 4370 } 4371 4372 target_ulong fclass_h(uint64_t frs1) 4373 { 4374 float16 f = frs1; 4375 bool sign = float16_is_neg(f); 4376 4377 if (float16_is_infinity(f)) { 4378 return sign ? 1 << 0 : 1 << 7; 4379 } else if (float16_is_zero(f)) { 4380 return sign ? 1 << 3 : 1 << 4; 4381 } else if (float16_is_zero_or_denormal(f)) { 4382 return sign ? 1 << 2 : 1 << 5; 4383 } else if (float16_is_any_nan(f)) { 4384 float_status s = { }; /* for snan_bit_is_one */ 4385 return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8; 4386 } else { 4387 return sign ? 1 << 1 : 1 << 6; 4388 } 4389 } 4390 4391 target_ulong fclass_s(uint64_t frs1) 4392 { 4393 float32 f = frs1; 4394 bool sign = float32_is_neg(f); 4395 4396 if (float32_is_infinity(f)) { 4397 return sign ? 1 << 0 : 1 << 7; 4398 } else if (float32_is_zero(f)) { 4399 return sign ? 1 << 3 : 1 << 4; 4400 } else if (float32_is_zero_or_denormal(f)) { 4401 return sign ? 1 << 2 : 1 << 5; 4402 } else if (float32_is_any_nan(f)) { 4403 float_status s = { }; /* for snan_bit_is_one */ 4404 return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8; 4405 } else { 4406 return sign ? 1 << 1 : 1 << 6; 4407 } 4408 } 4409 4410 target_ulong fclass_d(uint64_t frs1) 4411 { 4412 float64 f = frs1; 4413 bool sign = float64_is_neg(f); 4414 4415 if (float64_is_infinity(f)) { 4416 return sign ? 1 << 0 : 1 << 7; 4417 } else if (float64_is_zero(f)) { 4418 return sign ? 1 << 3 : 1 << 4; 4419 } else if (float64_is_zero_or_denormal(f)) { 4420 return sign ? 1 << 2 : 1 << 5; 4421 } else if (float64_is_any_nan(f)) { 4422 float_status s = { }; /* for snan_bit_is_one */ 4423 return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8; 4424 } else { 4425 return sign ? 1 << 1 : 1 << 6; 4426 } 4427 } 4428 4429 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h) 4430 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s) 4431 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d) 4432 GEN_VEXT_V(vfclass_v_h, 2) 4433 GEN_VEXT_V(vfclass_v_w, 4) 4434 GEN_VEXT_V(vfclass_v_d, 8) 4435 4436 /* Vector Floating-Point Merge Instruction */ 4437 4438 #define GEN_VFMERGE_VF(NAME, ETYPE, H) \ 4439 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \ 4440 CPURISCVState *env, uint32_t desc) \ 4441 { \ 4442 uint32_t vm = vext_vm(desc); \ 4443 uint32_t vl = env->vl; \ 4444 uint32_t esz = sizeof(ETYPE); \ 4445 uint32_t total_elems = \ 4446 vext_get_total_elems(env, desc, esz); \ 4447 uint32_t vta = vext_vta(desc); \ 4448 uint32_t i; \ 4449 \ 4450 for (i = env->vstart; i < vl; i++) { \ 4451 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \ 4452 *((ETYPE *)vd + H(i)) = \ 4453 (!vm && !vext_elem_mask(v0, i) ? s2 : s1); \ 4454 } \ 4455 env->vstart = 0; \ 4456 /* set tail elements to 1s */ \ 4457 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \ 4458 } 4459 4460 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2) 4461 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4) 4462 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8) 4463 4464 /* Single-Width Floating-Point/Integer Type-Convert Instructions */ 4465 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. 
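 *
 * These single-width conversions reuse the OPFVV1/GEN_VEXT_V_ENV
 * scaffolding above: each active element is converted through
 * &env->fp_status, e.g. vfcvt_xu_f_v_h ends up calling
 * float16_to_uint16(vs2[i], &env->fp_status), so the conversion uses
 * whatever rounding mode has been programmed into fp_status and
 * accumulates its exception flags there.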
*/ 4466 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16) 4467 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32) 4468 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64) 4469 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2) 4470 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4) 4471 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8) 4472 4473 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */ 4474 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16) 4475 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32) 4476 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64) 4477 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2) 4478 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4) 4479 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8) 4480 4481 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */ 4482 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16) 4483 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32) 4484 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64) 4485 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2) 4486 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4) 4487 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8) 4488 4489 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */ 4490 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16) 4491 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32) 4492 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64) 4493 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2) 4494 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4) 4495 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8) 4496 4497 /* Widening Floating-Point/Integer Type-Convert Instructions */ 4498 /* (TD, T2, TX2) */ 4499 #define WOP_UU_B uint16_t, uint8_t, uint8_t 4500 #define WOP_UU_H uint32_t, uint16_t, uint16_t 4501 #define WOP_UU_W uint64_t, uint32_t, uint32_t 4502 /* 4503 * vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. 4504 */ 4505 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32) 4506 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64) 4507 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4) 4508 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8) 4509 4510 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */ 4511 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32) 4512 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64) 4513 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4) 4514 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8) 4515 4516 /* 4517 * vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float. 4518 */ 4519 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16) 4520 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32) 4521 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64) 4522 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2) 4523 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4) 4524 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8) 4525 4526 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */ 4527 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16) 4528 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32) 4529 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64) 4530 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2) 4531 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4) 4532 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8) 4533 4534 /* 4535 * vfwcvt.f.f.v vd, vs2, vm # Convert single-width float to double-width float. 
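 *
 * The half-to-single case cannot pass float16_to_float32 to RVVCALL
 * directly because that softfloat helper takes an extra 'ieee' flag, so
 * the vfwcvtffv16 wrapper below pins it to true (IEEE half-precision
 * rather than the alternative half format); float32_to_float64 needs no
 * such wrapper.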
4536 */ 4537 static uint32_t vfwcvtffv16(uint16_t a, float_status *s) 4538 { 4539 return float16_to_float32(a, true, s); 4540 } 4541 4542 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16) 4543 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64) 4544 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4) 4545 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8) 4546 4547 /* Narrowing Floating-Point/Integer Type-Convert Instructions */ 4548 /* (TD, T2, TX2) */ 4549 #define NOP_UU_B uint8_t, uint16_t, uint32_t 4550 #define NOP_UU_H uint16_t, uint32_t, uint32_t 4551 #define NOP_UU_W uint32_t, uint64_t, uint64_t 4552 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */ 4553 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8) 4554 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16) 4555 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32) 4556 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1) 4557 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2) 4558 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4) 4559 4560 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */ 4561 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8) 4562 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16) 4563 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32) 4564 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1) 4565 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2) 4566 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4) 4567 4568 /* 4569 * vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float. 4570 */ 4571 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16) 4572 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32) 4573 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2) 4574 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4) 4575 4576 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */ 4577 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16) 4578 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32) 4579 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2) 4580 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4) 4581 4582 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. 
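 *
 * As with the widening form, float32_to_float16 takes an 'ieee' flag in
 * addition to the value and the float_status, so the vfncvtffv16
 * wrapper below pins it to true; the narrowing itself may raise
 * overflow, underflow and inexact through &env->fp_status.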
*/ 4583 static uint16_t vfncvtffv16(uint32_t a, float_status *s) 4584 { 4585 return float32_to_float16(a, true, s); 4586 } 4587 4588 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16) 4589 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32) 4590 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2) 4591 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4) 4592 4593 /* 4594 * Vector Reduction Operations 4595 */ 4596 /* Vector Single-Width Integer Reduction Instructions */ 4597 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP) \ 4598 void HELPER(NAME)(void *vd, void *v0, void *vs1, \ 4599 void *vs2, CPURISCVState *env, \ 4600 uint32_t desc) \ 4601 { \ 4602 uint32_t vm = vext_vm(desc); \ 4603 uint32_t vl = env->vl; \ 4604 uint32_t esz = sizeof(TD); \ 4605 uint32_t vlenb = simd_maxsz(desc); \ 4606 uint32_t vta = vext_vta(desc); \ 4607 uint32_t i; \ 4608 TD s1 = *((TD *)vs1 + HD(0)); \ 4609 \ 4610 for (i = env->vstart; i < vl; i++) { \ 4611 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \ 4612 if (!vm && !vext_elem_mask(v0, i)) { \ 4613 continue; \ 4614 } \ 4615 s1 = OP(s1, (TD)s2); \ 4616 } \ 4617 *((TD *)vd + HD(0)) = s1; \ 4618 env->vstart = 0; \ 4619 /* set tail elements to 1s */ \ 4620 vext_set_elems_1s(vd, vta, esz, vlenb); \ 4621 } 4622 4623 /* vd[0] = sum(vs1[0], vs2[*]) */ 4624 GEN_VEXT_RED(vredsum_vs_b, int8_t, int8_t, H1, H1, DO_ADD) 4625 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD) 4626 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD) 4627 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD) 4628 4629 /* vd[0] = maxu(vs1[0], vs2[*]) */ 4630 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t, uint8_t, H1, H1, DO_MAX) 4631 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX) 4632 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX) 4633 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX) 4634 4635 /* vd[0] = max(vs1[0], vs2[*]) */ 4636 GEN_VEXT_RED(vredmax_vs_b, int8_t, int8_t, H1, H1, DO_MAX) 4637 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX) 4638 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX) 4639 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX) 4640 4641 /* vd[0] = minu(vs1[0], vs2[*]) */ 4642 GEN_VEXT_RED(vredminu_vs_b, uint8_t, uint8_t, H1, H1, DO_MIN) 4643 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN) 4644 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN) 4645 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN) 4646 4647 /* vd[0] = min(vs1[0], vs2[*]) */ 4648 GEN_VEXT_RED(vredmin_vs_b, int8_t, int8_t, H1, H1, DO_MIN) 4649 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN) 4650 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN) 4651 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN) 4652 4653 /* vd[0] = and(vs1[0], vs2[*]) */ 4654 GEN_VEXT_RED(vredand_vs_b, int8_t, int8_t, H1, H1, DO_AND) 4655 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND) 4656 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND) 4657 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND) 4658 4659 /* vd[0] = or(vs1[0], vs2[*]) */ 4660 GEN_VEXT_RED(vredor_vs_b, int8_t, int8_t, H1, H1, DO_OR) 4661 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR) 4662 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR) 4663 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR) 4664 4665 /* vd[0] = xor(vs1[0], vs2[*]) */ 4666 GEN_VEXT_RED(vredxor_vs_b, int8_t, int8_t, H1, H1, DO_XOR) 4667 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, 
DO_XOR) 4668 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR) 4669 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR) 4670 4671 /* Vector Widening Integer Reduction Instructions */ 4672 /* signed sum reduction into double-width accumulator */ 4673 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t, H2, H1, DO_ADD) 4674 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD) 4675 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD) 4676 4677 /* Unsigned sum reduction into double-width accumulator */ 4678 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t, H2, H1, DO_ADD) 4679 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD) 4680 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD) 4681 4682 /* Vector Single-Width Floating-Point Reduction Instructions */ 4683 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP) \ 4684 void HELPER(NAME)(void *vd, void *v0, void *vs1, \ 4685 void *vs2, CPURISCVState *env, \ 4686 uint32_t desc) \ 4687 { \ 4688 uint32_t vm = vext_vm(desc); \ 4689 uint32_t vl = env->vl; \ 4690 uint32_t esz = sizeof(TD); \ 4691 uint32_t vlenb = simd_maxsz(desc); \ 4692 uint32_t vta = vext_vta(desc); \ 4693 uint32_t i; \ 4694 TD s1 = *((TD *)vs1 + HD(0)); \ 4695 \ 4696 for (i = env->vstart; i < vl; i++) { \ 4697 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \ 4698 if (!vm && !vext_elem_mask(v0, i)) { \ 4699 continue; \ 4700 } \ 4701 s1 = OP(s1, (TD)s2, &env->fp_status); \ 4702 } \ 4703 *((TD *)vd + HD(0)) = s1; \ 4704 env->vstart = 0; \ 4705 /* set tail elements to 1s */ \ 4706 vext_set_elems_1s(vd, vta, esz, vlenb); \ 4707 } 4708 4709 /* Unordered sum */ 4710 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add) 4711 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add) 4712 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add) 4713 4714 /* Ordered sum */ 4715 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add) 4716 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add) 4717 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add) 4718 4719 /* Maximum value */ 4720 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, 4721 float16_maximum_number) 4722 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, 4723 float32_maximum_number) 4724 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, 4725 float64_maximum_number) 4726 4727 /* Minimum value */ 4728 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, 4729 float16_minimum_number) 4730 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, 4731 float32_minimum_number) 4732 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, 4733 float64_minimum_number) 4734 4735 /* Vector Widening Floating-Point Add Instructions */ 4736 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s) 4737 { 4738 return float32_add(a, float16_to_float32(b, true, s), s); 4739 } 4740 4741 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s) 4742 { 4743 return float64_add(a, float32_to_float64(b, s), s); 4744 } 4745 4746 /* Vector Widening Floating-Point Reduction Instructions */ 4747 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */ 4748 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16) 4749 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32) 4750 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16) 4751 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32) 4752 4753 /* 4754 * Vector Mask Operations 4755 
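 *
 * Mask registers hold one bit per element, accessed through
 * vext_elem_mask() and vext_set_elem_mask().  Note the operand order in
 * the logical helpers below: OP(b, a) passes the vs2 bit first, so e.g.
 * DO_ANDNOT yields vs2[i] & !vs1[i] for vmandn.mm.  Mask-producing
 * instructions are always tail-agnostic, which is why these helpers
 * test vta_all_1s rather than vta.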
*/ 4756 /* Vector Mask-Register Logical Instructions */ 4757 #define GEN_VEXT_MASK_VV(NAME, OP) \ 4758 void HELPER(NAME)(void *vd, void *v0, void *vs1, \ 4759 void *vs2, CPURISCVState *env, \ 4760 uint32_t desc) \ 4761 { \ 4762 uint32_t vl = env->vl; \ 4763 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \ 4764 uint32_t vta_all_1s = vext_vta_all_1s(desc); \ 4765 uint32_t i; \ 4766 int a, b; \ 4767 \ 4768 for (i = env->vstart; i < vl; i++) { \ 4769 a = vext_elem_mask(vs1, i); \ 4770 b = vext_elem_mask(vs2, i); \ 4771 vext_set_elem_mask(vd, i, OP(b, a)); \ 4772 } \ 4773 env->vstart = 0; \ 4774 /* 4775 * mask destination register are always tail-agnostic 4776 * set tail elements to 1s 4777 */ \ 4778 if (vta_all_1s) { \ 4779 for (; i < total_elems; i++) { \ 4780 vext_set_elem_mask(vd, i, 1); \ 4781 } \ 4782 } \ 4783 } 4784 4785 #define DO_NAND(N, M) (!(N & M)) 4786 #define DO_ANDNOT(N, M) (N & !M) 4787 #define DO_NOR(N, M) (!(N | M)) 4788 #define DO_ORNOT(N, M) (N | !M) 4789 #define DO_XNOR(N, M) (!(N ^ M)) 4790 4791 GEN_VEXT_MASK_VV(vmand_mm, DO_AND) 4792 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND) 4793 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT) 4794 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR) 4795 GEN_VEXT_MASK_VV(vmor_mm, DO_OR) 4796 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR) 4797 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT) 4798 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR) 4799 4800 /* Vector count population in mask vcpop */ 4801 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env, 4802 uint32_t desc) 4803 { 4804 target_ulong cnt = 0; 4805 uint32_t vm = vext_vm(desc); 4806 uint32_t vl = env->vl; 4807 int i; 4808 4809 for (i = env->vstart; i < vl; i++) { 4810 if (vm || vext_elem_mask(v0, i)) { 4811 if (vext_elem_mask(vs2, i)) { 4812 cnt++; 4813 } 4814 } 4815 } 4816 env->vstart = 0; 4817 return cnt; 4818 } 4819 4820 /* vfirst find-first-set mask bit */ 4821 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env, 4822 uint32_t desc) 4823 { 4824 uint32_t vm = vext_vm(desc); 4825 uint32_t vl = env->vl; 4826 int i; 4827 4828 for (i = env->vstart; i < vl; i++) { 4829 if (vm || vext_elem_mask(v0, i)) { 4830 if (vext_elem_mask(vs2, i)) { 4831 return i; 4832 } 4833 } 4834 } 4835 env->vstart = 0; 4836 return -1LL; 4837 } 4838 4839 enum set_mask_type { 4840 ONLY_FIRST = 1, 4841 INCLUDE_FIRST, 4842 BEFORE_FIRST, 4843 }; 4844 4845 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env, 4846 uint32_t desc, enum set_mask_type type) 4847 { 4848 uint32_t vm = vext_vm(desc); 4849 uint32_t vl = env->vl; 4850 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; 4851 uint32_t vta_all_1s = vext_vta_all_1s(desc); 4852 uint32_t vma = vext_vma(desc); 4853 int i; 4854 bool first_mask_bit = false; 4855 4856 for (i = env->vstart; i < vl; i++) { 4857 if (!vm && !vext_elem_mask(v0, i)) { 4858 /* set masked-off elements to 1s */ 4859 if (vma) { 4860 vext_set_elem_mask(vd, i, 1); 4861 } 4862 continue; 4863 } 4864 /* write a zero to all following active elements */ 4865 if (first_mask_bit) { 4866 vext_set_elem_mask(vd, i, 0); 4867 continue; 4868 } 4869 if (vext_elem_mask(vs2, i)) { 4870 first_mask_bit = true; 4871 if (type == BEFORE_FIRST) { 4872 vext_set_elem_mask(vd, i, 0); 4873 } else { 4874 vext_set_elem_mask(vd, i, 1); 4875 } 4876 } else { 4877 if (type == ONLY_FIRST) { 4878 vext_set_elem_mask(vd, i, 0); 4879 } else { 4880 vext_set_elem_mask(vd, i, 1); 4881 } 4882 } 4883 } 4884 env->vstart = 0; 4885 /* 4886 * mask destination register are always tail-agnostic 4887 * set tail elements to 1s 4888 */ 4889 if (vta_all_1s) 

enum set_mask_type {
    ONLY_FIRST = 1,
    INCLUDE_FIRST,
    BEFORE_FIRST,
};

static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
                   uint32_t desc, enum set_mask_type type)
{
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
    uint32_t vta_all_1s = vext_vta_all_1s(desc);
    uint32_t vma = vext_vma(desc);
    int i;
    bool first_mask_bit = false;

    for (i = env->vstart; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, i)) {
            /* set masked-off elements to 1s */
            if (vma) {
                vext_set_elem_mask(vd, i, 1);
            }
            continue;
        }
        /* write a zero to all following active elements */
        if (first_mask_bit) {
            vext_set_elem_mask(vd, i, 0);
            continue;
        }
        if (vext_elem_mask(vs2, i)) {
            first_mask_bit = true;
            if (type == BEFORE_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        } else {
            if (type == ONLY_FIRST) {
                vext_set_elem_mask(vd, i, 0);
            } else {
                vext_set_elem_mask(vd, i, 1);
            }
        }
    }
    env->vstart = 0;
    /*
     * The mask destination register is always tail-agnostic;
     * set tail elements to 1s.
     */
    if (vta_all_1s) {
        for (; i < total_elems; i++) {
            vext_set_elem_mask(vd, i, 1);
        }
    }
}

void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}

void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}

void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}

/* Vector Iota Instruction */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env, \
                  uint32_t desc) \
{ \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint32_t sum = 0; \
    int i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        *((ETYPE *)vd + H(i)) = sum; \
        if (vext_elem_mask(vs2, i)) { \
            sum++; \
        } \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

GEN_VEXT_VIOTA_M(viota_m_b, uint8_t, H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)

/* Vector Element Index Instruction */
#define GEN_VEXT_VID_V(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    int i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        *((ETYPE *)vd + H(i)) = i; \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

GEN_VEXT_VID_V(vid_v_b, uint8_t, H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
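
/*
 * Worked example (illustrative values): with all elements active and a
 * vs2 mask of 0b00010100, vmsbf.m yields vd = 0b00000011, vmsif.m yields
 * vd = 0b00000111 and vmsof.m yields vd = 0b00000100.  For the same mask
 * and vl = 8, viota.m writes the running count of set vs2 bits below each
 * index, vd = {0, 0, 0, 1, 1, 2, 2, 2}, while vid.v simply writes the
 * element indices vd = {0, 1, 2, 3, 4, 5, 6, 7}.
 */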

/*
 * Vector Permutation Instructions
 */

/* Vector Slide Instructions */
#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    target_ulong offset = s1, i_min, i; \
    \
    i_min = MAX(env->vstart, offset); \
    for (i = i_min; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

/* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t, H1)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)

#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    target_ulong i_max, i; \
    \
    i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart); \
    for (i = env->vstart; i < i_max; ++i) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1)); \
    } \
    \
    for (i = i_max; i < vl; ++i) { \
        if (vm || vext_elem_mask(v0, i)) { \
            *((ETYPE *)vd + H(i)) = 0; \
        } \
    } \
    \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

/* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t, H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
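
/*
 * Worked example (illustrative values): with vl = VLMAX = 4, OFFSET = 1
 * and vs2 = {10, 20, 30, 40}, vslideup.vx leaves vd[0] unchanged and
 * writes vd[1..3] = {10, 20, 30}, while vslidedown.vx writes
 * vd = {20, 30, 40, 0} (elements sourced from beyond VLMAX read as zero).
 */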

#define GEN_VEXT_VSLIE1UP(BITWIDTH, H) \
static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1, \
                                 void *vs2, CPURISCVState *env, \
                                 uint32_t desc) \
{ \
    typedef uint##BITWIDTH##_t ETYPE; \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint32_t i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        if (i == 0) { \
            *((ETYPE *)vd + H(i)) = s1; \
        } else { \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1)); \
        } \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

GEN_VEXT_VSLIE1UP(8, H1)
GEN_VEXT_VSLIE1UP(16, H2)
GEN_VEXT_VSLIE1UP(32, H4)
GEN_VEXT_VSLIE1UP(64, H8)

#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)

#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H) \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1, \
                                   void *vs2, CPURISCVState *env, \
                                   uint32_t desc) \
{ \
    typedef uint##BITWIDTH##_t ETYPE; \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint32_t i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        if (i == vl - 1) { \
            *((ETYPE *)vd + H(i)) = s1; \
        } else { \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1)); \
        } \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

GEN_VEXT_VSLIDE1DOWN(8, H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)

#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)

/* Vector Floating-Point Slide Instructions */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH) \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)

#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH) \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
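
/*
 * Worked example (illustrative values): with vl = 4, x[rs1] = 99 and
 * vs2 = {10, 20, 30, 40}, vslide1up.vx writes vd = {99, 10, 20, 30} and
 * vslide1down.vx writes vd = {20, 30, 40, 99}.  The vfslide1up.vf and
 * vfslide1down.vf helpers above reuse the same code, with the scalar
 * taken from a floating-point register instead of an x register.
 */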

/* Vector Register Gather Instruction */
#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2) \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2))); \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(TS2); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint64_t index; \
    uint32_t i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        index = *((TS1 *)vs1 + HS1(i)); \
        if (index >= vlmax) { \
            *((TS2 *)vd + HS2(i)) = 0; \
        } else { \
            *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index)); \
        } \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

/* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t, uint8_t, H1, H1)
GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)

GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t, H2, H1)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
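
/*
 * Worked example (illustrative values): with vl = VLMAX = 4,
 * vs2 = {10, 20, 30, 40} and index vector vs1 = {3, 3, 0, 7},
 * vrgather.vv writes vd = {40, 40, 10, 0}; out-of-range indices read
 * as zero.  The vrgatherei16 variants gather with the same rule but
 * always take their indices from 16-bit source elements, regardless
 * of SEW.
 */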

#define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
    uint32_t vm = vext_vm(desc); \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint64_t index = s1; \
    uint32_t i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        if (index >= vlmax) { \
            *((ETYPE *)vd + H(i)) = 0; \
        } else { \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index)); \
        } \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

/* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t, H1)
GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)

/* Vector Compress Instruction */
#define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H) \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vl = env->vl; \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t num = 0, i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vext_elem_mask(vs1, i)) { \
            continue; \
        } \
        *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i)); \
        num++; \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

/* Compress into vd elements of vs2 where vs1 is enabled */
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t, H1)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)

/* Vector Whole Register Move */
void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    /* EEW = SEW */
    uint32_t maxsz = simd_maxsz(desc);
    uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t startb = env->vstart * sewb;
    uint32_t i = startb;

    memcpy((uint8_t *)vd + H1(i),
           (uint8_t *)vs2 + H1(i),
           maxsz - startb);

    env->vstart = 0;
}

/* Vector Integer Extension */
#define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1) \
void HELPER(NAME)(void *vd, void *v0, void *vs2, \
                  CPURISCVState *env, uint32_t desc) \
{ \
    uint32_t vl = env->vl; \
    uint32_t vm = vext_vm(desc); \
    uint32_t esz = sizeof(ETYPE); \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
    uint32_t vta = vext_vta(desc); \
    uint32_t vma = vext_vma(desc); \
    uint32_t i; \
    \
    for (i = env->vstart; i < vl; i++) { \
        if (!vm && !vext_elem_mask(v0, i)) { \
            /* set masked-off elements to 1s */ \
            vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
            continue; \
        } \
        *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i)); \
    } \
    env->vstart = 0; \
    /* set tail elements to 1s */ \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
}

GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t, H2, H1)
GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t, H4, H1)
GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t, H8, H1)

GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t, H2, H1)
GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t, H4, H1)
GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t, H8, H1)
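
/*
 * Worked example (illustrative values): vzext.vf4 with SEW = 32 widens
 * each byte of the source operand to a 32-bit element, so a source byte
 * of 0xff becomes 0x000000ff, while vsext.vf4 sign-extends the same byte
 * to 0xffffffff.
 */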