1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 /* 3 * QEMU LoongArch vector helper functions. 4 * 5 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited 6 */ 7 8 #include "qemu/osdep.h" 9 #include "cpu.h" 10 #include "exec/exec-all.h" 11 #include "exec/helper-proto.h" 12 #include "fpu/softfloat.h" 13 #include "internals.h" 14 #include "tcg/tcg.h" 15 #include "vec.h" 16 #include "tcg/tcg-gvec-desc.h" 17 18 #define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \ 19 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 20 { \ 21 int i; \ 22 VReg *Vd = (VReg *)vd; \ 23 VReg *Vj = (VReg *)vj; \ 24 VReg *Vk = (VReg *)vk; \ 25 typedef __typeof(Vd->E1(0)) TD; \ 26 int oprsz = simd_oprsz(desc); \ 27 \ 28 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 29 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \ 30 } \ 31 } 32 33 DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD) 34 DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD) 35 DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD) 36 37 void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 38 { 39 int i; 40 VReg *Vd = (VReg *)vd; 41 VReg *Vj = (VReg *)vj; 42 VReg *Vk = (VReg *)vk; 43 int oprsz = simd_oprsz(desc); 44 45 for (i = 0; i < oprsz / 16 ; i++) { 46 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)), 47 int128_makes64(Vk->D(2 * i))); 48 } 49 } 50 51 DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB) 52 DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB) 53 DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB) 54 55 void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 56 { 57 int i; 58 VReg *Vd = (VReg *)vd; 59 VReg *Vj = (VReg *)vj; 60 VReg *Vk = (VReg *)vk; 61 int oprsz = simd_oprsz(desc); 62 63 for (i = 0; i < oprsz / 16; i++) { 64 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), 65 int128_makes64(Vk->D(2 * i))); 66 } 67 } 68 69 DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD) 70 DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD) 71 DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD) 72 73 void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) 74 { 75 int i; 76 VReg *Vd = (VReg *)vd; 77 VReg *Vj = (VReg *)vj; 78 VReg *Vk = (VReg *)vk; 79 int oprsz = simd_oprsz(desc); 80 81 for (i = 0; i < oprsz / 16; i ++) { 82 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 83 int128_make64(Vk->UD(2 * i))); 84 } 85 } 86 87 DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB) 88 DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB) 89 DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB) 90 91 void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) 92 { 93 int i; 94 VReg *Vd = (VReg *)vd; 95 VReg *Vj = (VReg *)vj; 96 VReg *Vk = (VReg *)vk; 97 int oprsz = simd_oprsz(desc); 98 99 for (i = 0; i < oprsz / 16; i++) { 100 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), 101 int128_make64(Vk->UD(2 * i))); 102 } 103 } 104 105 #define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \ 106 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 107 { \ 108 int i; \ 109 VReg *Vd = (VReg *)vd; \ 110 VReg *Vj = (VReg *)vj; \ 111 VReg *Vk = (VReg *)vk; \ 112 typedef __typeof(Vd->E1(0)) TD; \ 113 int oprsz = simd_oprsz(desc); \ 114 \ 115 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 116 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i) ,(TD)Vk->E2(2 * i)); \ 117 } \ 118 } 119 120 #define DO_ODD(NAME, BIT, E1, E2, DO_OP) \ 121 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 122 { \ 123 int i; \ 124 VReg *Vd = (VReg *)vd; \ 125 VReg *Vj = (VReg *)vj; \ 126 VReg *Vk = (VReg *)vk; \ 127 typedef __typeof(Vd->E1(0)) TD; \ 128 int oprsz = simd_oprsz(desc); \ 129 
\ 130 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 131 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \ 132 } \ 133 } 134 135 void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 136 { 137 int i; 138 VReg *Vd = (VReg *)vd; 139 VReg *Vj = (VReg *)vj; 140 VReg *Vk = (VReg *)vk; 141 int oprsz = simd_oprsz(desc); 142 143 for (i = 0; i < oprsz / 16; i++) { 144 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)), 145 int128_makes64(Vk->D(2 * i))); 146 } 147 } 148 149 DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD) 150 DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD) 151 DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD) 152 153 void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 154 { 155 int i; 156 VReg *Vd = (VReg *)vd; 157 VReg *Vj = (VReg *)vj; 158 VReg *Vk = (VReg *)vk; 159 int oprsz = simd_oprsz(desc); 160 161 for (i = 0; i < oprsz / 16; i++) { 162 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)), 163 int128_makes64(Vk->D(2 * i +1))); 164 } 165 } 166 167 DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD) 168 DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD) 169 DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD) 170 171 void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 172 { 173 int i; 174 VReg *Vd = (VReg *)vd; 175 VReg *Vj = (VReg *)vj; 176 VReg *Vk = (VReg *)vk; 177 int oprsz = simd_oprsz(desc); 178 179 for (i = 0; i < oprsz / 16; i++) { 180 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)), 181 int128_makes64(Vk->D(2 * i))); 182 } 183 } 184 185 DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB) 186 DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB) 187 DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB) 188 189 void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) 190 { 191 int i; 192 VReg *Vd = (VReg *)vd; 193 VReg *Vj = (VReg *)vj; 194 VReg *Vk = (VReg *)vk; 195 int oprsz = simd_oprsz(desc); 196 197 for (i = 0; i < oprsz / 16; i++) { 198 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), 199 int128_makes64(Vk->D(2 * i + 1))); 200 } 201 } 202 203 DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB) 204 DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB) 205 DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB) 206 207 void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 208 { 209 int i; 210 VReg *Vd = (VReg *)vd; 211 VReg *Vj = (VReg *)vj; 212 VReg *Vk = (VReg *)vk; 213 int oprsz = simd_oprsz(desc); 214 215 for (i = 0; i < oprsz / 16; i++) { 216 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), 217 int128_make64(Vk->UD(2 * i))); 218 } 219 } 220 221 DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD) 222 DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD) 223 DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD) 224 225 void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 226 { 227 int i; 228 VReg *Vd = (VReg *)vd; 229 VReg *Vj = (VReg *)vj; 230 VReg *Vk = (VReg *)vk; 231 int oprsz = simd_oprsz(desc); 232 233 for (i = 0; i < oprsz / 16; i++) { 234 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 235 int128_make64(Vk->UD(2 * i + 1))); 236 } 237 } 238 239 DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD) 240 DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD) 241 DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD) 242 243 void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 244 { 245 int i; 246 VReg *Vd = (VReg *)vd; 247 VReg *Vj = (VReg *)vj; 248 VReg *Vk = (VReg *)vk; 249 int oprsz = simd_oprsz(desc); 250 251 for (i = 0; i < oprsz / 16; i++) { 252 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)), 253 int128_make64(Vk->UD(2 * i))); 254 } 255 } 256 257 DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB) 258 DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB) 
259 DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB) 260 261 void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) 262 { 263 int i; 264 VReg *Vd = (VReg *)vd; 265 VReg *Vj = (VReg *)vj; 266 VReg *Vk = (VReg *)vk; 267 int oprsz = simd_oprsz(desc); 268 269 for (i = 0; i < oprsz / 16; i++) { 270 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), 271 int128_make64(Vk->UD(2 * i + 1))); 272 } 273 } 274 275 DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB) 276 DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB) 277 DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB) 278 279 #define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 280 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 281 { \ 282 int i; \ 283 VReg *Vd = (VReg *)vd; \ 284 VReg *Vj = (VReg *)vj; \ 285 VReg *Vk = (VReg *)vk; \ 286 typedef __typeof(Vd->ES1(0)) TDS; \ 287 typedef __typeof(Vd->EU1(0)) TDU; \ 288 int oprsz = simd_oprsz(desc); \ 289 \ 290 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 291 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i) ,(TDS)Vk->ES2(2 * i)); \ 292 } \ 293 } 294 295 #define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 296 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 297 { \ 298 int i; \ 299 VReg *Vd = (VReg *)vd; \ 300 VReg *Vj = (VReg *)vj; \ 301 VReg *Vk = (VReg *)vk; \ 302 typedef __typeof(Vd->ES1(0)) TDS; \ 303 typedef __typeof(Vd->EU1(0)) TDU; \ 304 int oprsz = simd_oprsz(desc); \ 305 \ 306 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 307 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 1), (TDS)Vk->ES2(2 * i + 1)); \ 308 } \ 309 } 310 311 void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) 312 { 313 int i; 314 VReg *Vd = (VReg *)vd; 315 VReg *Vj = (VReg *)vj; 316 VReg *Vk = (VReg *)vk; 317 int oprsz = simd_oprsz(desc); 318 319 for (i = 0; i < oprsz / 16; i++) { 320 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), 321 int128_makes64(Vk->D(2 * i))); 322 } 323 } 324 325 DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD) 326 DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD) 327 DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD) 328 329 void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) 330 { 331 int i; 332 VReg *Vd = (VReg *)vd; 333 VReg *Vj = (VReg *)vj; 334 VReg *Vk = (VReg *)vk; 335 int oprsz = simd_oprsz(desc); 336 337 for (i = 0; i < oprsz / 16; i++) { 338 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), 339 int128_makes64(Vk->D(2 * i + 1))); 340 } 341 } 342 343 DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD) 344 DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD) 345 DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD) 346 347 #define DO_3OP(NAME, BIT, E, DO_OP) \ 348 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 349 { \ 350 int i; \ 351 VReg *Vd = (VReg *)vd; \ 352 VReg *Vj = (VReg *)vj; \ 353 VReg *Vk = (VReg *)vk; \ 354 int oprsz = simd_oprsz(desc); \ 355 \ 356 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 357 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ 358 } \ 359 } 360 361 DO_3OP(vavg_b, 8, B, DO_VAVG) 362 DO_3OP(vavg_h, 16, H, DO_VAVG) 363 DO_3OP(vavg_w, 32, W, DO_VAVG) 364 DO_3OP(vavg_d, 64, D, DO_VAVG) 365 DO_3OP(vavgr_b, 8, B, DO_VAVGR) 366 DO_3OP(vavgr_h, 16, H, DO_VAVGR) 367 DO_3OP(vavgr_w, 32, W, DO_VAVGR) 368 DO_3OP(vavgr_d, 64, D, DO_VAVGR) 369 DO_3OP(vavg_bu, 8, UB, DO_VAVG) 370 DO_3OP(vavg_hu, 16, UH, DO_VAVG) 371 DO_3OP(vavg_wu, 32, UW, DO_VAVG) 372 DO_3OP(vavg_du, 64, UD, DO_VAVG) 373 DO_3OP(vavgr_bu, 8, UB, DO_VAVGR) 374 DO_3OP(vavgr_hu, 16, UH, DO_VAVGR) 375 DO_3OP(vavgr_wu, 32, UW, DO_VAVGR) 
376 DO_3OP(vavgr_du, 64, UD, DO_VAVGR) 377 378 DO_3OP(vabsd_b, 8, B, DO_VABSD) 379 DO_3OP(vabsd_h, 16, H, DO_VABSD) 380 DO_3OP(vabsd_w, 32, W, DO_VABSD) 381 DO_3OP(vabsd_d, 64, D, DO_VABSD) 382 DO_3OP(vabsd_bu, 8, UB, DO_VABSD) 383 DO_3OP(vabsd_hu, 16, UH, DO_VABSD) 384 DO_3OP(vabsd_wu, 32, UW, DO_VABSD) 385 DO_3OP(vabsd_du, 64, UD, DO_VABSD) 386 387 #define DO_VADDA(NAME, BIT, E) \ 388 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 389 { \ 390 int i; \ 391 VReg *Vd = (VReg *)vd; \ 392 VReg *Vj = (VReg *)vj; \ 393 VReg *Vk = (VReg *)vk; \ 394 int oprsz = simd_oprsz(desc); \ 395 \ 396 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 397 Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \ 398 } \ 399 } 400 401 DO_VADDA(vadda_b, 8, B) 402 DO_VADDA(vadda_h, 16, H) 403 DO_VADDA(vadda_w, 32, W) 404 DO_VADDA(vadda_d, 64, D) 405 406 #define VMINMAXI(NAME, BIT, E, DO_OP) \ 407 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 408 { \ 409 int i; \ 410 VReg *Vd = (VReg *)vd; \ 411 VReg *Vj = (VReg *)vj; \ 412 typedef __typeof(Vd->E(0)) TD; \ 413 int oprsz = simd_oprsz(desc); \ 414 \ 415 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 416 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ 417 } \ 418 } 419 420 VMINMAXI(vmini_b, 8, B, DO_MIN) 421 VMINMAXI(vmini_h, 16, H, DO_MIN) 422 VMINMAXI(vmini_w, 32, W, DO_MIN) 423 VMINMAXI(vmini_d, 64, D, DO_MIN) 424 VMINMAXI(vmaxi_b, 8, B, DO_MAX) 425 VMINMAXI(vmaxi_h, 16, H, DO_MAX) 426 VMINMAXI(vmaxi_w, 32, W, DO_MAX) 427 VMINMAXI(vmaxi_d, 64, D, DO_MAX) 428 VMINMAXI(vmini_bu, 8, UB, DO_MIN) 429 VMINMAXI(vmini_hu, 16, UH, DO_MIN) 430 VMINMAXI(vmini_wu, 32, UW, DO_MIN) 431 VMINMAXI(vmini_du, 64, UD, DO_MIN) 432 VMINMAXI(vmaxi_bu, 8, UB, DO_MAX) 433 VMINMAXI(vmaxi_hu, 16, UH, DO_MAX) 434 VMINMAXI(vmaxi_wu, 32, UW, DO_MAX) 435 VMINMAXI(vmaxi_du, 64, UD, DO_MAX) 436 437 #define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \ 438 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 439 { \ 440 int i; \ 441 VReg *Vd = (VReg *)vd; \ 442 VReg *Vj = (VReg *)vj; \ 443 VReg *Vk = (VReg *)vk; \ 444 typedef __typeof(Vd->E1(0)) T; \ 445 int oprsz = simd_oprsz(desc); \ 446 \ 447 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 448 Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT; \ 449 } \ 450 } 451 452 void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc) 453 { 454 int i; 455 uint64_t l, h; 456 VReg *Vd = (VReg *)vd; 457 VReg *Vj = (VReg *)vj; 458 VReg *Vk = (VReg *)vk; 459 int oprsz = simd_oprsz(desc); 460 461 for (i = 0; i < oprsz / 8; i++) { 462 muls64(&l, &h, Vj->D(i), Vk->D(i)); 463 Vd->D(i) = h; 464 } 465 } 466 467 DO_VMUH(vmuh_b, 8, H, B, DO_MUH) 468 DO_VMUH(vmuh_h, 16, W, H, DO_MUH) 469 DO_VMUH(vmuh_w, 32, D, W, DO_MUH) 470 471 void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc) 472 { 473 int i; 474 uint64_t l, h; 475 VReg *Vd = (VReg *)vd; 476 VReg *Vj = (VReg *)vj; 477 VReg *Vk = (VReg *)vk; 478 int oprsz = simd_oprsz(desc); 479 480 for (i = 0; i < oprsz / 8; i++) { 481 mulu64(&l, &h, Vj->D(i), Vk->D(i)); 482 Vd->D(i) = h; 483 } 484 } 485 486 DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH) 487 DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH) 488 DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH) 489 490 DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL) 491 DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL) 492 DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL) 493 494 DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL) 495 DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL) 496 DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL) 497 498 DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL) 499 DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL) 500 
DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL) 501 502 DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL) 503 DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL) 504 DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL) 505 506 DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 507 DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 508 DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 509 510 DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 511 DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 512 DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 513 514 #define VMADDSUB(NAME, BIT, E, DO_OP) \ 515 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 516 { \ 517 int i; \ 518 VReg *Vd = (VReg *)vd; \ 519 VReg *Vj = (VReg *)vj; \ 520 VReg *Vk = (VReg *)vk; \ 521 int oprsz = simd_oprsz(desc); \ 522 \ 523 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 524 Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \ 525 } \ 526 } 527 528 VMADDSUB(vmadd_b, 8, B, DO_MADD) 529 VMADDSUB(vmadd_h, 16, H, DO_MADD) 530 VMADDSUB(vmadd_w, 32, W, DO_MADD) 531 VMADDSUB(vmadd_d, 64, D, DO_MADD) 532 VMADDSUB(vmsub_b, 8, B, DO_MSUB) 533 VMADDSUB(vmsub_h, 16, H, DO_MSUB) 534 VMADDSUB(vmsub_w, 32, W, DO_MSUB) 535 VMADDSUB(vmsub_d, 64, D, DO_MSUB) 536 537 #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \ 538 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 539 { \ 540 int i; \ 541 VReg *Vd = (VReg *)vd; \ 542 VReg *Vj = (VReg *)vj; \ 543 VReg *Vk = (VReg *)vk; \ 544 typedef __typeof(Vd->E1(0)) TD; \ 545 int oprsz = simd_oprsz(desc); \ 546 \ 547 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 548 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \ 549 } \ 550 } 551 552 VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL) 553 VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL) 554 VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL) 555 VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL) 556 VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL) 557 VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL) 558 559 #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \ 560 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 561 { \ 562 int i; \ 563 VReg *Vd = (VReg *)vd; \ 564 VReg *Vj = (VReg *)vj; \ 565 VReg *Vk = (VReg *)vk; \ 566 typedef __typeof(Vd->E1(0)) TD; \ 567 int oprsz = simd_oprsz(desc); \ 568 \ 569 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 570 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \ 571 (TD)Vk->E2(2 * i + 1)); \ 572 } \ 573 } 574 575 VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL) 576 VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL) 577 VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL) 578 VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL) 579 VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL) 580 VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL) 581 582 #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 583 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 584 { \ 585 int i; \ 586 VReg *Vd = (VReg *)vd; \ 587 VReg *Vj = (VReg *)vj; \ 588 VReg *Vk = (VReg *)vk; \ 589 typedef __typeof(Vd->ES1(0)) TS1; \ 590 typedef __typeof(Vd->EU1(0)) TU1; \ 591 int oprsz = simd_oprsz(desc); \ 592 \ 593 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 594 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \ 595 (TS1)Vk->ES2(2 * i)); \ 596 } \ 597 } 598 599 VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) 600 VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) 601 VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) 602 603 #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ 604 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 605 { \ 606 int i; \ 607 VReg *Vd = (VReg *)vd; \ 
608 VReg *Vj = (VReg *)vj; \ 609 VReg *Vk = (VReg *)vk; \ 610 typedef __typeof(Vd->ES1(0)) TS1; \ 611 typedef __typeof(Vd->EU1(0)) TU1; \ 612 int oprsz = simd_oprsz(desc); \ 613 \ 614 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 615 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \ 616 (TS1)Vk->ES2(2 * i + 1)); \ 617 } \ 618 } 619 620 VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) 621 VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) 622 VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) 623 624 #define VDIV(NAME, BIT, E, DO_OP) \ 625 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 626 { \ 627 int i; \ 628 VReg *Vd = (VReg *)vd; \ 629 VReg *Vj = (VReg *)vj; \ 630 VReg *Vk = (VReg *)vk; \ 631 int oprsz = simd_oprsz(desc); \ 632 \ 633 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 634 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ 635 } \ 636 } 637 638 VDIV(vdiv_b, 8, B, DO_DIV) 639 VDIV(vdiv_h, 16, H, DO_DIV) 640 VDIV(vdiv_w, 32, W, DO_DIV) 641 VDIV(vdiv_d, 64, D, DO_DIV) 642 VDIV(vdiv_bu, 8, UB, DO_DIVU) 643 VDIV(vdiv_hu, 16, UH, DO_DIVU) 644 VDIV(vdiv_wu, 32, UW, DO_DIVU) 645 VDIV(vdiv_du, 64, UD, DO_DIVU) 646 VDIV(vmod_b, 8, B, DO_REM) 647 VDIV(vmod_h, 16, H, DO_REM) 648 VDIV(vmod_w, 32, W, DO_REM) 649 VDIV(vmod_d, 64, D, DO_REM) 650 VDIV(vmod_bu, 8, UB, DO_REMU) 651 VDIV(vmod_hu, 16, UH, DO_REMU) 652 VDIV(vmod_wu, 32, UW, DO_REMU) 653 VDIV(vmod_du, 64, UD, DO_REMU) 654 655 #define VSAT_S(NAME, BIT, E) \ 656 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 657 { \ 658 int i; \ 659 VReg *Vd = (VReg *)vd; \ 660 VReg *Vj = (VReg *)vj; \ 661 typedef __typeof(Vd->E(0)) TD; \ 662 int oprsz = simd_oprsz(desc); \ 663 \ 664 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 665 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \ 666 Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \ 667 } \ 668 } 669 670 VSAT_S(vsat_b, 8, B) 671 VSAT_S(vsat_h, 16, H) 672 VSAT_S(vsat_w, 32, W) 673 VSAT_S(vsat_d, 64, D) 674 675 #define VSAT_U(NAME, BIT, E) \ 676 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ 677 { \ 678 int i; \ 679 VReg *Vd = (VReg *)vd; \ 680 VReg *Vj = (VReg *)vj; \ 681 typedef __typeof(Vd->E(0)) TD; \ 682 int oprsz = simd_oprsz(desc); \ 683 \ 684 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 685 Vd->E(i) = Vj->E(i) > (TD)max ? 
(TD)max : Vj->E(i); \ 686 } \ 687 } 688 689 VSAT_U(vsat_bu, 8, UB) 690 VSAT_U(vsat_hu, 16, UH) 691 VSAT_U(vsat_wu, 32, UW) 692 VSAT_U(vsat_du, 64, UD) 693 694 #define VEXTH(NAME, BIT, E1, E2) \ 695 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 696 { \ 697 int i, j, ofs; \ 698 VReg *Vd = (VReg *)vd; \ 699 VReg *Vj = (VReg *)vj; \ 700 int oprsz = simd_oprsz(desc); \ 701 \ 702 ofs = LSX_LEN / BIT; \ 703 for (i = 0; i < oprsz / 16; i++) { \ 704 for (j = 0; j < ofs; j++) { \ 705 Vd->E1(j + i * ofs) = Vj->E2(j + ofs + ofs * 2 * i); \ 706 } \ 707 } \ 708 } 709 710 void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc) 711 { 712 int i; 713 VReg *Vd = (VReg *)vd; 714 VReg *Vj = (VReg *)vj; 715 int oprsz = simd_oprsz(desc); 716 717 for (i = 0; i < oprsz / 16; i++) { 718 Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1)); 719 } 720 } 721 722 void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc) 723 { 724 int i; 725 VReg *Vd = (VReg *)vd; 726 VReg *Vj = (VReg *)vj; 727 int oprsz = simd_oprsz(desc); 728 729 for (i = 0; i < oprsz / 16; i++) { 730 Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1)); 731 } 732 } 733 734 VEXTH(vexth_h_b, 16, H, B) 735 VEXTH(vexth_w_h, 32, W, H) 736 VEXTH(vexth_d_w, 64, D, W) 737 VEXTH(vexth_hu_bu, 16, UH, UB) 738 VEXTH(vexth_wu_hu, 32, UW, UH) 739 VEXTH(vexth_du_wu, 64, UD, UW) 740 741 #define VEXT2XV(NAME, BIT, E1, E2) \ 742 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 743 { \ 744 int i; \ 745 VReg temp = {}; \ 746 VReg *Vd = (VReg *)vd; \ 747 VReg *Vj = (VReg *)vj; \ 748 int oprsz = simd_oprsz(desc); \ 749 \ 750 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 751 temp.E1(i) = Vj->E2(i); \ 752 } \ 753 *Vd = temp; \ 754 } 755 756 VEXT2XV(vext2xv_h_b, 16, H, B) 757 VEXT2XV(vext2xv_w_b, 32, W, B) 758 VEXT2XV(vext2xv_d_b, 64, D, B) 759 VEXT2XV(vext2xv_w_h, 32, W, H) 760 VEXT2XV(vext2xv_d_h, 64, D, H) 761 VEXT2XV(vext2xv_d_w, 64, D, W) 762 VEXT2XV(vext2xv_hu_bu, 16, UH, UB) 763 VEXT2XV(vext2xv_wu_bu, 32, UW, UB) 764 VEXT2XV(vext2xv_du_bu, 64, UD, UB) 765 VEXT2XV(vext2xv_wu_hu, 32, UW, UH) 766 VEXT2XV(vext2xv_du_hu, 64, UD, UH) 767 VEXT2XV(vext2xv_du_wu, 64, UD, UW) 768 769 DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV) 770 DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV) 771 DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV) 772 DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV) 773 774 static uint64_t do_vmskltz_b(int64_t val) 775 { 776 uint64_t m = 0x8080808080808080ULL; 777 uint64_t c = val & m; 778 c |= c << 7; 779 c |= c << 14; 780 c |= c << 28; 781 return c >> 56; 782 } 783 784 void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc) 785 { 786 int i; 787 uint16_t temp = 0; 788 VReg *Vd = (VReg *)vd; 789 VReg *Vj = (VReg *)vj; 790 int oprsz = simd_oprsz(desc); 791 792 for (i = 0; i < oprsz / 16; i++) { 793 temp = 0; 794 temp = do_vmskltz_b(Vj->D(2 * i)); 795 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 796 Vd->D(2 * i) = temp; 797 Vd->D(2 * i + 1) = 0; 798 } 799 } 800 801 static uint64_t do_vmskltz_h(int64_t val) 802 { 803 uint64_t m = 0x8000800080008000ULL; 804 uint64_t c = val & m; 805 c |= c << 15; 806 c |= c << 30; 807 return c >> 60; 808 } 809 810 void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc) 811 { 812 int i; 813 uint16_t temp = 0; 814 VReg *Vd = (VReg *)vd; 815 VReg *Vj = (VReg *)vj; 816 int oprsz = simd_oprsz(desc); 817 818 for (i = 0; i < oprsz / 16; i++) { 819 temp = 0; 820 temp = do_vmskltz_h(Vj->D(2 * i)); 821 temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4); 822 Vd->D(2 * i) = temp; 823 Vd->D(2 * i + 1) = 0; 824 } 825 } 826 827 static uint64_t do_vmskltz_w(int64_t val) 828 { 829 
uint64_t m = 0x8000000080000000ULL; 830 uint64_t c = val & m; 831 c |= c << 31; 832 return c >> 62; 833 } 834 835 void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc) 836 { 837 int i; 838 uint16_t temp = 0; 839 VReg *Vd = (VReg *)vd; 840 VReg *Vj = (VReg *)vj; 841 int oprsz = simd_oprsz(desc); 842 843 for (i = 0; i < oprsz / 16; i++) { 844 temp = 0; 845 temp = do_vmskltz_w(Vj->D(2 * i)); 846 temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2); 847 Vd->D(2 * i) = temp; 848 Vd->D(2 * i + 1) = 0; 849 } 850 } 851 852 static uint64_t do_vmskltz_d(int64_t val) 853 { 854 return (uint64_t)val >> 63; 855 } 856 void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc) 857 { 858 int i; 859 uint16_t temp = 0; 860 VReg *Vd = (VReg *)vd; 861 VReg *Vj = (VReg *)vj; 862 int oprsz = simd_oprsz(desc); 863 864 for (i = 0; i < oprsz / 16; i++) { 865 temp = 0; 866 temp = do_vmskltz_d(Vj->D(2 * i)); 867 temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1); 868 Vd->D(2 * i) = temp; 869 Vd->D(2 * i + 1) = 0; 870 } 871 } 872 873 void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc) 874 { 875 int i; 876 uint16_t temp = 0; 877 VReg *Vd = (VReg *)vd; 878 VReg *Vj = (VReg *)vj; 879 int oprsz = simd_oprsz(desc); 880 881 for (i = 0; i < oprsz / 16; i++) { 882 temp = 0; 883 temp = do_vmskltz_b(Vj->D(2 * i)); 884 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); 885 Vd->D(2 * i) = (uint16_t)(~temp); 886 Vd->D(2 * i + 1) = 0; 887 } 888 } 889 890 static uint64_t do_vmskez_b(uint64_t a) 891 { 892 uint64_t m = 0x7f7f7f7f7f7f7f7fULL; 893 uint64_t c = ~(((a & m) + m) | a | m); 894 c |= c << 7; 895 c |= c << 14; 896 c |= c << 28; 897 return c >> 56; 898 } 899 900 void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc) 901 { 902 int i; 903 uint16_t temp = 0; 904 VReg *Vd = (VReg *)vd; 905 VReg *Vj = (VReg *)vj; 906 int oprsz = simd_oprsz(desc); 907 908 for (i = 0; i < oprsz / 16; i++) { 909 temp = 0; 910 temp = do_vmskez_b(Vj->D(2 * i)); 911 temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8); 912 Vd->D(2 * i) = (uint16_t)(~temp); 913 Vd->D(2 * i + 1) = 0; 914 } 915 } 916 917 void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 918 { 919 int i; 920 VReg *Vd = (VReg *)vd; 921 VReg *Vj = (VReg *)vj; 922 923 for (i = 0; i < simd_oprsz(desc); i++) { 924 Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm); 925 } 926 } 927 928 #define VSLLWIL(NAME, BIT, E1, E2) \ 929 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 930 { \ 931 int i, j, ofs; \ 932 VReg temp = {}; \ 933 VReg *Vd = (VReg *)vd; \ 934 VReg *Vj = (VReg *)vj; \ 935 int oprsz = simd_oprsz(desc); \ 936 typedef __typeof(temp.E1(0)) TD; \ 937 \ 938 ofs = LSX_LEN / BIT; \ 939 for (i = 0; i < oprsz / 16; i++) { \ 940 for (j = 0; j < ofs; j++) { \ 941 temp.E1(j + ofs * i) = (TD)Vj->E2(j + ofs * 2 * i) << (imm % BIT); \ 942 } \ 943 } \ 944 *Vd = temp; \ 945 } 946 947 948 void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc) 949 { 950 int i; 951 VReg *Vd = (VReg *)vd; 952 VReg *Vj = (VReg *)vj; 953 int oprsz = simd_oprsz(desc); 954 955 for (i = 0; i < oprsz / 16; i++) { 956 Vd->Q(i) = int128_makes64(Vj->D(2 * i)); 957 } 958 } 959 960 void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc) 961 { 962 int i; 963 VReg *Vd = (VReg *)vd; 964 VReg *Vj = (VReg *)vj; 965 int oprsz = simd_oprsz(desc); 966 967 for (i = 0; i < oprsz / 16; i++) { 968 Vd->Q(i) = int128_make64(Vj->UD(2 * i)); 969 } 970 } 971 972 VSLLWIL(vsllwil_h_b, 16, H, B) 973 VSLLWIL(vsllwil_w_h, 32, W, H) 974 VSLLWIL(vsllwil_d_w, 64, D, W) 975 VSLLWIL(vsllwil_hu_bu, 16, UH, UB) 976 VSLLWIL(vsllwil_wu_hu, 32, 
UW, UH) 977 VSLLWIL(vsllwil_du_wu, 64, UD, UW) 978 979 #define do_vsrlr(E, T) \ 980 static T do_vsrlr_ ##E(T s1, int sh) \ 981 { \ 982 if (sh == 0) { \ 983 return s1; \ 984 } else { \ 985 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 986 } \ 987 } 988 989 do_vsrlr(B, uint8_t) 990 do_vsrlr(H, uint16_t) 991 do_vsrlr(W, uint32_t) 992 do_vsrlr(D, uint64_t) 993 994 #define VSRLR(NAME, BIT, T, E) \ 995 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 996 { \ 997 int i; \ 998 VReg *Vd = (VReg *)vd; \ 999 VReg *Vj = (VReg *)vj; \ 1000 VReg *Vk = (VReg *)vk; \ 1001 int oprsz = simd_oprsz(desc); \ 1002 \ 1003 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1004 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1005 } \ 1006 } 1007 1008 VSRLR(vsrlr_b, 8, uint8_t, B) 1009 VSRLR(vsrlr_h, 16, uint16_t, H) 1010 VSRLR(vsrlr_w, 32, uint32_t, W) 1011 VSRLR(vsrlr_d, 64, uint64_t, D) 1012 1013 #define VSRLRI(NAME, BIT, E) \ 1014 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1015 { \ 1016 int i; \ 1017 VReg *Vd = (VReg *)vd; \ 1018 VReg *Vj = (VReg *)vj; \ 1019 int oprsz = simd_oprsz(desc); \ 1020 \ 1021 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1022 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \ 1023 } \ 1024 } 1025 1026 VSRLRI(vsrlri_b, 8, B) 1027 VSRLRI(vsrlri_h, 16, H) 1028 VSRLRI(vsrlri_w, 32, W) 1029 VSRLRI(vsrlri_d, 64, D) 1030 1031 #define do_vsrar(E, T) \ 1032 static T do_vsrar_ ##E(T s1, int sh) \ 1033 { \ 1034 if (sh == 0) { \ 1035 return s1; \ 1036 } else { \ 1037 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \ 1038 } \ 1039 } 1040 1041 do_vsrar(B, int8_t) 1042 do_vsrar(H, int16_t) 1043 do_vsrar(W, int32_t) 1044 do_vsrar(D, int64_t) 1045 1046 #define VSRAR(NAME, BIT, T, E) \ 1047 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1048 { \ 1049 int i; \ 1050 VReg *Vd = (VReg *)vd; \ 1051 VReg *Vj = (VReg *)vj; \ 1052 VReg *Vk = (VReg *)vk; \ 1053 int oprsz = simd_oprsz(desc); \ 1054 \ 1055 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1056 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ 1057 } \ 1058 } 1059 1060 VSRAR(vsrar_b, 8, uint8_t, B) 1061 VSRAR(vsrar_h, 16, uint16_t, H) 1062 VSRAR(vsrar_w, 32, uint32_t, W) 1063 VSRAR(vsrar_d, 64, uint64_t, D) 1064 1065 #define VSRARI(NAME, BIT, E) \ 1066 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1067 { \ 1068 int i; \ 1069 VReg *Vd = (VReg *)vd; \ 1070 VReg *Vj = (VReg *)vj; \ 1071 int oprsz = simd_oprsz(desc); \ 1072 \ 1073 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 1074 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \ 1075 } \ 1076 } 1077 1078 VSRARI(vsrari_b, 8, B) 1079 VSRARI(vsrari_h, 16, H) 1080 VSRARI(vsrari_w, 32, W) 1081 VSRARI(vsrari_d, 64, D) 1082 1083 #define VSRLN(NAME, BIT, E1, E2) \ 1084 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1085 { \ 1086 int i, j, ofs; \ 1087 VReg *Vd = (VReg *)vd; \ 1088 VReg *Vj = (VReg *)vj; \ 1089 VReg *Vk = (VReg *)vk; \ 1090 int oprsz = simd_oprsz(desc); \ 1091 \ 1092 ofs = LSX_LEN / BIT; \ 1093 for (i = 0; i < oprsz / 16; i++) { \ 1094 for (j = 0; j < ofs; j++) { \ 1095 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 1096 Vk->E2(j + ofs * i) % BIT); \ 1097 } \ 1098 Vd->D(2 * i + 1) = 0; \ 1099 } \ 1100 } 1101 1102 VSRLN(vsrln_b_h, 16, B, UH) 1103 VSRLN(vsrln_h_w, 32, H, UW) 1104 VSRLN(vsrln_w_d, 64, W, UD) 1105 1106 #define VSRAN(NAME, BIT, E1, E2, E3) \ 1107 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1108 { \ 1109 int i, j, ofs; \ 1110 VReg *Vd = (VReg *)vd; \ 1111 VReg *Vj 
= (VReg *)vj; \ 1112 VReg *Vk = (VReg *)vk; \ 1113 int oprsz = simd_oprsz(desc); \ 1114 \ 1115 ofs = LSX_LEN / BIT; \ 1116 for (i = 0; i < oprsz / 16; i++) { \ 1117 for (j = 0; j < ofs; j++) { \ 1118 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ 1119 Vk->E3(j + ofs * i) % BIT); \ 1120 } \ 1121 Vd->D(2 * i + 1) = 0; \ 1122 } \ 1123 } 1124 1125 VSRAN(vsran_b_h, 16, B, H, UH) 1126 VSRAN(vsran_h_w, 32, H, W, UW) 1127 VSRAN(vsran_w_d, 64, W, D, UD) 1128 1129 #define VSRLNI(NAME, BIT, E1, E2) \ 1130 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1131 { \ 1132 int i, j, ofs; \ 1133 VReg temp = {}; \ 1134 VReg *Vd = (VReg *)vd; \ 1135 VReg *Vj = (VReg *)vj; \ 1136 int oprsz = simd_oprsz(desc); \ 1137 \ 1138 ofs = LSX_LEN / BIT; \ 1139 for (i = 0; i < oprsz / 16; i++) { \ 1140 for (j = 0; j < ofs; j++) { \ 1141 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 1142 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 1143 imm); \ 1144 } \ 1145 } \ 1146 *Vd = temp; \ 1147 } 1148 1149 void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1150 { 1151 int i; 1152 VReg temp = {}; 1153 VReg *Vd = (VReg *)vd; 1154 VReg *Vj = (VReg *)vj; 1155 1156 for (i = 0; i < 2; i++) { 1157 temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128)); 1158 temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128)); 1159 } 1160 *Vd = temp; 1161 } 1162 1163 VSRLNI(vsrlni_b_h, 16, B, UH) 1164 VSRLNI(vsrlni_h_w, 32, H, UW) 1165 VSRLNI(vsrlni_w_d, 64, W, UD) 1166 1167 #define VSRANI(NAME, BIT, E1, E2) \ 1168 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1169 { \ 1170 int i, j, ofs; \ 1171 VReg temp = {}; \ 1172 VReg *Vd = (VReg *)vd; \ 1173 VReg *Vj = (VReg *)vj; \ 1174 int oprsz = simd_oprsz(desc); \ 1175 \ 1176 ofs = LSX_LEN / BIT; \ 1177 for (i = 0; i < oprsz / 16; i++) { \ 1178 for (j = 0; j < ofs; j++) { \ 1179 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \ 1180 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \ 1181 imm); \ 1182 } \ 1183 } \ 1184 *Vd = temp; \ 1185 } 1186 1187 void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1188 { 1189 int i; 1190 VReg temp = {}; 1191 VReg *Vd = (VReg *)vd; 1192 VReg *Vj = (VReg *)vj; 1193 1194 for (i = 0; i < 2; i++) { 1195 temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128)); 1196 temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128)); 1197 } 1198 *Vd = temp; 1199 } 1200 1201 VSRANI(vsrani_b_h, 16, B, H) 1202 VSRANI(vsrani_h_w, 32, H, W) 1203 VSRANI(vsrani_w_d, 64, W, D) 1204 1205 #define VSRLRN(NAME, BIT, E1, E2, E3) \ 1206 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1207 { \ 1208 int i, j, ofs; \ 1209 VReg *Vd = (VReg *)vd; \ 1210 VReg *Vj = (VReg *)vj; \ 1211 VReg *Vk = (VReg *)vk; \ 1212 int oprsz = simd_oprsz(desc); \ 1213 \ 1214 ofs = LSX_LEN / BIT; \ 1215 for (i = 0; i < oprsz / 16; i++) { \ 1216 for (j = 0; j < ofs; j++) { \ 1217 Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i), \ 1218 Vk->E3(j + ofs * i) % BIT); \ 1219 } \ 1220 Vd->D(2 * i + 1) = 0; \ 1221 } \ 1222 } 1223 1224 VSRLRN(vsrlrn_b_h, 16, B, H, UH) 1225 VSRLRN(vsrlrn_h_w, 32, H, W, UW) 1226 VSRLRN(vsrlrn_w_d, 64, W, D, UD) 1227 1228 #define VSRARN(NAME, BIT, E1, E2, E3) \ 1229 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1230 { \ 1231 int i, j, ofs; \ 1232 VReg *Vd = (VReg *)vd; \ 1233 VReg *Vj = (VReg *)vj; \ 1234 VReg *Vk = (VReg *)vk; \ 1235 int oprsz = 
simd_oprsz(desc); \ 1236 \ 1237 ofs = LSX_LEN / BIT; \ 1238 for (i = 0; i < oprsz / 16; i++) { \ 1239 for (j = 0; j < ofs; j++) { \ 1240 Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), \ 1241 Vk->E3(j + ofs * i) % BIT); \ 1242 } \ 1243 Vd->D(2 * i + 1) = 0; \ 1244 } \ 1245 } 1246 1247 VSRARN(vsrarn_b_h, 16, B, H, UH) 1248 VSRARN(vsrarn_h_w, 32, H, W, UW) 1249 VSRARN(vsrarn_w_d, 64, W, D, UD) 1250 1251 #define VSRLRNI(NAME, BIT, E1, E2) \ 1252 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1253 { \ 1254 int i, j, ofs; \ 1255 VReg temp = {}; \ 1256 VReg *Vd = (VReg *)vd; \ 1257 VReg *Vj = (VReg *)vj; \ 1258 int oprsz = simd_oprsz(desc); \ 1259 \ 1260 ofs = LSX_LEN / BIT; \ 1261 for (i = 0; i < oprsz / 16; i++) { \ 1262 for (j = 0; j < ofs; j++) { \ 1263 temp.E1(j + ofs * 2 * i) = do_vsrlr_ ## E2(Vj->E2(j + ofs * i), imm); \ 1264 temp.E1(j + ofs * (2 * i + 1)) = do_vsrlr_ ## E2(Vd->E2(j + ofs * i), \ 1265 imm); \ 1266 } \ 1267 } \ 1268 *Vd = temp; \ 1269 } 1270 1271 void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1272 { 1273 int i; 1274 VReg temp = {}; 1275 VReg *Vd = (VReg *)vd; 1276 VReg *Vj = (VReg *)vj; 1277 Int128 r[4]; 1278 int oprsz = simd_oprsz(desc); 1279 1280 for (i = 0; i < oprsz / 16; i++) { 1281 if (imm == 0) { 1282 temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1283 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1284 } else { 1285 r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)), 1286 int128_one()); 1287 r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)), 1288 int128_one()); 1289 temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i), 1290 imm), r[2 * i])); 1291 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i), 1292 imm), r[ 2 * i + 1])); 1293 } 1294 } 1295 *Vd = temp; 1296 } 1297 1298 VSRLRNI(vsrlrni_b_h, 16, B, H) 1299 VSRLRNI(vsrlrni_h_w, 32, H, W) 1300 VSRLRNI(vsrlrni_w_d, 64, W, D) 1301 1302 #define VSRARNI(NAME, BIT, E1, E2) \ 1303 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1304 { \ 1305 int i, j, ofs; \ 1306 VReg temp = {}; \ 1307 VReg *Vd = (VReg *)vd; \ 1308 VReg *Vj = (VReg *)vj; \ 1309 int oprsz = simd_oprsz(desc); \ 1310 \ 1311 ofs = LSX_LEN / BIT; \ 1312 for (i = 0; i < oprsz / 16; i++) { \ 1313 for (j = 0; j < ofs; j++) { \ 1314 temp.E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), imm); \ 1315 temp.E1(j + ofs * (2 * i + 1)) = do_vsrar_ ## E2(Vd->E2(j + ofs * i), \ 1316 imm); \ 1317 } \ 1318 } \ 1319 *Vd = temp; \ 1320 } 1321 1322 void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1323 { 1324 int i; 1325 VReg temp = {}; 1326 VReg *Vd = (VReg *)vd; 1327 VReg *Vj = (VReg *)vj; 1328 Int128 r[4]; 1329 int oprsz = simd_oprsz(desc); 1330 1331 for (i = 0; i < oprsz / 16; i++) { 1332 if (imm == 0) { 1333 temp.D(2 * i) = int128_getlo(Vj->Q(i)); 1334 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); 1335 } else { 1336 r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)), 1337 int128_one()); 1338 r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)), 1339 int128_one()); 1340 temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i), 1341 imm), r[2 * i])); 1342 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i), 1343 imm), r[2 * i + 1])); 1344 } 1345 } 1346 *Vd = temp; 1347 } 1348 1349 VSRARNI(vsrarni_b_h, 16, B, H) 1350 VSRARNI(vsrarni_h_w, 32, H, W) 1351 VSRARNI(vsrarni_w_d, 64, W, D) 1352 1353 #define SSRLNS(NAME, T1, T2, T3) \ 1354 static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \ 1355 
{ \ 1356 T1 shft_res; \ 1357 if (sa == 0) { \ 1358 shft_res = e2; \ 1359 } else { \ 1360 shft_res = (((T1)e2) >> sa); \ 1361 } \ 1362 T3 mask; \ 1363 mask = (1ull << sh) -1; \ 1364 if (shft_res > mask) { \ 1365 return mask; \ 1366 } else { \ 1367 return shft_res; \ 1368 } \ 1369 } 1370 1371 SSRLNS(B, uint16_t, int16_t, uint8_t) 1372 SSRLNS(H, uint32_t, int32_t, uint16_t) 1373 SSRLNS(W, uint64_t, int64_t, uint32_t) 1374 1375 #define VSSRLN(NAME, BIT, E1, E2, E3) \ 1376 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1377 { \ 1378 int i, j, ofs; \ 1379 VReg *Vd = (VReg *)vd; \ 1380 VReg *Vj = (VReg *)vj; \ 1381 VReg *Vk = (VReg *)vk; \ 1382 int oprsz = simd_oprsz(desc); \ 1383 \ 1384 ofs = LSX_LEN / BIT; \ 1385 for (i = 0; i < oprsz / 16; i++) { \ 1386 for (j = 0; j < ofs; j++) { \ 1387 Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ 1388 Vk->E3(j + ofs * i) % BIT, \ 1389 BIT / 2 - 1); \ 1390 } \ 1391 Vd->D(2 * i + 1) = 0; \ 1392 } \ 1393 } 1394 1395 VSSRLN(vssrln_b_h, 16, B, H, UH) 1396 VSSRLN(vssrln_h_w, 32, H, W, UW) 1397 VSSRLN(vssrln_w_d, 64, W, D, UD) 1398 1399 #define SSRANS(E, T1, T2) \ 1400 static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \ 1401 { \ 1402 T1 shft_res; \ 1403 if (sa == 0) { \ 1404 shft_res = e2; \ 1405 } else { \ 1406 shft_res = e2 >> sa; \ 1407 } \ 1408 T2 mask; \ 1409 mask = (1ll << sh) - 1; \ 1410 if (shft_res > mask) { \ 1411 return mask; \ 1412 } else if (shft_res < -(mask + 1)) { \ 1413 return ~mask; \ 1414 } else { \ 1415 return shft_res; \ 1416 } \ 1417 } 1418 1419 SSRANS(B, int16_t, int8_t) 1420 SSRANS(H, int32_t, int16_t) 1421 SSRANS(W, int64_t, int32_t) 1422 1423 #define VSSRAN(NAME, BIT, E1, E2, E3) \ 1424 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1425 { \ 1426 int i, j, ofs; \ 1427 VReg *Vd = (VReg *)vd; \ 1428 VReg *Vj = (VReg *)vj; \ 1429 VReg *Vk = (VReg *)vk; \ 1430 int oprsz = simd_oprsz(desc); \ 1431 \ 1432 ofs = LSX_LEN / BIT; \ 1433 for (i = 0; i < oprsz / 16; i++) { \ 1434 for (j = 0; j < ofs; j++) { \ 1435 Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 1436 Vk->E3(j + ofs * i) % BIT, \ 1437 BIT / 2 - 1); \ 1438 } \ 1439 Vd->D(2 * i + 1) = 0; \ 1440 } \ 1441 } 1442 1443 VSSRAN(vssran_b_h, 16, B, H, UH) 1444 VSSRAN(vssran_h_w, 32, H, W, UW) 1445 VSSRAN(vssran_w_d, 64, W, D, UD) 1446 1447 #define SSRLNU(E, T1, T2, T3) \ 1448 static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \ 1449 { \ 1450 T1 shft_res; \ 1451 if (sa == 0) { \ 1452 shft_res = e2; \ 1453 } else { \ 1454 shft_res = (((T1)e2) >> sa); \ 1455 } \ 1456 T2 mask; \ 1457 mask = (1ull << sh) - 1; \ 1458 if (shft_res > mask) { \ 1459 return mask; \ 1460 } else { \ 1461 return shft_res; \ 1462 } \ 1463 } 1464 1465 SSRLNU(B, uint16_t, uint8_t, int16_t) 1466 SSRLNU(H, uint32_t, uint16_t, int32_t) 1467 SSRLNU(W, uint64_t, uint32_t, int64_t) 1468 1469 #define VSSRLNU(NAME, BIT, E1, E2, E3) \ 1470 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1471 { \ 1472 int i, j, ofs; \ 1473 VReg *Vd = (VReg *)vd; \ 1474 VReg *Vj = (VReg *)vj; \ 1475 VReg *Vk = (VReg *)vk; \ 1476 int oprsz = simd_oprsz(desc); \ 1477 \ 1478 ofs = LSX_LEN / BIT; \ 1479 for (i = 0; i < oprsz / 16; i++) { \ 1480 for (j = 0; j < ofs; j++) { \ 1481 Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 1482 Vk->E3(j + ofs * i) % BIT, \ 1483 BIT / 2); \ 1484 } \ 1485 Vd->D(2 * i + 1) = 0; \ 1486 } \ 1487 } 1488 1489 VSSRLNU(vssrln_bu_h, 16, B, H, UH) 1490 VSSRLNU(vssrln_hu_w, 32, H, W, UW) 1491 VSSRLNU(vssrln_wu_d, 64, W, D, UD) 
1492 1493 #define SSRANU(E, T1, T2, T3) \ 1494 static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \ 1495 { \ 1496 T1 shft_res; \ 1497 if (sa == 0) { \ 1498 shft_res = e2; \ 1499 } else { \ 1500 shft_res = e2 >> sa; \ 1501 } \ 1502 if (e2 < 0) { \ 1503 shft_res = 0; \ 1504 } \ 1505 T2 mask; \ 1506 mask = (1ull << sh) - 1; \ 1507 if (shft_res > mask) { \ 1508 return mask; \ 1509 } else { \ 1510 return shft_res; \ 1511 } \ 1512 } 1513 1514 SSRANU(B, uint16_t, uint8_t, int16_t) 1515 SSRANU(H, uint32_t, uint16_t, int32_t) 1516 SSRANU(W, uint64_t, uint32_t, int64_t) 1517 1518 #define VSSRANU(NAME, BIT, E1, E2, E3) \ 1519 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1520 { \ 1521 int i, j, ofs; \ 1522 VReg *Vd = (VReg *)vd; \ 1523 VReg *Vj = (VReg *)vj; \ 1524 VReg *Vk = (VReg *)vk; \ 1525 int oprsz = simd_oprsz(desc); \ 1526 \ 1527 ofs = LSX_LEN / BIT; \ 1528 for (i = 0; i < oprsz / 16; i++) { \ 1529 for (j = 0; j < ofs; j++) { \ 1530 Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 1531 Vk->E3(j + ofs * i) % BIT, \ 1532 BIT / 2); \ 1533 } \ 1534 Vd->D(2 * i + 1) = 0; \ 1535 } \ 1536 } 1537 1538 VSSRANU(vssran_bu_h, 16, B, H, UH) 1539 VSSRANU(vssran_hu_w, 32, H, W, UW) 1540 VSSRANU(vssran_wu_d, 64, W, D, UD) 1541 1542 #define VSSRLNI(NAME, BIT, E1, E2) \ 1543 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1544 { \ 1545 int i, j, ofs; \ 1546 VReg temp = {}; \ 1547 VReg *Vd = (VReg *)vd; \ 1548 VReg *Vj = (VReg *)vj; \ 1549 int oprsz = simd_oprsz(desc); \ 1550 \ 1551 ofs = LSX_LEN / BIT; \ 1552 for (i = 0; i < oprsz / 16; i++) { \ 1553 for (j = 0; j < ofs; j++) { \ 1554 temp.E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ 1555 imm, BIT / 2 - 1); \ 1556 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlns_ ## E1(Vd->E2(j + ofs * i), \ 1557 imm, BIT / 2 - 1); \ 1558 } \ 1559 } \ 1560 *Vd = temp; \ 1561 } 1562 1563 static void do_vssrlni_q(VReg *Vd, VReg *Vj, 1564 uint64_t imm, int idx, Int128 mask) 1565 { 1566 Int128 shft_res1, shft_res2; 1567 1568 if (imm == 0) { 1569 shft_res1 = Vj->Q(idx); 1570 shft_res2 = Vd->Q(idx); 1571 } else { 1572 shft_res1 = int128_urshift(Vj->Q(idx), imm); 1573 shft_res2 = int128_urshift(Vd->Q(idx), imm); 1574 } 1575 1576 if (int128_ult(mask, shft_res1)) { 1577 Vd->D(idx * 2) = int128_getlo(mask); 1578 }else { 1579 Vd->D(idx * 2) = int128_getlo(shft_res1); 1580 } 1581 1582 if (int128_ult(mask, shft_res2)) { 1583 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1584 }else { 1585 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1586 } 1587 } 1588 1589 void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1590 { 1591 int i; 1592 Int128 mask; 1593 VReg *Vd = (VReg *)vd; 1594 VReg *Vj = (VReg *)vj; 1595 int oprsz = simd_oprsz(desc); 1596 1597 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 1598 1599 for (i = 0; i < oprsz / 16; i++) { 1600 do_vssrlni_q(Vd, Vj, imm, i, mask); 1601 } 1602 } 1603 1604 VSSRLNI(vssrlni_b_h, 16, B, H) 1605 VSSRLNI(vssrlni_h_w, 32, H, W) 1606 VSSRLNI(vssrlni_w_d, 64, W, D) 1607 1608 #define VSSRANI(NAME, BIT, E1, E2) \ 1609 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1610 { \ 1611 int i, j, ofs; \ 1612 VReg temp = {}; \ 1613 VReg *Vd = (VReg *)vd; \ 1614 VReg *Vj = (VReg *)vj; \ 1615 int oprsz = simd_oprsz(desc); \ 1616 \ 1617 ofs = LSX_LEN / BIT; \ 1618 for (i = 0; i < oprsz / 16; i++) { \ 1619 for (j = 0; j < ofs; j++) { \ 1620 temp.E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ 1621 imm, BIT / 2 - 1); \ 1622 temp.E1(j + 
ofs * (2 * i + 1)) = do_ssrans_ ## E1(Vd->E2(j + ofs * i), \ 1623 imm, BIT / 2 - 1); \ 1624 } \ 1625 } \ 1626 *Vd = temp; \ 1627 } 1628 1629 static void do_vssrani_d_q(VReg *Vd, VReg *Vj, 1630 uint64_t imm, int idx, Int128 mask, Int128 min) 1631 { 1632 Int128 shft_res1, shft_res2; 1633 1634 if (imm == 0) { 1635 shft_res1 = Vj->Q(idx); 1636 shft_res2 = Vd->Q(idx); 1637 } else { 1638 shft_res1 = int128_rshift(Vj->Q(idx), imm); 1639 shft_res2 = int128_rshift(Vd->Q(idx), imm); 1640 } 1641 1642 if (int128_gt(shft_res1, mask)) { 1643 Vd->D(idx * 2) = int128_getlo(mask); 1644 } else if (int128_lt(shft_res1, int128_neg(min))) { 1645 Vd->D(idx * 2) = int128_getlo(min); 1646 } else { 1647 Vd->D(idx * 2) = int128_getlo(shft_res1); 1648 } 1649 1650 if (int128_gt(shft_res2, mask)) { 1651 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1652 } else if (int128_lt(shft_res2, int128_neg(min))) { 1653 Vd->D(idx * 2 + 1) = int128_getlo(min); 1654 } else { 1655 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1656 } 1657 } 1658 1659 void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1660 { 1661 int i; 1662 Int128 mask, min; 1663 VReg *Vd = (VReg *)vd; 1664 VReg *Vj = (VReg *)vj; 1665 int oprsz = simd_oprsz(desc); 1666 1667 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 1668 min = int128_lshift(int128_one(), 63); 1669 1670 for (i = 0; i < oprsz / 16; i++) { 1671 do_vssrani_d_q(Vd, Vj, imm, i, mask, min); 1672 } 1673 } 1674 1675 1676 VSSRANI(vssrani_b_h, 16, B, H) 1677 VSSRANI(vssrani_h_w, 32, H, W) 1678 VSSRANI(vssrani_w_d, 64, W, D) 1679 1680 #define VSSRLNUI(NAME, BIT, E1, E2) \ 1681 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1682 { \ 1683 int i, j, ofs; \ 1684 VReg temp = {}; \ 1685 VReg *Vd = (VReg *)vd; \ 1686 VReg *Vj = (VReg *)vj; \ 1687 int oprsz = simd_oprsz(desc); \ 1688 \ 1689 ofs = LSX_LEN / BIT; \ 1690 for (i = 0; i < oprsz / 16; i++) { \ 1691 for (j = 0; j < ofs; j++) { \ 1692 temp.E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ 1693 imm, BIT / 2); \ 1694 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlnu_ ## E1(Vd->E2(j + ofs * i), \ 1695 imm, BIT / 2); \ 1696 } \ 1697 } \ 1698 *Vd = temp; \ 1699 } 1700 1701 void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1702 { 1703 int i; 1704 Int128 mask; 1705 VReg *Vd = (VReg *)vd; 1706 VReg *Vj = (VReg *)vj; 1707 int oprsz = simd_oprsz(desc); 1708 1709 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 1710 1711 for (i = 0; i < oprsz / 16; i++) { 1712 do_vssrlni_q(Vd, Vj, imm, i, mask); 1713 } 1714 } 1715 1716 VSSRLNUI(vssrlni_bu_h, 16, B, H) 1717 VSSRLNUI(vssrlni_hu_w, 32, H, W) 1718 VSSRLNUI(vssrlni_wu_d, 64, W, D) 1719 1720 #define VSSRANUI(NAME, BIT, E1, E2) \ 1721 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1722 { \ 1723 int i, j, ofs; \ 1724 VReg temp = {}; \ 1725 VReg *Vd = (VReg *)vd; \ 1726 VReg *Vj = (VReg *)vj; \ 1727 int oprsz = simd_oprsz(desc); \ 1728 \ 1729 ofs = LSX_LEN / BIT; \ 1730 for (i = 0; i < oprsz / 16; i++) { \ 1731 for (j = 0; j < ofs; j++) { \ 1732 temp.E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ 1733 imm, BIT / 2); \ 1734 temp.E1(j + ofs * (2 * i + 1)) = do_ssranu_ ## E1(Vd->E2(j + ofs * i), \ 1735 imm, BIT / 2); \ 1736 } \ 1737 } \ 1738 *Vd = temp; \ 1739 } 1740 1741 static void do_vssrani_du_q(VReg *Vd, VReg *Vj, 1742 uint64_t imm, int idx, Int128 mask) 1743 { 1744 Int128 shft_res1, shft_res2; 1745 1746 if (imm == 0) { 1747 shft_res1 = Vj->Q(idx); 1748 shft_res2 = Vd->Q(idx); 1749 } 
else { 1750 shft_res1 = int128_rshift(Vj->Q(idx), imm); 1751 shft_res2 = int128_rshift(Vd->Q(idx), imm); 1752 } 1753 1754 if (int128_lt(Vj->Q(idx), int128_zero())) { 1755 shft_res1 = int128_zero(); 1756 } 1757 1758 if (int128_lt(Vd->Q(idx), int128_zero())) { 1759 shft_res2 = int128_zero(); 1760 } 1761 if (int128_ult(mask, shft_res1)) { 1762 Vd->D(idx * 2) = int128_getlo(mask); 1763 }else { 1764 Vd->D(idx * 2) = int128_getlo(shft_res1); 1765 } 1766 1767 if (int128_ult(mask, shft_res2)) { 1768 Vd->D(idx * 2 + 1) = int128_getlo(mask); 1769 }else { 1770 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 1771 } 1772 1773 } 1774 1775 void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 1776 { 1777 int i; 1778 Int128 mask; 1779 VReg *Vd = (VReg *)vd; 1780 VReg *Vj = (VReg *)vj; 1781 int oprsz = simd_oprsz(desc); 1782 1783 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 1784 1785 for (i = 0; i < oprsz / 16; i++) { 1786 do_vssrani_du_q(Vd, Vj, imm, i, mask); 1787 } 1788 } 1789 1790 VSSRANUI(vssrani_bu_h, 16, B, H) 1791 VSSRANUI(vssrani_hu_w, 32, H, W) 1792 VSSRANUI(vssrani_wu_d, 64, W, D) 1793 1794 #define SSRLRNS(E1, E2, T1, T2, T3) \ 1795 static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \ 1796 { \ 1797 T1 shft_res; \ 1798 \ 1799 shft_res = do_vsrlr_ ## E2(e2, sa); \ 1800 T1 mask; \ 1801 mask = (1ull << sh) - 1; \ 1802 if (shft_res > mask) { \ 1803 return mask; \ 1804 } else { \ 1805 return shft_res; \ 1806 } \ 1807 } 1808 1809 SSRLRNS(B, H, uint16_t, int16_t, uint8_t) 1810 SSRLRNS(H, W, uint32_t, int32_t, uint16_t) 1811 SSRLRNS(W, D, uint64_t, int64_t, uint32_t) 1812 1813 #define VSSRLRN(NAME, BIT, E1, E2, E3) \ 1814 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1815 { \ 1816 int i, j, ofs; \ 1817 VReg *Vd = (VReg *)vd; \ 1818 VReg *Vj = (VReg *)vj; \ 1819 VReg *Vk = (VReg *)vk; \ 1820 int oprsz = simd_oprsz(desc); \ 1821 \ 1822 ofs = LSX_LEN / BIT; \ 1823 for (i = 0; i < oprsz / 16; i++) { \ 1824 for (j = 0; j < ofs; j++) { \ 1825 Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 1826 Vk->E3(j + ofs * i) % BIT, \ 1827 BIT / 2 - 1); \ 1828 } \ 1829 Vd->D(2 * i + 1) = 0; \ 1830 } \ 1831 } 1832 1833 VSSRLRN(vssrlrn_b_h, 16, B, H, UH) 1834 VSSRLRN(vssrlrn_h_w, 32, H, W, UW) 1835 VSSRLRN(vssrlrn_w_d, 64, W, D, UD) 1836 1837 #define SSRARNS(E1, E2, T1, T2) \ 1838 static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \ 1839 { \ 1840 T1 shft_res; \ 1841 \ 1842 shft_res = do_vsrar_ ## E2(e2, sa); \ 1843 T2 mask; \ 1844 mask = (1ll << sh) - 1; \ 1845 if (shft_res > mask) { \ 1846 return mask; \ 1847 } else if (shft_res < -(mask +1)) { \ 1848 return ~mask; \ 1849 } else { \ 1850 return shft_res; \ 1851 } \ 1852 } 1853 1854 SSRARNS(B, H, int16_t, int8_t) 1855 SSRARNS(H, W, int32_t, int16_t) 1856 SSRARNS(W, D, int64_t, int32_t) 1857 1858 #define VSSRARN(NAME, BIT, E1, E2, E3) \ 1859 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1860 { \ 1861 int i, j, ofs; \ 1862 VReg *Vd = (VReg *)vd; \ 1863 VReg *Vj = (VReg *)vj; \ 1864 VReg *Vk = (VReg *)vk; \ 1865 int oprsz = simd_oprsz(desc); \ 1866 \ 1867 ofs = LSX_LEN / BIT; \ 1868 for (i = 0; i < oprsz / 16; i++) { \ 1869 for (j = 0; j < ofs; j++) { \ 1870 Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 1871 Vk->E3(j + ofs * i) % BIT, \ 1872 BIT/ 2 - 1); \ 1873 } \ 1874 Vd->D(2 * i + 1) = 0; \ 1875 } \ 1876 } 1877 1878 VSSRARN(vssrarn_b_h, 16, B, H, UH) 1879 VSSRARN(vssrarn_h_w, 32, H, W, UW) 1880 VSSRARN(vssrarn_w_d, 64, W, D, UD) 1881 1882 #define SSRLRNU(E1, 
E2, T1, T2, T3) \ 1883 static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \ 1884 { \ 1885 T1 shft_res; \ 1886 \ 1887 shft_res = do_vsrlr_ ## E2(e2, sa); \ 1888 \ 1889 T2 mask; \ 1890 mask = (1ull << sh) - 1; \ 1891 if (shft_res > mask) { \ 1892 return mask; \ 1893 } else { \ 1894 return shft_res; \ 1895 } \ 1896 } 1897 1898 SSRLRNU(B, H, uint16_t, uint8_t, int16_t) 1899 SSRLRNU(H, W, uint32_t, uint16_t, int32_t) 1900 SSRLRNU(W, D, uint64_t, uint32_t, int64_t) 1901 1902 #define VSSRLRNU(NAME, BIT, E1, E2, E3) \ 1903 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1904 { \ 1905 int i, j, ofs; \ 1906 VReg *Vd = (VReg *)vd; \ 1907 VReg *Vj = (VReg *)vj; \ 1908 VReg *Vk = (VReg *)vk; \ 1909 int oprsz = simd_oprsz(desc); \ 1910 \ 1911 ofs = LSX_LEN / BIT; \ 1912 for (i = 0; i < oprsz / 16; i++) { \ 1913 for (j = 0; j < ofs; j++) { \ 1914 Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 1915 Vk->E3(j + ofs * i) % BIT, \ 1916 BIT / 2); \ 1917 } \ 1918 Vd->D(2 * i + 1) = 0; \ 1919 } \ 1920 } 1921 1922 VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH) 1923 VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW) 1924 VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD) 1925 1926 #define SSRARNU(E1, E2, T1, T2, T3) \ 1927 static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \ 1928 { \ 1929 T1 shft_res; \ 1930 \ 1931 if (e2 < 0) { \ 1932 shft_res = 0; \ 1933 } else { \ 1934 shft_res = do_vsrar_ ## E2(e2, sa); \ 1935 } \ 1936 T2 mask; \ 1937 mask = (1ull << sh) - 1; \ 1938 if (shft_res > mask) { \ 1939 return mask; \ 1940 } else { \ 1941 return shft_res; \ 1942 } \ 1943 } 1944 1945 SSRARNU(B, H, uint16_t, uint8_t, int16_t) 1946 SSRARNU(H, W, uint32_t, uint16_t, int32_t) 1947 SSRARNU(W, D, uint64_t, uint32_t, int64_t) 1948 1949 #define VSSRARNU(NAME, BIT, E1, E2, E3) \ 1950 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 1951 { \ 1952 int i, j, ofs; \ 1953 VReg *Vd = (VReg *)vd; \ 1954 VReg *Vj = (VReg *)vj; \ 1955 VReg *Vk = (VReg *)vk; \ 1956 int oprsz = simd_oprsz(desc); \ 1957 \ 1958 ofs = LSX_LEN / BIT; \ 1959 for (i = 0; i < oprsz / 16; i++) { \ 1960 for (j = 0; j < ofs; j++) { \ 1961 Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 1962 Vk->E3(j + ofs * i) % BIT, \ 1963 BIT / 2); \ 1964 } \ 1965 Vd->D(2 * i + 1) = 0; \ 1966 } \ 1967 } 1968 1969 VSSRARNU(vssrarn_bu_h, 16, B, H, UH) 1970 VSSRARNU(vssrarn_hu_w, 32, H, W, UW) 1971 VSSRARNU(vssrarn_wu_d, 64, W, D, UD) 1972 1973 #define VSSRLRNI(NAME, BIT, E1, E2) \ 1974 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 1975 { \ 1976 int i, j, ofs; \ 1977 VReg temp = {}; \ 1978 VReg *Vd = (VReg *)vd; \ 1979 VReg *Vj = (VReg *)vj; \ 1980 int oprsz = simd_oprsz(desc); \ 1981 \ 1982 ofs = LSX_LEN / BIT; \ 1983 for (i = 0; i < oprsz / 16; i++) { \ 1984 for (j = 0; j < ofs; j++) { \ 1985 temp.E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ 1986 imm, BIT / 2 - 1); \ 1987 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrns_ ## E1(Vd->E2(j + ofs * i), \ 1988 imm, BIT / 2 - 1); \ 1989 } \ 1990 } \ 1991 *Vd = temp; \ 1992 } 1993 1994 static void do_vssrlrni_q(VReg *Vd, VReg * Vj, 1995 uint64_t imm, int idx, Int128 mask) 1996 { 1997 Int128 shft_res1, shft_res2, r1, r2; 1998 if (imm == 0) { 1999 shft_res1 = Vj->Q(idx); 2000 shft_res2 = Vd->Q(idx); 2001 } else { 2002 r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one()); 2003 r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one()); 2004 shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1)); 2005 shft_res2 = 
(int128_add(int128_urshift(Vd->Q(idx), imm), r2)); 2006 } 2007 2008 if (int128_ult(mask, shft_res1)) { 2009 Vd->D(idx * 2) = int128_getlo(mask); 2010 }else { 2011 Vd->D(idx * 2) = int128_getlo(shft_res1); 2012 } 2013 2014 if (int128_ult(mask, shft_res2)) { 2015 Vd->D(idx * 2 + 1) = int128_getlo(mask); 2016 }else { 2017 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2018 } 2019 } 2020 2021 void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2022 { 2023 int i; 2024 Int128 mask; 2025 VReg *Vd = (VReg *)vd; 2026 VReg *Vj = (VReg *)vj; 2027 int oprsz = simd_oprsz(desc); 2028 2029 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 2030 2031 for (i = 0; i < oprsz / 16; i++) { 2032 do_vssrlrni_q(Vd, Vj, imm, i, mask); 2033 } 2034 } 2035 2036 VSSRLRNI(vssrlrni_b_h, 16, B, H) 2037 VSSRLRNI(vssrlrni_h_w, 32, H, W) 2038 VSSRLRNI(vssrlrni_w_d, 64, W, D) 2039 2040 #define VSSRARNI(NAME, BIT, E1, E2) \ 2041 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2042 { \ 2043 int i, j, ofs; \ 2044 VReg temp = {}; \ 2045 VReg *Vd = (VReg *)vd; \ 2046 VReg *Vj = (VReg *)vj; \ 2047 int oprsz = simd_oprsz(desc); \ 2048 \ 2049 ofs = LSX_LEN / BIT; \ 2050 for (i = 0; i < oprsz / 16; i++) { \ 2051 for (j = 0; j < ofs; j++) { \ 2052 temp.E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ 2053 imm, BIT / 2 - 1); \ 2054 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarns_ ## E1(Vd->E2(j + ofs * i), \ 2055 imm, BIT / 2 - 1); \ 2056 } \ 2057 } \ 2058 *Vd = temp; \ 2059 } 2060 2061 static void do_vssrarni_d_q(VReg *Vd, VReg *Vj, 2062 uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2063 { 2064 Int128 shft_res1, shft_res2, r1, r2; 2065 2066 if (imm == 0) { 2067 shft_res1 = Vj->Q(idx); 2068 shft_res2 = Vd->Q(idx); 2069 } else { 2070 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 2071 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 2072 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 2073 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2074 } 2075 if (int128_gt(shft_res1, mask1)) { 2076 Vd->D(idx * 2) = int128_getlo(mask1); 2077 } else if (int128_lt(shft_res1, int128_neg(mask2))) { 2078 Vd->D(idx * 2) = int128_getlo(mask2); 2079 } else { 2080 Vd->D(idx * 2) = int128_getlo(shft_res1); 2081 } 2082 2083 if (int128_gt(shft_res2, mask1)) { 2084 Vd->D(idx * 2 + 1) = int128_getlo(mask1); 2085 } else if (int128_lt(shft_res2, int128_neg(mask2))) { 2086 Vd->D(idx * 2 + 1) = int128_getlo(mask2); 2087 } else { 2088 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2089 } 2090 } 2091 2092 void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2093 { 2094 int i; 2095 Int128 mask1, mask2; 2096 VReg *Vd = (VReg *)vd; 2097 VReg *Vj = (VReg *)vj; 2098 int oprsz = simd_oprsz(desc); 2099 2100 mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one()); 2101 mask2 = int128_lshift(int128_one(), 63); 2102 2103 for (i = 0; i < oprsz / 16; i++) { 2104 do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2); 2105 } 2106 } 2107 2108 VSSRARNI(vssrarni_b_h, 16, B, H) 2109 VSSRARNI(vssrarni_h_w, 32, H, W) 2110 VSSRARNI(vssrarni_w_d, 64, W, D) 2111 2112 #define VSSRLRNUI(NAME, BIT, E1, E2) \ 2113 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2114 { \ 2115 int i, j, ofs; \ 2116 VReg temp = {}; \ 2117 VReg *Vd = (VReg *)vd; \ 2118 VReg *Vj = (VReg *)vj; \ 2119 int oprsz = simd_oprsz(desc); \ 2120 \ 2121 ofs = LSX_LEN / BIT; \ 2122 for (i = 0; i < oprsz / 16; i++) { \ 2123 for (j = 0; j < ofs; j++) 
{ \ 2124 temp.E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ 2125 imm, BIT / 2); \ 2126 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrnu_ ## E1(Vd->E2(j + ofs * i), \ 2127 imm, BIT / 2); \ 2128 } \ 2129 } \ 2130 *Vd = temp; \ 2131 } 2132 2133 void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2134 { 2135 int i; 2136 Int128 mask; 2137 VReg *Vd = (VReg *)vd; 2138 VReg *Vj = (VReg *)vj; 2139 int oprsz = simd_oprsz(desc); 2140 2141 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 2142 2143 for (i = 0; i < oprsz / 16; i++) { 2144 do_vssrlrni_q(Vd, Vj, imm, i, mask); 2145 } 2146 } 2147 2148 VSSRLRNUI(vssrlrni_bu_h, 16, B, H) 2149 VSSRLRNUI(vssrlrni_hu_w, 32, H, W) 2150 VSSRLRNUI(vssrlrni_wu_d, 64, W, D) 2151 2152 #define VSSRARNUI(NAME, BIT, E1, E2) \ 2153 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2154 { \ 2155 int i, j, ofs; \ 2156 VReg temp = {}; \ 2157 VReg *Vd = (VReg *)vd; \ 2158 VReg *Vj = (VReg *)vj; \ 2159 int oprsz = simd_oprsz(desc); \ 2160 \ 2161 ofs = LSX_LEN / BIT; \ 2162 for (i = 0; i < oprsz / 16; i++) { \ 2163 for (j = 0; j < ofs; j++) { \ 2164 temp.E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ 2165 imm, BIT / 2); \ 2166 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarnu_ ## E1(Vd->E2(j + ofs * i), \ 2167 imm, BIT / 2); \ 2168 } \ 2169 } \ 2170 *Vd = temp; \ 2171 } 2172 2173 static void do_vssrarni_du_q(VReg *Vd, VReg *Vj, 2174 uint64_t imm, int idx, Int128 mask1, Int128 mask2) 2175 { 2176 Int128 shft_res1, shft_res2, r1, r2; 2177 2178 if (imm == 0) { 2179 shft_res1 = Vj->Q(idx); 2180 shft_res2 = Vd->Q(idx); 2181 } else { 2182 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); 2183 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); 2184 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); 2185 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); 2186 } 2187 2188 if (int128_lt(Vj->Q(idx), int128_zero())) { 2189 shft_res1 = int128_zero(); 2190 } 2191 if (int128_lt(Vd->Q(idx), int128_zero())) { 2192 shft_res2 = int128_zero(); 2193 } 2194 2195 if (int128_gt(shft_res1, mask1)) { 2196 Vd->D(idx * 2) = int128_getlo(mask1); 2197 } else if (int128_lt(shft_res1, int128_neg(mask2))) { 2198 Vd->D(idx * 2) = int128_getlo(mask2); 2199 } else { 2200 Vd->D(idx * 2) = int128_getlo(shft_res1); 2201 } 2202 2203 if (int128_gt(shft_res2, mask1)) { 2204 Vd->D(idx * 2 + 1) = int128_getlo(mask1); 2205 } else if (int128_lt(shft_res2, int128_neg(mask2))) { 2206 Vd->D(idx * 2 + 1) = int128_getlo(mask2); 2207 } else { 2208 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); 2209 } 2210 } 2211 2212 void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) 2213 { 2214 int i; 2215 Int128 mask1, mask2; 2216 VReg *Vd = (VReg *)vd; 2217 VReg *Vj = (VReg *)vj; 2218 int oprsz = simd_oprsz(desc); 2219 2220 mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one()); 2221 mask2 = int128_lshift(int128_one(), 64); 2222 2223 for (i = 0; i < oprsz / 16; i++) { 2224 do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2); 2225 } 2226 } 2227 2228 VSSRARNUI(vssrarni_bu_h, 16, B, H) 2229 VSSRARNUI(vssrarni_hu_w, 32, H, W) 2230 VSSRARNUI(vssrarni_wu_d, 64, W, D) 2231 2232 #define DO_2OP(NAME, BIT, E, DO_OP) \ 2233 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 2234 { \ 2235 int i; \ 2236 VReg *Vd = (VReg *)vd; \ 2237 VReg *Vj = (VReg *)vj; \ 2238 int oprsz = simd_oprsz(desc); \ 2239 \ 2240 for (i = 0; i < oprsz / (BIT / 8); i++) \ 2241 { \ 2242 Vd->E(i) = DO_OP(Vj->E(i)); 
\ 2243 } \ 2244 } 2245 2246 DO_2OP(vclo_b, 8, UB, DO_CLO_B) 2247 DO_2OP(vclo_h, 16, UH, DO_CLO_H) 2248 DO_2OP(vclo_w, 32, UW, DO_CLO_W) 2249 DO_2OP(vclo_d, 64, UD, DO_CLO_D) 2250 DO_2OP(vclz_b, 8, UB, DO_CLZ_B) 2251 DO_2OP(vclz_h, 16, UH, DO_CLZ_H) 2252 DO_2OP(vclz_w, 32, UW, DO_CLZ_W) 2253 DO_2OP(vclz_d, 64, UD, DO_CLZ_D) 2254 2255 #define VPCNT(NAME, BIT, E, FN) \ 2256 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ 2257 { \ 2258 int i; \ 2259 VReg *Vd = (VReg *)vd; \ 2260 VReg *Vj = (VReg *)vj; \ 2261 int oprsz = simd_oprsz(desc); \ 2262 \ 2263 for (i = 0; i < oprsz / (BIT / 8); i++) \ 2264 { \ 2265 Vd->E(i) = FN(Vj->E(i)); \ 2266 } \ 2267 } 2268 2269 VPCNT(vpcnt_b, 8, UB, ctpop8) 2270 VPCNT(vpcnt_h, 16, UH, ctpop16) 2271 VPCNT(vpcnt_w, 32, UW, ctpop32) 2272 VPCNT(vpcnt_d, 64, UD, ctpop64) 2273 2274 #define DO_BIT(NAME, BIT, E, DO_OP) \ 2275 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2276 { \ 2277 int i; \ 2278 VReg *Vd = (VReg *)vd; \ 2279 VReg *Vj = (VReg *)vj; \ 2280 VReg *Vk = (VReg *)vk; \ 2281 int oprsz = simd_oprsz(desc); \ 2282 \ 2283 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2284 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \ 2285 } \ 2286 } 2287 2288 DO_BIT(vbitclr_b, 8, UB, DO_BITCLR) 2289 DO_BIT(vbitclr_h, 16, UH, DO_BITCLR) 2290 DO_BIT(vbitclr_w, 32, UW, DO_BITCLR) 2291 DO_BIT(vbitclr_d, 64, UD, DO_BITCLR) 2292 DO_BIT(vbitset_b, 8, UB, DO_BITSET) 2293 DO_BIT(vbitset_h, 16, UH, DO_BITSET) 2294 DO_BIT(vbitset_w, 32, UW, DO_BITSET) 2295 DO_BIT(vbitset_d, 64, UD, DO_BITSET) 2296 DO_BIT(vbitrev_b, 8, UB, DO_BITREV) 2297 DO_BIT(vbitrev_h, 16, UH, DO_BITREV) 2298 DO_BIT(vbitrev_w, 32, UW, DO_BITREV) 2299 DO_BIT(vbitrev_d, 64, UD, DO_BITREV) 2300 2301 #define DO_BITI(NAME, BIT, E, DO_OP) \ 2302 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2303 { \ 2304 int i; \ 2305 VReg *Vd = (VReg *)vd; \ 2306 VReg *Vj = (VReg *)vj; \ 2307 int oprsz = simd_oprsz(desc); \ 2308 \ 2309 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2310 Vd->E(i) = DO_OP(Vj->E(i), imm); \ 2311 } \ 2312 } 2313 2314 DO_BITI(vbitclri_b, 8, UB, DO_BITCLR) 2315 DO_BITI(vbitclri_h, 16, UH, DO_BITCLR) 2316 DO_BITI(vbitclri_w, 32, UW, DO_BITCLR) 2317 DO_BITI(vbitclri_d, 64, UD, DO_BITCLR) 2318 DO_BITI(vbitseti_b, 8, UB, DO_BITSET) 2319 DO_BITI(vbitseti_h, 16, UH, DO_BITSET) 2320 DO_BITI(vbitseti_w, 32, UW, DO_BITSET) 2321 DO_BITI(vbitseti_d, 64, UD, DO_BITSET) 2322 DO_BITI(vbitrevi_b, 8, UB, DO_BITREV) 2323 DO_BITI(vbitrevi_h, 16, UH, DO_BITREV) 2324 DO_BITI(vbitrevi_w, 32, UW, DO_BITREV) 2325 DO_BITI(vbitrevi_d, 64, UD, DO_BITREV) 2326 2327 #define VFRSTP(NAME, BIT, MASK, E) \ 2328 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 2329 { \ 2330 int i, j, m, ofs; \ 2331 VReg *Vd = (VReg *)vd; \ 2332 VReg *Vj = (VReg *)vj; \ 2333 VReg *Vk = (VReg *)vk; \ 2334 int oprsz = simd_oprsz(desc); \ 2335 \ 2336 ofs = LSX_LEN / BIT; \ 2337 for (i = 0; i < oprsz / 16; i++) { \ 2338 m = Vk->E(i * ofs) & MASK; \ 2339 for (j = 0; j < ofs; j++) { \ 2340 if (Vj->E(j + ofs * i) < 0) { \ 2341 break; \ 2342 } \ 2343 } \ 2344 Vd->E(m + i * ofs) = j; \ 2345 } \ 2346 } 2347 2348 VFRSTP(vfrstp_b, 8, 0xf, B) 2349 VFRSTP(vfrstp_h, 16, 0x7, H) 2350 2351 #define VFRSTPI(NAME, BIT, E) \ 2352 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 2353 { \ 2354 int i, j, m, ofs; \ 2355 VReg *Vd = (VReg *)vd; \ 2356 VReg *Vj = (VReg *)vj; \ 2357 int oprsz = simd_oprsz(desc); \ 2358 \ 2359 ofs = LSX_LEN / BIT; \ 2360 m = imm % ofs; \ 2361 for (i = 0; i < oprsz / 16; i++) { \ 2362 
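            /* j = index of the first negative element in lane i, or ofs if none */ \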
for (j = 0; j < ofs; j++) { \ 2363 if (Vj->E(j + ofs * i) < 0) { \ 2364 break; \ 2365 } \ 2366 } \ 2367 Vd->E(m + i * ofs) = j; \ 2368 } \ 2369 } 2370 2371 VFRSTPI(vfrstpi_b, 8, B) 2372 VFRSTPI(vfrstpi_h, 16, H) 2373 2374 static void vec_update_fcsr0_mask(CPULoongArchState *env, 2375 uintptr_t pc, int mask) 2376 { 2377 int flags = get_float_exception_flags(&env->fp_status); 2378 2379 set_float_exception_flags(0, &env->fp_status); 2380 2381 flags &= ~mask; 2382 2383 if (flags) { 2384 flags = ieee_ex_to_loongarch(flags); 2385 UPDATE_FP_CAUSE(env->fcsr0, flags); 2386 } 2387 2388 if (GET_FP_ENABLES(env->fcsr0) & flags) { 2389 do_raise_exception(env, EXCCODE_FPE, pc); 2390 } else { 2391 UPDATE_FP_FLAGS(env->fcsr0, flags); 2392 } 2393 } 2394 2395 static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc) 2396 { 2397 vec_update_fcsr0_mask(env, pc, 0); 2398 } 2399 2400 static inline void vec_clear_cause(CPULoongArchState *env) 2401 { 2402 SET_FP_CAUSE(env->fcsr0, 0); 2403 } 2404 2405 #define DO_3OP_F(NAME, BIT, E, FN) \ 2406 void HELPER(NAME)(void *vd, void *vj, void *vk, \ 2407 CPULoongArchState *env, uint32_t desc) \ 2408 { \ 2409 int i; \ 2410 VReg *Vd = (VReg *)vd; \ 2411 VReg *Vj = (VReg *)vj; \ 2412 VReg *Vk = (VReg *)vk; \ 2413 int oprsz = simd_oprsz(desc); \ 2414 \ 2415 vec_clear_cause(env); \ 2416 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2417 Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 2418 vec_update_fcsr0(env, GETPC()); \ 2419 } \ 2420 } 2421 2422 DO_3OP_F(vfadd_s, 32, UW, float32_add) 2423 DO_3OP_F(vfadd_d, 64, UD, float64_add) 2424 DO_3OP_F(vfsub_s, 32, UW, float32_sub) 2425 DO_3OP_F(vfsub_d, 64, UD, float64_sub) 2426 DO_3OP_F(vfmul_s, 32, UW, float32_mul) 2427 DO_3OP_F(vfmul_d, 64, UD, float64_mul) 2428 DO_3OP_F(vfdiv_s, 32, UW, float32_div) 2429 DO_3OP_F(vfdiv_d, 64, UD, float64_div) 2430 DO_3OP_F(vfmax_s, 32, UW, float32_maxnum) 2431 DO_3OP_F(vfmax_d, 64, UD, float64_maxnum) 2432 DO_3OP_F(vfmin_s, 32, UW, float32_minnum) 2433 DO_3OP_F(vfmin_d, 64, UD, float64_minnum) 2434 DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag) 2435 DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag) 2436 DO_3OP_F(vfmina_s, 32, UW, float32_minnummag) 2437 DO_3OP_F(vfmina_d, 64, UD, float64_minnummag) 2438 2439 #define DO_4OP_F(NAME, BIT, E, FN, flags) \ 2440 void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \ 2441 CPULoongArchState *env, uint32_t desc) \ 2442 { \ 2443 int i; \ 2444 VReg *Vd = (VReg *)vd; \ 2445 VReg *Vj = (VReg *)vj; \ 2446 VReg *Vk = (VReg *)vk; \ 2447 VReg *Va = (VReg *)va; \ 2448 int oprsz = simd_oprsz(desc); \ 2449 \ 2450 vec_clear_cause(env); \ 2451 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2452 Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \ 2453 vec_update_fcsr0(env, GETPC()); \ 2454 } \ 2455 } 2456 2457 DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) 2458 DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) 2459 DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) 2460 DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) 2461 DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) 2462 DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) 2463 DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, 2464 float_muladd_negate_c | float_muladd_negate_result) 2465 DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, 2466 float_muladd_negate_c | float_muladd_negate_result) 2467 2468 #define DO_2OP_F(NAME, BIT, E, FN) \ 2469 void HELPER(NAME)(void *vd, void *vj, \ 2470 CPULoongArchState *env, uint32_t desc) \ 2471 
{ \ 2472 int i; \ 2473 VReg *Vd = (VReg *)vd; \ 2474 VReg *Vj = (VReg *)vj; \ 2475 int oprsz = simd_oprsz(desc); \ 2476 \ 2477 vec_clear_cause(env); \ 2478 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2479 Vd->E(i) = FN(env, Vj->E(i)); \ 2480 } \ 2481 } 2482 2483 #define FLOGB(BIT, T) \ 2484 static T do_flogb_## BIT(CPULoongArchState *env, T fj) \ 2485 { \ 2486 T fp, fd; \ 2487 float_status *status = &env->fp_status; \ 2488 FloatRoundMode old_mode = get_float_rounding_mode(status); \ 2489 \ 2490 set_float_rounding_mode(float_round_down, status); \ 2491 fp = float ## BIT ##_log2(fj, status); \ 2492 fd = float ## BIT ##_round_to_int(fp, status); \ 2493 set_float_rounding_mode(old_mode, status); \ 2494 vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \ 2495 return fd; \ 2496 } 2497 2498 FLOGB(32, uint32_t) 2499 FLOGB(64, uint64_t) 2500 2501 #define FCLASS(NAME, BIT, E, FN) \ 2502 void HELPER(NAME)(void *vd, void *vj, \ 2503 CPULoongArchState *env, uint32_t desc) \ 2504 { \ 2505 int i; \ 2506 VReg *Vd = (VReg *)vd; \ 2507 VReg *Vj = (VReg *)vj; \ 2508 int oprsz = simd_oprsz(desc); \ 2509 \ 2510 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2511 Vd->E(i) = FN(env, Vj->E(i)); \ 2512 } \ 2513 } 2514 2515 FCLASS(vfclass_s, 32, UW, helper_fclass_s) 2516 FCLASS(vfclass_d, 64, UD, helper_fclass_d) 2517 2518 #define FSQRT(BIT, T) \ 2519 static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \ 2520 { \ 2521 T fd; \ 2522 fd = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2523 vec_update_fcsr0(env, GETPC()); \ 2524 return fd; \ 2525 } 2526 2527 FSQRT(32, uint32_t) 2528 FSQRT(64, uint64_t) 2529 2530 #define FRECIP(BIT, T) \ 2531 static T do_frecip_## BIT(CPULoongArchState *env, T fj) \ 2532 { \ 2533 T fd; \ 2534 fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \ 2535 vec_update_fcsr0(env, GETPC()); \ 2536 return fd; \ 2537 } 2538 2539 FRECIP(32, uint32_t) 2540 FRECIP(64, uint64_t) 2541 2542 #define FRSQRT(BIT, T) \ 2543 static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \ 2544 { \ 2545 T fd, fp; \ 2546 fp = float ## BIT ##_sqrt(fj, &env->fp_status); \ 2547 fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \ 2548 vec_update_fcsr0(env, GETPC()); \ 2549 return fd; \ 2550 } 2551 2552 FRSQRT(32, uint32_t) 2553 FRSQRT(64, uint64_t) 2554 2555 DO_2OP_F(vflogb_s, 32, UW, do_flogb_32) 2556 DO_2OP_F(vflogb_d, 64, UD, do_flogb_64) 2557 DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32) 2558 DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64) 2559 DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32) 2560 DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64) 2561 DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32) 2562 DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64) 2563 2564 static uint32_t float16_cvt_float32(uint16_t h, float_status *status) 2565 { 2566 return float16_to_float32(h, true, status); 2567 } 2568 static uint64_t float32_cvt_float64(uint32_t s, float_status *status) 2569 { 2570 return float32_to_float64(s, status); 2571 } 2572 2573 static uint16_t float32_cvt_float16(uint32_t s, float_status *status) 2574 { 2575 return float32_to_float16(s, true, status); 2576 } 2577 static uint32_t float64_cvt_float32(uint64_t d, float_status *status) 2578 { 2579 return float64_to_float32(d, status); 2580 } 2581 2582 void HELPER(vfcvtl_s_h)(void *vd, void *vj, 2583 CPULoongArchState *env, uint32_t desc) 2584 { 2585 int i, j, ofs; 2586 VReg temp = {}; 2587 VReg *Vd = (VReg *)vd; 2588 VReg *Vj = (VReg *)vj; 2589 int oprsz = simd_oprsz(desc); 2590 2591 ofs = LSX_LEN / 32; 2592 vec_clear_cause(env); 2593 for (i = 0; i < oprsz / 16; i++) { 
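        /*
         * Widen the float16 values in the low half of this 128-bit lane
         * to float32; vfcvth_s_h below takes the high half instead.
         */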
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvtl_d_s)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvth_s_h)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 32;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvth_d_s)(void *vd, void *vj,
                        CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
                                                       &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
                       CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 32;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
                                                                 &env->fp_status);
            temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
                                                           &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
                       CPULoongArchState *env, uint32_t desc)
{
    int i, j, ofs;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;
    int oprsz = simd_oprsz(desc);

    ofs = LSX_LEN / 64;
    vec_clear_cause(env);
    for (i = 0; i < oprsz / 16; i++) {
        for (j = 0; j < ofs; j++) {
            temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
                                                                 &env->fp_status);
            temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
                                                           &env->fp_status);
        }
        vec_update_fcsr0(env, GETPC());
    }
    *Vd = temp;
}

void HELPER(vfrint_s)(void *vd, void *vj,
                      CPULoongArchState *env, uint32_t desc)
{
    int i;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    vec_clear_cause(env);
    for (i = 0; i < oprsz / 4; i++) {
        Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
        vec_update_fcsr0(env,
GETPC()); 2726 } 2727 } 2728 2729 void HELPER(vfrint_d)(void *vd, void *vj, 2730 CPULoongArchState *env, uint32_t desc) 2731 { 2732 int i; 2733 VReg *Vd = (VReg *)vd; 2734 VReg *Vj = (VReg *)vj; 2735 int oprsz = simd_oprsz(desc); 2736 2737 vec_clear_cause(env); 2738 for (i = 0; i < oprsz / 8; i++) { 2739 Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status); 2740 vec_update_fcsr0(env, GETPC()); 2741 } 2742 } 2743 2744 #define FCVT_2OP(NAME, BIT, E, MODE) \ 2745 void HELPER(NAME)(void *vd, void *vj, \ 2746 CPULoongArchState *env, uint32_t desc) \ 2747 { \ 2748 int i; \ 2749 VReg *Vd = (VReg *)vd; \ 2750 VReg *Vj = (VReg *)vj; \ 2751 int oprsz = simd_oprsz(desc); \ 2752 \ 2753 vec_clear_cause(env); \ 2754 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 2755 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2756 set_float_rounding_mode(MODE, &env->fp_status); \ 2757 Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \ 2758 set_float_rounding_mode(old_mode, &env->fp_status); \ 2759 vec_update_fcsr0(env, GETPC()); \ 2760 } \ 2761 } 2762 2763 FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even) 2764 FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even) 2765 FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero) 2766 FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero) 2767 FCVT_2OP(vfrintrp_s, 32, UW, float_round_up) 2768 FCVT_2OP(vfrintrp_d, 64, UD, float_round_up) 2769 FCVT_2OP(vfrintrm_s, 32, UW, float_round_down) 2770 FCVT_2OP(vfrintrm_d, 64, UD, float_round_down) 2771 2772 #define FTINT(NAME, FMT1, FMT2, T1, T2, MODE) \ 2773 static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj) \ 2774 { \ 2775 T2 fd; \ 2776 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ 2777 \ 2778 set_float_rounding_mode(MODE, &env->fp_status); \ 2779 fd = do_## FMT1 ##_to_## FMT2(env, fj); \ 2780 set_float_rounding_mode(old_mode, &env->fp_status); \ 2781 return fd; \ 2782 } 2783 2784 #define DO_FTINT(FMT1, FMT2, T1, T2) \ 2785 static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \ 2786 { \ 2787 T2 fd; \ 2788 \ 2789 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2790 if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \ 2791 if (FMT1 ##_is_any_nan(fj)) { \ 2792 fd = 0; \ 2793 } \ 2794 } \ 2795 vec_update_fcsr0(env, GETPC()); \ 2796 return fd; \ 2797 } 2798 2799 DO_FTINT(float32, int32, uint32_t, uint32_t) 2800 DO_FTINT(float64, int64, uint64_t, uint64_t) 2801 DO_FTINT(float32, uint32, uint32_t, uint32_t) 2802 DO_FTINT(float64, uint64, uint64_t, uint64_t) 2803 DO_FTINT(float64, int32, uint64_t, uint32_t) 2804 DO_FTINT(float32, int64, uint32_t, uint64_t) 2805 2806 FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even) 2807 FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even) 2808 FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up) 2809 FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up) 2810 FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero) 2811 FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero) 2812 FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down) 2813 FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down) 2814 2815 DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s) 2816 DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d) 2817 DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s) 2818 DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d) 2819 DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s) 2820 
DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)

FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)

DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)

FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)

#define FTINT_W_D(NAME, FN)                                               \
void HELPER(NAME)(void *vd, void *vj, void *vk,                           \
                  CPULoongArchState *env, uint32_t desc)                  \
{                                                                         \
    int i, j, ofs;                                                        \
    VReg temp = {};                                                       \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg *Vk = (VReg *)vk;                                                \
    int oprsz = simd_oprsz(desc);                                         \
                                                                          \
    ofs = LSX_LEN / 64;                                                   \
    vec_clear_cause(env);                                                 \
    for (i = 0; i < oprsz / 16; i++) {                                    \
        for (j = 0; j < ofs; j++) {                                       \
            temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \
            temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i));       \
        }                                                                 \
    }                                                                     \
    *Vd = temp;                                                           \
}

FTINT_W_D(vftint_w_d, do_float64_to_int32)
FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)

FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)

#define FTINTL_L_S(NAME, FN)                                              \
void HELPER(NAME)(void *vd, void *vj,                                     \
                  CPULoongArchState *env, uint32_t desc)                  \
{                                                                         \
    int i, j, ofs;                                                        \
    VReg temp = {};                                                       \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    int oprsz = simd_oprsz(desc);                                         \
                                                                          \
    ofs = LSX_LEN / 64;                                                   \
    vec_clear_cause(env);                                                 \
    for (i = 0; i < oprsz / 16; i++) {                                    \
        for (j = 0; j < ofs; j++) {                                       \
            temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i));       \
        }                                                                 \
    }                                                                     \
    *Vd = temp;                                                           \
}

FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)

#define FTINTH_L_S(NAME, FN)                                              \
void HELPER(NAME)(void *vd, void *vj,                                     \
                  CPULoongArchState *env, uint32_t desc)                  \
{                                                                         \
    int i, j, ofs;                                                        \
    VReg temp = {};                                                       \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    int oprsz = simd_oprsz(desc);                                         \
                                                                          \
    ofs = LSX_LEN / 64;                                                   \
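    /* ofs int64 results per 128-bit lane, from the float32s in its high half */ \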
vec_clear_cause(env); \ 2914 for (i = 0; i < oprsz / 16; i++) { \ 2915 for (j = 0; j < ofs; j++) { \ 2916 temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \ 2917 } \ 2918 } \ 2919 *Vd = temp; \ 2920 } 2921 2922 FTINTH_L_S(vftinth_l_s, do_float32_to_int64) 2923 FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s) 2924 FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s) 2925 FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s) 2926 FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s) 2927 2928 #define FFINT(NAME, FMT1, FMT2, T1, T2) \ 2929 static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \ 2930 { \ 2931 T2 fd; \ 2932 \ 2933 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ 2934 vec_update_fcsr0(env, GETPC()); \ 2935 return fd; \ 2936 } 2937 2938 FFINT(s_w, int32, float32, int32_t, uint32_t) 2939 FFINT(d_l, int64, float64, int64_t, uint64_t) 2940 FFINT(s_wu, uint32, float32, uint32_t, uint32_t) 2941 FFINT(d_lu, uint64, float64, uint64_t, uint64_t) 2942 2943 DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w) 2944 DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l) 2945 DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu) 2946 DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu) 2947 2948 void HELPER(vffintl_d_w)(void *vd, void *vj, 2949 CPULoongArchState *env, uint32_t desc) 2950 { 2951 int i, j, ofs; 2952 VReg temp = {}; 2953 VReg *Vd = (VReg *)vd; 2954 VReg *Vj = (VReg *)vj; 2955 int oprsz = simd_oprsz(desc); 2956 2957 ofs = LSX_LEN / 64; 2958 vec_clear_cause(env); 2959 for (i = 0; i < oprsz / 16; i++) { 2960 for (j = 0; j < ofs; j++) { 2961 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i), 2962 &env->fp_status); 2963 } 2964 vec_update_fcsr0(env, GETPC()); 2965 } 2966 *Vd = temp; 2967 } 2968 2969 void HELPER(vffinth_d_w)(void *vd, void *vj, 2970 CPULoongArchState *env, uint32_t desc) 2971 { 2972 int i, j, ofs; 2973 VReg temp = {}; 2974 VReg *Vd = (VReg *)vd; 2975 VReg *Vj = (VReg *)vj; 2976 int oprsz = simd_oprsz(desc); 2977 2978 ofs = LSX_LEN / 64; 2979 vec_clear_cause(env); 2980 for (i = 0; i < oprsz /16; i++) { 2981 for (j = 0; j < ofs; j++) { 2982 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)), 2983 &env->fp_status); 2984 } 2985 vec_update_fcsr0(env, GETPC()); 2986 } 2987 *Vd = temp; 2988 } 2989 2990 void HELPER(vffint_s_l)(void *vd, void *vj, void *vk, 2991 CPULoongArchState *env, uint32_t desc) 2992 { 2993 int i, j, ofs; 2994 VReg temp = {}; 2995 VReg *Vd = (VReg *)vd; 2996 VReg *Vj = (VReg *)vj; 2997 VReg *Vk = (VReg *)vk; 2998 int oprsz = simd_oprsz(desc); 2999 3000 ofs = LSX_LEN / 64; 3001 vec_clear_cause(env); 3002 for (i = 0; i < oprsz / 16; i++) { 3003 for (j = 0; j < ofs; j++) { 3004 temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i), 3005 &env->fp_status); 3006 temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i), 3007 &env->fp_status); 3008 } 3009 vec_update_fcsr0(env, GETPC()); 3010 } 3011 *Vd = temp; 3012 } 3013 3014 #define VCMPI(NAME, BIT, E, DO_OP) \ 3015 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3016 { \ 3017 int i; \ 3018 VReg *Vd = (VReg *)vd; \ 3019 VReg *Vj = (VReg *)vj; \ 3020 typedef __typeof(Vd->E(0)) TD; \ 3021 int oprsz = simd_oprsz(desc); \ 3022 \ 3023 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3024 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ 3025 } \ 3026 } 3027 3028 VCMPI(vseqi_b, 8, B, VSEQ) 3029 VCMPI(vseqi_h, 16, H, VSEQ) 3030 VCMPI(vseqi_w, 32, W, VSEQ) 3031 VCMPI(vseqi_d, 64, D, VSEQ) 3032 VCMPI(vslei_b, 8, B, VSLE) 3033 VCMPI(vslei_h, 16, H, VSLE) 3034 VCMPI(vslei_w, 32, W, VSLE) 3035 VCMPI(vslei_d, 64, D, VSLE) 3036 
VCMPI(vslei_bu, 8, UB, VSLE) 3037 VCMPI(vslei_hu, 16, UH, VSLE) 3038 VCMPI(vslei_wu, 32, UW, VSLE) 3039 VCMPI(vslei_du, 64, UD, VSLE) 3040 VCMPI(vslti_b, 8, B, VSLT) 3041 VCMPI(vslti_h, 16, H, VSLT) 3042 VCMPI(vslti_w, 32, W, VSLT) 3043 VCMPI(vslti_d, 64, D, VSLT) 3044 VCMPI(vslti_bu, 8, UB, VSLT) 3045 VCMPI(vslti_hu, 16, UH, VSLT) 3046 VCMPI(vslti_wu, 32, UW, VSLT) 3047 VCMPI(vslti_du, 64, UD, VSLT) 3048 3049 static uint64_t vfcmp_common(CPULoongArchState *env, 3050 FloatRelation cmp, uint32_t flags) 3051 { 3052 uint64_t ret = 0; 3053 3054 switch (cmp) { 3055 case float_relation_less: 3056 ret = (flags & FCMP_LT); 3057 break; 3058 case float_relation_equal: 3059 ret = (flags & FCMP_EQ); 3060 break; 3061 case float_relation_greater: 3062 ret = (flags & FCMP_GT); 3063 break; 3064 case float_relation_unordered: 3065 ret = (flags & FCMP_UN); 3066 break; 3067 default: 3068 g_assert_not_reached(); 3069 } 3070 3071 if (ret) { 3072 ret = -1; 3073 } 3074 3075 return ret; 3076 } 3077 3078 #define VFCMP(NAME, BIT, E, FN) \ 3079 void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ 3080 uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \ 3081 { \ 3082 int i; \ 3083 VReg t; \ 3084 VReg *Vd = &(env->fpr[vd].vreg); \ 3085 VReg *Vj = &(env->fpr[vj].vreg); \ 3086 VReg *Vk = &(env->fpr[vk].vreg); \ 3087 \ 3088 vec_clear_cause(env); \ 3089 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3090 FloatRelation cmp; \ 3091 cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ 3092 t.E(i) = vfcmp_common(env, cmp, flags); \ 3093 vec_update_fcsr0(env, GETPC()); \ 3094 } \ 3095 *Vd = t; \ 3096 } 3097 3098 VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet) 3099 VFCMP(vfcmp_s_s, 32, UW, float32_compare) 3100 VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet) 3101 VFCMP(vfcmp_s_d, 64, UD, float64_compare) 3102 3103 void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3104 { 3105 int i; 3106 VReg *Vd = (VReg *)vd; 3107 VReg *Vj = (VReg *)vj; 3108 3109 for (i = 0; i < simd_oprsz(desc); i++) { 3110 Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm); 3111 } 3112 } 3113 3114 /* Copy from target/arm/tcg/sve_helper.c */ 3115 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) 3116 { 3117 int bits = 8 << esz; 3118 uint64_t ones = dup_const(esz, 1); 3119 uint64_t signs = ones << (bits - 1); 3120 uint64_t cmp0, cmp1; 3121 3122 cmp1 = dup_const(esz, n); 3123 cmp0 = cmp1 ^ m0; 3124 cmp1 = cmp1 ^ m1; 3125 cmp0 = (cmp0 - ones) & ~cmp0; 3126 cmp1 = (cmp1 - ones) & ~cmp1; 3127 return (cmp0 | cmp1) & signs; 3128 } 3129 3130 #define SETANYEQZ(NAME, MO) \ 3131 void HELPER(NAME)(CPULoongArchState *env, \ 3132 uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3133 { \ 3134 VReg *Vj = &(env->fpr[vj].vreg); \ 3135 \ 3136 env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3137 if (oprsz == 32) { \ 3138 env->cf[cd & 0x7] = env->cf[cd & 0x7] || \ 3139 do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3140 } \ 3141 } 3142 3143 SETANYEQZ(vsetanyeqz_b, MO_8) 3144 SETANYEQZ(vsetanyeqz_h, MO_16) 3145 SETANYEQZ(vsetanyeqz_w, MO_32) 3146 SETANYEQZ(vsetanyeqz_d, MO_64) 3147 3148 #define SETALLNEZ(NAME, MO) \ 3149 void HELPER(NAME)(CPULoongArchState *env, \ 3150 uint32_t oprsz, uint32_t cd, uint32_t vj) \ 3151 { \ 3152 VReg *Vj = &(env->fpr[vj].vreg); \ 3153 \ 3154 env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO); \ 3155 if (oprsz == 32) { \ 3156 env->cf[cd & 0x7] = env->cf[cd & 0x7] && \ 3157 !do_match2(0, Vj->D(2), Vj->D(3), MO); \ 3158 } \ 3159 } 3160 3161 SETALLNEZ(vsetallnez_b, MO_8) 3162 
SETALLNEZ(vsetallnez_h, MO_16) 3163 SETALLNEZ(vsetallnez_w, MO_32) 3164 SETALLNEZ(vsetallnez_d, MO_64) 3165 3166 #define XVINSVE0(NAME, E, MASK) \ 3167 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3168 { \ 3169 VReg *Vd = (VReg *)vd; \ 3170 VReg *Vj = (VReg *)vj; \ 3171 Vd->E(imm & MASK) = Vj->E(0); \ 3172 } 3173 3174 XVINSVE0(xvinsve0_w, W, 0x7) 3175 XVINSVE0(xvinsve0_d, D, 0x3) 3176 3177 #define XVPICKVE(NAME, E, BIT, MASK) \ 3178 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3179 { \ 3180 int i; \ 3181 VReg *Vd = (VReg *)vd; \ 3182 VReg *Vj = (VReg *)vj; \ 3183 int oprsz = simd_oprsz(desc); \ 3184 \ 3185 Vd->E(0) = Vj->E(imm & MASK); \ 3186 for (i = 1; i < oprsz / (BIT / 8); i++) { \ 3187 Vd->E(i) = 0; \ 3188 } \ 3189 } 3190 3191 XVPICKVE(xvpickve_w, W, 32, 0x7) 3192 XVPICKVE(xvpickve_d, D, 64, 0x3) 3193 3194 #define VPACKEV(NAME, BIT, E) \ 3195 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3196 { \ 3197 int i; \ 3198 VReg temp = {}; \ 3199 VReg *Vd = (VReg *)vd; \ 3200 VReg *Vj = (VReg *)vj; \ 3201 VReg *Vk = (VReg *)vk; \ 3202 int oprsz = simd_oprsz(desc); \ 3203 \ 3204 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3205 temp.E(2 * i + 1) = Vj->E(2 * i); \ 3206 temp.E(2 *i) = Vk->E(2 * i); \ 3207 } \ 3208 *Vd = temp; \ 3209 } 3210 3211 VPACKEV(vpackev_b, 16, B) 3212 VPACKEV(vpackev_h, 32, H) 3213 VPACKEV(vpackev_w, 64, W) 3214 VPACKEV(vpackev_d, 128, D) 3215 3216 #define VPACKOD(NAME, BIT, E) \ 3217 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3218 { \ 3219 int i; \ 3220 VReg temp = {}; \ 3221 VReg *Vd = (VReg *)vd; \ 3222 VReg *Vj = (VReg *)vj; \ 3223 VReg *Vk = (VReg *)vk; \ 3224 int oprsz = simd_oprsz(desc); \ 3225 \ 3226 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3227 temp.E(2 * i + 1) = Vj->E(2 * i + 1); \ 3228 temp.E(2 * i) = Vk->E(2 * i + 1); \ 3229 } \ 3230 *Vd = temp; \ 3231 } 3232 3233 VPACKOD(vpackod_b, 16, B) 3234 VPACKOD(vpackod_h, 32, H) 3235 VPACKOD(vpackod_w, 64, W) 3236 VPACKOD(vpackod_d, 128, D) 3237 3238 #define VPICKEV(NAME, BIT, E) \ 3239 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3240 { \ 3241 int i, j, ofs; \ 3242 VReg temp = {}; \ 3243 VReg *Vd = (VReg *)vd; \ 3244 VReg *Vj = (VReg *)vj; \ 3245 VReg *Vk = (VReg *)vk; \ 3246 int oprsz = simd_oprsz(desc); \ 3247 \ 3248 ofs = LSX_LEN / BIT; \ 3249 for (i = 0; i < oprsz / 16; i++) { \ 3250 for (j = 0; j < ofs; j++) { \ 3251 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \ 3252 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i)); \ 3253 } \ 3254 } \ 3255 *Vd = temp; \ 3256 } 3257 3258 VPICKEV(vpickev_b, 16, B) 3259 VPICKEV(vpickev_h, 32, H) 3260 VPICKEV(vpickev_w, 64, W) 3261 VPICKEV(vpickev_d, 128, D) 3262 3263 #define VPICKOD(NAME, BIT, E) \ 3264 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3265 { \ 3266 int i, j, ofs; \ 3267 VReg temp = {}; \ 3268 VReg *Vd = (VReg *)vd; \ 3269 VReg *Vj = (VReg *)vj; \ 3270 VReg *Vk = (VReg *)vk; \ 3271 int oprsz = simd_oprsz(desc); \ 3272 \ 3273 ofs = LSX_LEN / BIT; \ 3274 for (i = 0; i < oprsz / 16; i++) { \ 3275 for (j = 0; j < ofs; j++) { \ 3276 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \ 3277 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1); \ 3278 } \ 3279 } \ 3280 *Vd = temp; \ 3281 } 3282 3283 VPICKOD(vpickod_b, 16, B) 3284 VPICKOD(vpickod_h, 32, H) 3285 VPICKOD(vpickod_w, 64, W) 3286 VPICKOD(vpickod_d, 128, D) 3287 3288 #define VILVL(NAME, BIT, E) \ 3289 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t 
desc) \ 3290 { \ 3291 int i, j, ofs; \ 3292 VReg temp = {}; \ 3293 VReg *Vd = (VReg *)vd; \ 3294 VReg *Vj = (VReg *)vj; \ 3295 VReg *Vk = (VReg *)vk; \ 3296 int oprsz = simd_oprsz(desc); \ 3297 \ 3298 ofs = LSX_LEN / BIT; \ 3299 for (i = 0; i < oprsz / 16; i++) { \ 3300 for (j = 0; j < ofs; j++) { \ 3301 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \ 3302 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i); \ 3303 } \ 3304 } \ 3305 *Vd = temp; \ 3306 } 3307 3308 VILVL(vilvl_b, 16, B) 3309 VILVL(vilvl_h, 32, H) 3310 VILVL(vilvl_w, 64, W) 3311 VILVL(vilvl_d, 128, D) 3312 3313 #define VILVH(NAME, BIT, E) \ 3314 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3315 { \ 3316 int i, j, ofs; \ 3317 VReg temp = {}; \ 3318 VReg *Vd = (VReg *)vd; \ 3319 VReg *Vj = (VReg *)vj; \ 3320 VReg *Vk = (VReg *)vk; \ 3321 int oprsz = simd_oprsz(desc); \ 3322 \ 3323 ofs = LSX_LEN / BIT; \ 3324 for (i = 0; i < oprsz / 16; i++) { \ 3325 for (j = 0; j < ofs; j++) { \ 3326 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \ 3327 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1)); \ 3328 } \ 3329 } \ 3330 *Vd = temp; \ 3331 } 3332 3333 VILVH(vilvh_b, 16, B) 3334 VILVH(vilvh_h, 32, H) 3335 VILVH(vilvh_w, 64, W) 3336 VILVH(vilvh_d, 128, D) 3337 3338 void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc) 3339 { 3340 int i, j, m; 3341 VReg temp = {}; 3342 VReg *Vd = (VReg *)vd; 3343 VReg *Vj = (VReg *)vj; 3344 VReg *Vk = (VReg *)vk; 3345 VReg *Va = (VReg *)va; 3346 int oprsz = simd_oprsz(desc); 3347 3348 m = LSX_LEN / 8; 3349 for (i = 0; i < (oprsz / 16) * m; i++) { 3350 j = i < m ? 0 : 1; 3351 uint64_t k = (uint8_t)Va->B(i) % (2 * m); 3352 temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m); 3353 } 3354 *Vd = temp; 3355 } 3356 3357 #define VSHUF(NAME, BIT, E) \ 3358 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ 3359 { \ 3360 int i, j, m; \ 3361 VReg temp = {}; \ 3362 VReg *Vd = (VReg *)vd; \ 3363 VReg *Vj = (VReg *)vj; \ 3364 VReg *Vk = (VReg *)vk; \ 3365 int oprsz = simd_oprsz(desc); \ 3366 \ 3367 m = LSX_LEN / BIT; \ 3368 for (i = 0; i < (oprsz / 16) * m; i++) { \ 3369 j = i < m ? 0 : 1; \ 3370 uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m); \ 3371 temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \ 3372 } \ 3373 *Vd = temp; \ 3374 } 3375 3376 VSHUF(vshuf_h, 16, H) 3377 VSHUF(vshuf_w, 32, W) 3378 VSHUF(vshuf_d, 64, D) 3379 3380 #define VSHUF4I(NAME, BIT, E) \ 3381 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ 3382 { \ 3383 int i, j, max; \ 3384 VReg temp = {}; \ 3385 VReg *Vd = (VReg *)vd; \ 3386 VReg *Vj = (VReg *)vj; \ 3387 int oprsz = simd_oprsz(desc); \ 3388 \ 3389 max = LSX_LEN / BIT; \ 3390 for (i = 0; i < oprsz / (BIT / 8); i++) { \ 3391 j = i < max ? 1 : 2; \ 3392 temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \ 3393 } \ 3394 *Vd = temp; \ 3395 } 3396 3397 VSHUF4I(vshuf4i_b, 8, B) 3398 VSHUF4I(vshuf4i_h, 16, H) 3399 VSHUF4I(vshuf4i_w, 32, W) 3400 3401 void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) 3402 { 3403 int i; 3404 VReg temp = {}; 3405 VReg *Vd = (VReg *)vd; 3406 VReg *Vj = (VReg *)vj; 3407 int oprsz = simd_oprsz(desc); 3408 3409 for (i = 0; i < oprsz / 16; i++) { 3410 temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i); 3411 temp.D(2 * i + 1) = (imm & 8 ? 
                                  Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
    }
    *Vd = temp;
}

void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
{
    int i, m;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    VReg *Vk = (VReg *)vk;

    m = LASX_LEN / 32;
    for (i = 0; i < m; i++) {
        uint64_t k = (uint8_t)Vk->W(i) % 8;
        temp.W(i) = Vj->W(k);
    }
    *Vd = temp;
}

void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    int i;
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;
    int oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz / 16; i++) {
        temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
        temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
        temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
        temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
    }
    *Vd = temp;
}

void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    VReg temp = {};
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;

    temp.D(0) = Vj->D(imm & 0x3);
    temp.D(1) = Vj->D((imm >> 2) & 0x3);
    temp.D(2) = Vj->D((imm >> 4) & 0x3);
    temp.D(3) = Vj->D((imm >> 6) & 0x3);
    *Vd = temp;
}

void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
{
    int i;
    VReg temp;
    VReg *Vd = (VReg *)vd;
    VReg *Vj = (VReg *)vj;

    for (i = 0; i < 2; i++, imm >>= 4) {
        temp.Q(i) = (imm & 2 ? Vd : Vj)->Q(imm & 1);
    }
    *Vd = temp;
}

#define VEXTRINS(NAME, BIT, E, MASK)                               \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{                                                                  \
    int i, ins, extr, max;                                         \
    VReg *Vd = (VReg *)vd;                                         \
    VReg *Vj = (VReg *)vj;                                         \
    int oprsz = simd_oprsz(desc);                                  \
                                                                   \
    max = LSX_LEN / BIT;                                           \
    ins = (imm >> 4) & MASK;                                       \
    extr = imm & MASK;                                             \
    for (i = 0; i < oprsz / 16; i++) {                             \
        Vd->E(ins + i * max) = Vj->E(extr + i * max);              \
    }                                                              \
}

VEXTRINS(vextrins_b, 8, B, 0xf)
VEXTRINS(vextrins_h, 16, H, 0x7)
VEXTRINS(vextrins_w, 32, W, 0x3)
VEXTRINS(vextrins_d, 64, D, 0x1)
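/*
 * Worked example of the VEXTRINS immediate encoding above: the low four
 * bits of imm select the source element in Vj and the next four bits the
 * destination element in Vd, each masked down to the element range of one
 * 128-bit lane.  With vextrins_w and imm = 0x21, ins = 2 and extr = 1, so
 * Vd->W(2 + 4 * i) = Vj->W(1 + 4 * i) in every lane while the remaining
 * word elements of Vd are left unchanged.
 */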