1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg/tcg.h" 22 #include "tcg/tcg-op.h" 23 #include "tcg/tcg-op-gvec.h" 24 #include "tcg/tcg-gvec-desc.h" 25 26 #define MAX_UNROLL 4 27 28 #ifdef CONFIG_DEBUG_TCG 29 static const TCGOpcode vecop_list_empty[1] = { 0 }; 30 #else 31 #define vecop_list_empty NULL 32 #endif 33 34 35 /* Verify vector size and alignment rules. OFS should be the OR of all 36 of the operand offsets so that we can check them all at once. */ 37 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 38 { 39 uint32_t max_align; 40 41 switch (oprsz) { 42 case 8: 43 case 16: 44 case 32: 45 tcg_debug_assert(oprsz <= maxsz); 46 break; 47 default: 48 tcg_debug_assert(oprsz == maxsz); 49 break; 50 } 51 tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS)); 52 53 max_align = maxsz >= 16 ? 15 : 7; 54 tcg_debug_assert((maxsz & max_align) == 0); 55 tcg_debug_assert((ofs & max_align) == 0); 56 } 57 58 /* Verify vector overlap rules for two operands. */ 59 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 60 { 61 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 62 } 63 64 /* Verify vector overlap rules for three operands. */ 65 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 66 { 67 check_overlap_2(d, a, s); 68 check_overlap_2(d, b, s); 69 check_overlap_2(a, b, s); 70 } 71 72 /* Verify vector overlap rules for four operands. */ 73 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 74 uint32_t c, uint32_t s) 75 { 76 check_overlap_2(d, a, s); 77 check_overlap_2(d, b, s); 78 check_overlap_2(d, c, s); 79 check_overlap_2(a, b, s); 80 check_overlap_2(a, c, s); 81 check_overlap_2(b, c, s); 82 } 83 84 /* Create a descriptor from components. */ 85 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 86 { 87 uint32_t desc = 0; 88 89 check_size_align(oprsz, maxsz, 0); 90 tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); 91 92 oprsz = (oprsz / 8) - 1; 93 maxsz = (maxsz / 8) - 1; 94 95 /* 96 * We have just asserted in check_size_align that either 97 * oprsz is {8,16,32} or matches maxsz. Encode the final 98 * case with '2', as that would otherwise map to 24. 99 */ 100 if (oprsz == maxsz) { 101 oprsz = 2; 102 } 103 104 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 105 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 106 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 107 108 return desc; 109 } 110 111 /* Generate a call to a gvec-style helper with two vector operands. 
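   As an illustrative sketch only (the helper name is hypothetical), a front
   end that wants a 16-byte unary operation expanded entirely out of line
   would emit:

       tcg_gen_gvec_2_ool(dest_off, src_off, 16, 16, 0,
                          gen_helper_foo_frobnicate);

   where dest_off and src_off are byte offsets of the vector registers
   within CPUArchState, and the helper is typically declared with
   DEF_HELPER_FLAGS_3(foo_frobnicate, TCG_CALL_NO_RWG, void, ptr, ptr, i32).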
*/ 112 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 113 uint32_t oprsz, uint32_t maxsz, int32_t data, 114 gen_helper_gvec_2 *fn) 115 { 116 TCGv_ptr a0, a1; 117 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 118 119 a0 = tcg_temp_ebb_new_ptr(); 120 a1 = tcg_temp_ebb_new_ptr(); 121 122 tcg_gen_addi_ptr(a0, cpu_env, dofs); 123 tcg_gen_addi_ptr(a1, cpu_env, aofs); 124 125 fn(a0, a1, desc); 126 127 tcg_temp_free_ptr(a0); 128 tcg_temp_free_ptr(a1); 129 } 130 131 /* Generate a call to a gvec-style helper with two vector operands 132 and one scalar operand. */ 133 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 134 uint32_t oprsz, uint32_t maxsz, int32_t data, 135 gen_helper_gvec_2i *fn) 136 { 137 TCGv_ptr a0, a1; 138 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 139 140 a0 = tcg_temp_ebb_new_ptr(); 141 a1 = tcg_temp_ebb_new_ptr(); 142 143 tcg_gen_addi_ptr(a0, cpu_env, dofs); 144 tcg_gen_addi_ptr(a1, cpu_env, aofs); 145 146 fn(a0, a1, c, desc); 147 148 tcg_temp_free_ptr(a0); 149 tcg_temp_free_ptr(a1); 150 } 151 152 /* Generate a call to a gvec-style helper with three vector operands. */ 153 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 154 uint32_t oprsz, uint32_t maxsz, int32_t data, 155 gen_helper_gvec_3 *fn) 156 { 157 TCGv_ptr a0, a1, a2; 158 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 159 160 a0 = tcg_temp_ebb_new_ptr(); 161 a1 = tcg_temp_ebb_new_ptr(); 162 a2 = tcg_temp_ebb_new_ptr(); 163 164 tcg_gen_addi_ptr(a0, cpu_env, dofs); 165 tcg_gen_addi_ptr(a1, cpu_env, aofs); 166 tcg_gen_addi_ptr(a2, cpu_env, bofs); 167 168 fn(a0, a1, a2, desc); 169 170 tcg_temp_free_ptr(a0); 171 tcg_temp_free_ptr(a1); 172 tcg_temp_free_ptr(a2); 173 } 174 175 /* Generate a call to a gvec-style helper with four vector operands. */ 176 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 177 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 178 int32_t data, gen_helper_gvec_4 *fn) 179 { 180 TCGv_ptr a0, a1, a2, a3; 181 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 182 183 a0 = tcg_temp_ebb_new_ptr(); 184 a1 = tcg_temp_ebb_new_ptr(); 185 a2 = tcg_temp_ebb_new_ptr(); 186 a3 = tcg_temp_ebb_new_ptr(); 187 188 tcg_gen_addi_ptr(a0, cpu_env, dofs); 189 tcg_gen_addi_ptr(a1, cpu_env, aofs); 190 tcg_gen_addi_ptr(a2, cpu_env, bofs); 191 tcg_gen_addi_ptr(a3, cpu_env, cofs); 192 193 fn(a0, a1, a2, a3, desc); 194 195 tcg_temp_free_ptr(a0); 196 tcg_temp_free_ptr(a1); 197 tcg_temp_free_ptr(a2); 198 tcg_temp_free_ptr(a3); 199 } 200 201 /* Generate a call to a gvec-style helper with five vector operands. 
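   Whatever the operand count, the called helper receives only host pointers
   plus the 32-bit descriptor built by simd_desc() above, and recovers the
   geometry with simd_oprsz()/simd_maxsz()/simd_data().  A minimal
   two-operand helper body might look like this (a sketch, names
   hypothetical):

       void HELPER(foo_neg32)(void *d, void *a, uint32_t desc)
       {
           intptr_t i, oprsz = simd_oprsz(desc);

           for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
               *(uint32_t *)(d + i) = -*(uint32_t *)(a + i);
           }
           /* Bytes between oprsz and simd_maxsz(desc) must still be
              zeroed, as the inline expansions below do via expand_clr.  */
       }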
 */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}

/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.
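   In all of the _ptr variants the PTR argument is forwarded to the helper
   unchanged; in practice it is usually cpu_env itself or a pointer to
   per-CPU state such as a float_status block, giving the helper access to
   rounding modes and exception flags as well as the vector operands.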
 */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.
Return 0 if no vector type is selected. 433 */ 434 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 435 uint32_t size, bool prefer_i64) 436 { 437 /* 438 * Recall that ARM SVE allows vector sizes that are not a 439 * power of 2, but always a multiple of 16. The intent is 440 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 441 * It is hard to imagine a case in which v256 is supported 442 * but v128 is not, but check anyway. 443 * In addition, expand_clr needs to handle a multiple of 8. 444 */ 445 if (TCG_TARGET_HAS_v256 && 446 check_size_impl(size, 32) && 447 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 448 (!(size & 16) || 449 (TCG_TARGET_HAS_v128 && 450 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 451 (!(size & 8) || 452 (TCG_TARGET_HAS_v64 && 453 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 454 return TCG_TYPE_V256; 455 } 456 if (TCG_TARGET_HAS_v128 && 457 check_size_impl(size, 16) && 458 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 459 (!(size & 8) || 460 (TCG_TARGET_HAS_v64 && 461 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 462 return TCG_TYPE_V128; 463 } 464 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 465 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 466 return TCG_TYPE_V64; 467 } 468 return 0; 469 } 470 471 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 472 uint32_t maxsz, TCGv_vec t_vec) 473 { 474 uint32_t i = 0; 475 476 tcg_debug_assert(oprsz >= 8); 477 478 /* 479 * This may be expand_clr for the tail of an operation, e.g. 480 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 481 * are misaligned wrt the maximum vector size, so do that first. 482 */ 483 if (dofs & 8) { 484 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 485 i += 8; 486 } 487 488 switch (type) { 489 case TCG_TYPE_V256: 490 /* 491 * Recall that ARM SVE allows vector sizes that are not a 492 * power of 2, but always a multiple of 16. The intent is 493 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 494 */ 495 for (; i + 32 <= oprsz; i += 32) { 496 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 497 } 498 /* fallthru */ 499 case TCG_TYPE_V128: 500 for (; i + 16 <= oprsz; i += 16) { 501 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 502 } 503 break; 504 case TCG_TYPE_V64: 505 for (; i < oprsz; i += 8) { 506 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 507 } 508 break; 509 default: 510 g_assert_not_reached(); 511 } 512 513 if (oprsz < maxsz) { 514 expand_clr(dofs + oprsz, maxsz - oprsz); 515 } 516 } 517 518 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 519 * Only one of IN_32 or IN_64 may be set; 520 * IN_C is used if IN_32 and IN_64 are unset. 521 */ 522 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 523 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 524 uint64_t in_c) 525 { 526 TCGType type; 527 TCGv_i64 t_64; 528 TCGv_i32 t_32, t_desc; 529 TCGv_ptr t_ptr; 530 uint32_t i; 531 532 assert(vece <= (in_32 ? MO_32 : MO_64)); 533 assert(in_32 == NULL || in_64 == NULL); 534 535 /* If we're storing 0, expand oprsz to maxsz. */ 536 if (in_32 == NULL && in_64 == NULL) { 537 in_c = dup_const(vece, in_c); 538 if (in_c == 0) { 539 oprsz = maxsz; 540 vece = MO_8; 541 } else if (in_c == dup_const(MO_8, in_c)) { 542 vece = MO_8; 543 } 544 } 545 546 /* Implement inline with a vector type, if possible. 547 * Prefer integer when 64-bit host and no variable dup. 
548 */ 549 type = choose_vector_type(NULL, vece, oprsz, 550 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 551 && (in_64 == NULL || vece == MO_64))); 552 if (type != 0) { 553 TCGv_vec t_vec = tcg_temp_new_vec(type); 554 555 if (in_32) { 556 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 557 } else if (in_64) { 558 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 559 } else { 560 tcg_gen_dupi_vec(vece, t_vec, in_c); 561 } 562 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 563 tcg_temp_free_vec(t_vec); 564 return; 565 } 566 567 /* Otherwise, inline with an integer type, unless "large". */ 568 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 569 t_64 = NULL; 570 t_32 = NULL; 571 572 if (in_32) { 573 /* We are given a 32-bit variable input. For a 64-bit host, 574 use a 64-bit operation unless the 32-bit operation would 575 be simple enough. */ 576 if (TCG_TARGET_REG_BITS == 64 577 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 578 t_64 = tcg_temp_ebb_new_i64(); 579 tcg_gen_extu_i32_i64(t_64, in_32); 580 tcg_gen_dup_i64(vece, t_64, t_64); 581 } else { 582 t_32 = tcg_temp_ebb_new_i32(); 583 tcg_gen_dup_i32(vece, t_32, in_32); 584 } 585 } else if (in_64) { 586 /* We are given a 64-bit variable input. */ 587 t_64 = tcg_temp_ebb_new_i64(); 588 tcg_gen_dup_i64(vece, t_64, in_64); 589 } else { 590 /* We are given a constant input. */ 591 /* For 64-bit hosts, use 64-bit constants for "simple" constants 592 or when we'd need too many 32-bit stores, or when a 64-bit 593 constant is really required. */ 594 if (vece == MO_64 595 || (TCG_TARGET_REG_BITS == 64 596 && (in_c == 0 || in_c == -1 597 || !check_size_impl(oprsz, 4)))) { 598 t_64 = tcg_constant_i64(in_c); 599 } else { 600 t_32 = tcg_constant_i32(in_c); 601 } 602 } 603 604 /* Implement inline if we picked an implementation size above. */ 605 if (t_32) { 606 for (i = 0; i < oprsz; i += 4) { 607 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 608 } 609 tcg_temp_free_i32(t_32); 610 goto done; 611 } 612 if (t_64) { 613 for (i = 0; i < oprsz; i += 8) { 614 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 615 } 616 tcg_temp_free_i64(t_64); 617 goto done; 618 } 619 } 620 621 /* Otherwise implement out of line. */ 622 t_ptr = tcg_temp_ebb_new_ptr(); 623 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 624 625 /* 626 * This may be expand_clr for the tail of an operation, e.g. 627 * oprsz == 8 && maxsz == 64. The size of the clear is misaligned 628 * wrt simd_desc and will assert. Simply pass all replicated byte 629 * stores through to memset. 
630 */ 631 if (oprsz == maxsz && vece == MO_8) { 632 TCGv_ptr t_size = tcg_constant_ptr(oprsz); 633 TCGv_i32 t_val; 634 635 if (in_32) { 636 t_val = in_32; 637 } else if (in_64) { 638 t_val = tcg_temp_ebb_new_i32(); 639 tcg_gen_extrl_i64_i32(t_val, in_64); 640 } else { 641 t_val = tcg_constant_i32(in_c); 642 } 643 gen_helper_memset(t_ptr, t_ptr, t_val, t_size); 644 645 if (in_64) { 646 tcg_temp_free_i32(t_val); 647 } 648 tcg_temp_free_ptr(t_ptr); 649 return; 650 } 651 652 t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0)); 653 654 if (vece == MO_64) { 655 if (in_64) { 656 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 657 } else { 658 t_64 = tcg_constant_i64(in_c); 659 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 660 } 661 } else { 662 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 663 static dup_fn * const fns[3] = { 664 gen_helper_gvec_dup8, 665 gen_helper_gvec_dup16, 666 gen_helper_gvec_dup32 667 }; 668 669 if (in_32) { 670 fns[vece](t_ptr, t_desc, in_32); 671 } else if (in_64) { 672 t_32 = tcg_temp_ebb_new_i32(); 673 tcg_gen_extrl_i64_i32(t_32, in_64); 674 fns[vece](t_ptr, t_desc, t_32); 675 tcg_temp_free_i32(t_32); 676 } else { 677 if (vece == MO_8) { 678 in_c &= 0xff; 679 } else if (vece == MO_16) { 680 in_c &= 0xffff; 681 } 682 t_32 = tcg_constant_i32(in_c); 683 fns[vece](t_ptr, t_desc, t_32); 684 } 685 } 686 687 tcg_temp_free_ptr(t_ptr); 688 return; 689 690 done: 691 if (oprsz < maxsz) { 692 expand_clr(dofs + oprsz, maxsz - oprsz); 693 } 694 } 695 696 /* Likewise, but with zero. */ 697 static void expand_clr(uint32_t dofs, uint32_t maxsz) 698 { 699 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 700 } 701 702 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ 703 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 704 bool load_dest, void (*fni)(TCGv_i32, TCGv_i32)) 705 { 706 TCGv_i32 t0 = tcg_temp_new_i32(); 707 TCGv_i32 t1 = tcg_temp_new_i32(); 708 uint32_t i; 709 710 for (i = 0; i < oprsz; i += 4) { 711 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 712 if (load_dest) { 713 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 714 } 715 fni(t1, t0); 716 tcg_gen_st_i32(t1, cpu_env, dofs + i); 717 } 718 tcg_temp_free_i32(t0); 719 tcg_temp_free_i32(t1); 720 } 721 722 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 723 int32_t c, bool load_dest, 724 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 725 { 726 TCGv_i32 t0 = tcg_temp_new_i32(); 727 TCGv_i32 t1 = tcg_temp_new_i32(); 728 uint32_t i; 729 730 for (i = 0; i < oprsz; i += 4) { 731 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 732 if (load_dest) { 733 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 734 } 735 fni(t1, t0, c); 736 tcg_gen_st_i32(t1, cpu_env, dofs + i); 737 } 738 tcg_temp_free_i32(t0); 739 tcg_temp_free_i32(t1); 740 } 741 742 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 743 TCGv_i32 c, bool scalar_first, 744 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 745 { 746 TCGv_i32 t0 = tcg_temp_new_i32(); 747 TCGv_i32 t1 = tcg_temp_new_i32(); 748 uint32_t i; 749 750 for (i = 0; i < oprsz; i += 4) { 751 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 752 if (scalar_first) { 753 fni(t1, c, t0); 754 } else { 755 fni(t1, t0, c); 756 } 757 tcg_gen_st_i32(t1, cpu_env, dofs + i); 758 } 759 tcg_temp_free_i32(t0); 760 tcg_temp_free_i32(t1); 761 } 762 763 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
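   Each of these integer expanders walks the vectors in element-size steps
   and applies the per-element callback FNI; e.g. with oprsz == 16 the i32
   loops below run their body for i = 0, 4, 8 and 12.  LOAD_DEST is set by
   callers whose operation also consumes the previous destination value.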
 */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int32_t c,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
                                      int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.
 */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements. */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.
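   WRITE_AOFS additionally stores the first source back to AOFS after each
   element: the FNI callback may modify its T1 argument, which is how
   operations that also update their first input use these expanders.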
*/ 970 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 971 uint32_t cofs, uint32_t oprsz, bool write_aofs, 972 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 973 { 974 TCGv_i64 t0 = tcg_temp_new_i64(); 975 TCGv_i64 t1 = tcg_temp_new_i64(); 976 TCGv_i64 t2 = tcg_temp_new_i64(); 977 TCGv_i64 t3 = tcg_temp_new_i64(); 978 uint32_t i; 979 980 for (i = 0; i < oprsz; i += 8) { 981 tcg_gen_ld_i64(t1, cpu_env, aofs + i); 982 tcg_gen_ld_i64(t2, cpu_env, bofs + i); 983 tcg_gen_ld_i64(t3, cpu_env, cofs + i); 984 fni(t0, t1, t2, t3); 985 tcg_gen_st_i64(t0, cpu_env, dofs + i); 986 if (write_aofs) { 987 tcg_gen_st_i64(t1, cpu_env, aofs + i); 988 } 989 } 990 tcg_temp_free_i64(t3); 991 tcg_temp_free_i64(t2); 992 tcg_temp_free_i64(t1); 993 tcg_temp_free_i64(t0); 994 } 995 996 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 997 uint32_t cofs, uint32_t oprsz, int64_t c, 998 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, 999 int64_t)) 1000 { 1001 TCGv_i64 t0 = tcg_temp_new_i64(); 1002 TCGv_i64 t1 = tcg_temp_new_i64(); 1003 TCGv_i64 t2 = tcg_temp_new_i64(); 1004 TCGv_i64 t3 = tcg_temp_new_i64(); 1005 uint32_t i; 1006 1007 for (i = 0; i < oprsz; i += 8) { 1008 tcg_gen_ld_i64(t1, cpu_env, aofs + i); 1009 tcg_gen_ld_i64(t2, cpu_env, bofs + i); 1010 tcg_gen_ld_i64(t3, cpu_env, cofs + i); 1011 fni(t0, t1, t2, t3, c); 1012 tcg_gen_st_i64(t0, cpu_env, dofs + i); 1013 } 1014 tcg_temp_free_i64(t3); 1015 tcg_temp_free_i64(t2); 1016 tcg_temp_free_i64(t1); 1017 tcg_temp_free_i64(t0); 1018 } 1019 1020 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */ 1021 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1022 uint32_t oprsz, uint32_t tysz, TCGType type, 1023 bool load_dest, 1024 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 1025 { 1026 TCGv_vec t0 = tcg_temp_new_vec(type); 1027 TCGv_vec t1 = tcg_temp_new_vec(type); 1028 uint32_t i; 1029 1030 for (i = 0; i < oprsz; i += tysz) { 1031 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1032 if (load_dest) { 1033 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 1034 } 1035 fni(vece, t1, t0); 1036 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1037 } 1038 tcg_temp_free_vec(t0); 1039 tcg_temp_free_vec(t1); 1040 } 1041 1042 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 1043 using host vectors. 
*/ 1044 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1045 uint32_t oprsz, uint32_t tysz, TCGType type, 1046 int64_t c, bool load_dest, 1047 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 1048 { 1049 TCGv_vec t0 = tcg_temp_new_vec(type); 1050 TCGv_vec t1 = tcg_temp_new_vec(type); 1051 uint32_t i; 1052 1053 for (i = 0; i < oprsz; i += tysz) { 1054 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1055 if (load_dest) { 1056 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 1057 } 1058 fni(vece, t1, t0, c); 1059 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1060 } 1061 tcg_temp_free_vec(t0); 1062 tcg_temp_free_vec(t1); 1063 } 1064 1065 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1066 uint32_t oprsz, uint32_t tysz, TCGType type, 1067 TCGv_vec c, bool scalar_first, 1068 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1069 { 1070 TCGv_vec t0 = tcg_temp_new_vec(type); 1071 TCGv_vec t1 = tcg_temp_new_vec(type); 1072 uint32_t i; 1073 1074 for (i = 0; i < oprsz; i += tysz) { 1075 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1076 if (scalar_first) { 1077 fni(vece, t1, c, t0); 1078 } else { 1079 fni(vece, t1, t0, c); 1080 } 1081 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1082 } 1083 tcg_temp_free_vec(t0); 1084 tcg_temp_free_vec(t1); 1085 } 1086 1087 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 1088 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1089 uint32_t bofs, uint32_t oprsz, 1090 uint32_t tysz, TCGType type, bool load_dest, 1091 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1092 { 1093 TCGv_vec t0 = tcg_temp_new_vec(type); 1094 TCGv_vec t1 = tcg_temp_new_vec(type); 1095 TCGv_vec t2 = tcg_temp_new_vec(type); 1096 uint32_t i; 1097 1098 for (i = 0; i < oprsz; i += tysz) { 1099 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1100 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1101 if (load_dest) { 1102 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1103 } 1104 fni(vece, t2, t0, t1); 1105 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1106 } 1107 tcg_temp_free_vec(t2); 1108 tcg_temp_free_vec(t1); 1109 tcg_temp_free_vec(t0); 1110 } 1111 1112 /* 1113 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1114 * using host vectors. 1115 */ 1116 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1117 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1118 TCGType type, int64_t c, bool load_dest, 1119 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1120 int64_t)) 1121 { 1122 TCGv_vec t0 = tcg_temp_new_vec(type); 1123 TCGv_vec t1 = tcg_temp_new_vec(type); 1124 TCGv_vec t2 = tcg_temp_new_vec(type); 1125 uint32_t i; 1126 1127 for (i = 0; i < oprsz; i += tysz) { 1128 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1129 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1130 if (load_dest) { 1131 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1132 } 1133 fni(vece, t2, t0, t1, c); 1134 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1135 } 1136 tcg_temp_free_vec(t0); 1137 tcg_temp_free_vec(t1); 1138 tcg_temp_free_vec(t2); 1139 } 1140 1141 /* Expand OPSZ bytes worth of four-operand operations using host vectors. 
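   TYSZ is the byte width of the host vector TYPE actually chosen: the
   callers below always pass 32 with TCG_TYPE_V256, 16 with TCG_TYPE_V128
   and 8 with TCG_TYPE_V64.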
*/ 1142 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1143 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1144 uint32_t tysz, TCGType type, bool write_aofs, 1145 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1146 TCGv_vec, TCGv_vec)) 1147 { 1148 TCGv_vec t0 = tcg_temp_new_vec(type); 1149 TCGv_vec t1 = tcg_temp_new_vec(type); 1150 TCGv_vec t2 = tcg_temp_new_vec(type); 1151 TCGv_vec t3 = tcg_temp_new_vec(type); 1152 uint32_t i; 1153 1154 for (i = 0; i < oprsz; i += tysz) { 1155 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1156 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1157 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1158 fni(vece, t0, t1, t2, t3); 1159 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1160 if (write_aofs) { 1161 tcg_gen_st_vec(t1, cpu_env, aofs + i); 1162 } 1163 } 1164 tcg_temp_free_vec(t3); 1165 tcg_temp_free_vec(t2); 1166 tcg_temp_free_vec(t1); 1167 tcg_temp_free_vec(t0); 1168 } 1169 1170 /* 1171 * Expand OPSZ bytes worth of four-vector operands and an immediate operand 1172 * using host vectors. 1173 */ 1174 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1175 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1176 uint32_t tysz, TCGType type, int64_t c, 1177 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1178 TCGv_vec, TCGv_vec, int64_t)) 1179 { 1180 TCGv_vec t0 = tcg_temp_new_vec(type); 1181 TCGv_vec t1 = tcg_temp_new_vec(type); 1182 TCGv_vec t2 = tcg_temp_new_vec(type); 1183 TCGv_vec t3 = tcg_temp_new_vec(type); 1184 uint32_t i; 1185 1186 for (i = 0; i < oprsz; i += tysz) { 1187 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1188 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1189 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1190 fni(vece, t0, t1, t2, t3, c); 1191 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1192 } 1193 tcg_temp_free_vec(t3); 1194 tcg_temp_free_vec(t2); 1195 tcg_temp_free_vec(t1); 1196 tcg_temp_free_vec(t0); 1197 } 1198 1199 /* Expand a vector two-operand operation. */ 1200 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1201 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1202 { 1203 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1204 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1205 TCGType type; 1206 uint32_t some; 1207 1208 check_size_align(oprsz, maxsz, dofs | aofs); 1209 check_overlap_2(dofs, aofs, maxsz); 1210 1211 type = 0; 1212 if (g->fniv) { 1213 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1214 } 1215 switch (type) { 1216 case TCG_TYPE_V256: 1217 /* Recall that ARM SVE allows vector sizes that are not a 1218 * power of 2, but always a multiple of 16. The intent is 1219 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
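         * For example, oprsz == 80 gives some == 64 below: two V256
         * operations cover the first 64 bytes and the remaining 16
         * bytes fall through to the TCG_TYPE_V128 case.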
1220 */ 1221 some = QEMU_ALIGN_DOWN(oprsz, 32); 1222 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1223 g->load_dest, g->fniv); 1224 if (some == oprsz) { 1225 break; 1226 } 1227 dofs += some; 1228 aofs += some; 1229 oprsz -= some; 1230 maxsz -= some; 1231 /* fallthru */ 1232 case TCG_TYPE_V128: 1233 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1234 g->load_dest, g->fniv); 1235 break; 1236 case TCG_TYPE_V64: 1237 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1238 g->load_dest, g->fniv); 1239 break; 1240 1241 case 0: 1242 if (g->fni8 && check_size_impl(oprsz, 8)) { 1243 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8); 1244 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1245 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4); 1246 } else { 1247 assert(g->fno != NULL); 1248 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1249 oprsz = maxsz; 1250 } 1251 break; 1252 1253 default: 1254 g_assert_not_reached(); 1255 } 1256 tcg_swap_vecop_list(hold_list); 1257 1258 if (oprsz < maxsz) { 1259 expand_clr(dofs + oprsz, maxsz - oprsz); 1260 } 1261 } 1262 1263 /* Expand a vector operation with two vectors and an immediate. */ 1264 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1265 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1266 { 1267 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1268 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1269 TCGType type; 1270 uint32_t some; 1271 1272 check_size_align(oprsz, maxsz, dofs | aofs); 1273 check_overlap_2(dofs, aofs, maxsz); 1274 1275 type = 0; 1276 if (g->fniv) { 1277 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1278 } 1279 switch (type) { 1280 case TCG_TYPE_V256: 1281 /* Recall that ARM SVE allows vector sizes that are not a 1282 * power of 2, but always a multiple of 16. The intent is 1283 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1284 */ 1285 some = QEMU_ALIGN_DOWN(oprsz, 32); 1286 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1287 c, g->load_dest, g->fniv); 1288 if (some == oprsz) { 1289 break; 1290 } 1291 dofs += some; 1292 aofs += some; 1293 oprsz -= some; 1294 maxsz -= some; 1295 /* fallthru */ 1296 case TCG_TYPE_V128: 1297 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1298 c, g->load_dest, g->fniv); 1299 break; 1300 case TCG_TYPE_V64: 1301 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1302 c, g->load_dest, g->fniv); 1303 break; 1304 1305 case 0: 1306 if (g->fni8 && check_size_impl(oprsz, 8)) { 1307 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1308 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1309 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1310 } else { 1311 if (g->fno) { 1312 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1313 } else { 1314 TCGv_i64 tcg_c = tcg_constant_i64(c); 1315 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1316 maxsz, c, g->fnoi); 1317 } 1318 oprsz = maxsz; 1319 } 1320 break; 1321 1322 default: 1323 g_assert_not_reached(); 1324 } 1325 tcg_swap_vecop_list(hold_list); 1326 1327 if (oprsz < maxsz) { 1328 expand_clr(dofs + oprsz, maxsz - oprsz); 1329 } 1330 } 1331 1332 /* Expand a vector operation with two vectors and a scalar. 
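   The scalar C is replicated across the element size (via tcg_gen_dup_i64
   or tcg_gen_dup_i64_vec below) and then combined with every element of
   AOFS; tcg_gen_gvec_adds() further down is a typical user of this path.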
*/ 1333 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1334 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1335 { 1336 TCGType type; 1337 1338 check_size_align(oprsz, maxsz, dofs | aofs); 1339 check_overlap_2(dofs, aofs, maxsz); 1340 1341 type = 0; 1342 if (g->fniv) { 1343 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1344 } 1345 if (type != 0) { 1346 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1347 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1348 TCGv_vec t_vec = tcg_temp_new_vec(type); 1349 uint32_t some; 1350 1351 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1352 1353 switch (type) { 1354 case TCG_TYPE_V256: 1355 /* Recall that ARM SVE allows vector sizes that are not a 1356 * power of 2, but always a multiple of 16. The intent is 1357 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1358 */ 1359 some = QEMU_ALIGN_DOWN(oprsz, 32); 1360 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1361 t_vec, g->scalar_first, g->fniv); 1362 if (some == oprsz) { 1363 break; 1364 } 1365 dofs += some; 1366 aofs += some; 1367 oprsz -= some; 1368 maxsz -= some; 1369 /* fallthru */ 1370 1371 case TCG_TYPE_V128: 1372 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1373 t_vec, g->scalar_first, g->fniv); 1374 break; 1375 1376 case TCG_TYPE_V64: 1377 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1378 t_vec, g->scalar_first, g->fniv); 1379 break; 1380 1381 default: 1382 g_assert_not_reached(); 1383 } 1384 tcg_temp_free_vec(t_vec); 1385 tcg_swap_vecop_list(hold_list); 1386 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1387 TCGv_i64 t64 = tcg_temp_new_i64(); 1388 1389 tcg_gen_dup_i64(g->vece, t64, c); 1390 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1391 tcg_temp_free_i64(t64); 1392 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1393 TCGv_i32 t32 = tcg_temp_new_i32(); 1394 1395 tcg_gen_extrl_i64_i32(t32, c); 1396 tcg_gen_dup_i32(g->vece, t32, t32); 1397 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1398 tcg_temp_free_i32(t32); 1399 } else { 1400 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1401 return; 1402 } 1403 1404 if (oprsz < maxsz) { 1405 expand_clr(dofs + oprsz, maxsz - oprsz); 1406 } 1407 } 1408 1409 /* Expand a vector three-operand operation. */ 1410 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1411 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1412 { 1413 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1414 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1415 TCGType type; 1416 uint32_t some; 1417 1418 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1419 check_overlap_3(dofs, aofs, bofs, maxsz); 1420 1421 type = 0; 1422 if (g->fniv) { 1423 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1424 } 1425 switch (type) { 1426 case TCG_TYPE_V256: 1427 /* Recall that ARM SVE allows vector sizes that are not a 1428 * power of 2, but always a multiple of 16. The intent is 1429 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1430 */ 1431 some = QEMU_ALIGN_DOWN(oprsz, 32); 1432 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1433 g->load_dest, g->fniv); 1434 if (some == oprsz) { 1435 break; 1436 } 1437 dofs += some; 1438 aofs += some; 1439 bofs += some; 1440 oprsz -= some; 1441 maxsz -= some; 1442 /* fallthru */ 1443 case TCG_TYPE_V128: 1444 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1445 g->load_dest, g->fniv); 1446 break; 1447 case TCG_TYPE_V64: 1448 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1449 g->load_dest, g->fniv); 1450 break; 1451 1452 case 0: 1453 if (g->fni8 && check_size_impl(oprsz, 8)) { 1454 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1455 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1456 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1457 } else { 1458 assert(g->fno != NULL); 1459 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1460 maxsz, g->data, g->fno); 1461 oprsz = maxsz; 1462 } 1463 break; 1464 1465 default: 1466 g_assert_not_reached(); 1467 } 1468 tcg_swap_vecop_list(hold_list); 1469 1470 if (oprsz < maxsz) { 1471 expand_clr(dofs + oprsz, maxsz - oprsz); 1472 } 1473 } 1474 1475 /* Expand a vector operation with three vectors and an immediate. */ 1476 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1477 uint32_t oprsz, uint32_t maxsz, int64_t c, 1478 const GVecGen3i *g) 1479 { 1480 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1481 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1482 TCGType type; 1483 uint32_t some; 1484 1485 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1486 check_overlap_3(dofs, aofs, bofs, maxsz); 1487 1488 type = 0; 1489 if (g->fniv) { 1490 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1491 } 1492 switch (type) { 1493 case TCG_TYPE_V256: 1494 /* 1495 * Recall that ARM SVE allows vector sizes that are not a 1496 * power of 2, but always a multiple of 16. The intent is 1497 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1498 */ 1499 some = QEMU_ALIGN_DOWN(oprsz, 32); 1500 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1501 c, g->load_dest, g->fniv); 1502 if (some == oprsz) { 1503 break; 1504 } 1505 dofs += some; 1506 aofs += some; 1507 bofs += some; 1508 oprsz -= some; 1509 maxsz -= some; 1510 /* fallthru */ 1511 case TCG_TYPE_V128: 1512 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1513 c, g->load_dest, g->fniv); 1514 break; 1515 case TCG_TYPE_V64: 1516 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1517 c, g->load_dest, g->fniv); 1518 break; 1519 1520 case 0: 1521 if (g->fni8 && check_size_impl(oprsz, 8)) { 1522 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1523 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1524 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1525 } else { 1526 assert(g->fno != NULL); 1527 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1528 oprsz = maxsz; 1529 } 1530 break; 1531 1532 default: 1533 g_assert_not_reached(); 1534 } 1535 tcg_swap_vecop_list(hold_list); 1536 1537 if (oprsz < maxsz) { 1538 expand_clr(dofs + oprsz, maxsz - oprsz); 1539 } 1540 } 1541 1542 /* Expand a vector four-operand operation. */ 1543 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1544 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1545 { 1546 const TCGOpcode *this_list = g->opt_opc ? 
                                          : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with four vectors and an immediate. */
void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen4i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1634 */ 1635 some = QEMU_ALIGN_DOWN(oprsz, 32); 1636 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some, 1637 32, TCG_TYPE_V256, c, g->fniv); 1638 if (some == oprsz) { 1639 break; 1640 } 1641 dofs += some; 1642 aofs += some; 1643 bofs += some; 1644 cofs += some; 1645 oprsz -= some; 1646 maxsz -= some; 1647 /* fallthru */ 1648 case TCG_TYPE_V128: 1649 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1650 16, TCG_TYPE_V128, c, g->fniv); 1651 break; 1652 case TCG_TYPE_V64: 1653 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1654 8, TCG_TYPE_V64, c, g->fniv); 1655 break; 1656 1657 case 0: 1658 if (g->fni8 && check_size_impl(oprsz, 8)) { 1659 expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8); 1660 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1661 expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4); 1662 } else { 1663 assert(g->fno != NULL); 1664 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1665 oprsz, maxsz, c, g->fno); 1666 oprsz = maxsz; 1667 } 1668 break; 1669 1670 default: 1671 g_assert_not_reached(); 1672 } 1673 tcg_swap_vecop_list(hold_list); 1674 1675 if (oprsz < maxsz) { 1676 expand_clr(dofs + oprsz, maxsz - oprsz); 1677 } 1678 } 1679 1680 /* 1681 * Expand specific vector operations. 1682 */ 1683 1684 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1685 { 1686 tcg_gen_mov_vec(a, b); 1687 } 1688 1689 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1690 uint32_t oprsz, uint32_t maxsz) 1691 { 1692 static const GVecGen2 g = { 1693 .fni8 = tcg_gen_mov_i64, 1694 .fniv = vec_mov2, 1695 .fno = gen_helper_gvec_mov, 1696 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1697 }; 1698 if (dofs != aofs) { 1699 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1700 } else { 1701 check_size_align(oprsz, maxsz, dofs); 1702 if (oprsz < maxsz) { 1703 expand_clr(dofs + oprsz, maxsz - oprsz); 1704 } 1705 } 1706 } 1707 1708 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1709 uint32_t maxsz, TCGv_i32 in) 1710 { 1711 check_size_align(oprsz, maxsz, dofs); 1712 tcg_debug_assert(vece <= MO_32); 1713 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1714 } 1715 1716 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1717 uint32_t maxsz, TCGv_i64 in) 1718 { 1719 check_size_align(oprsz, maxsz, dofs); 1720 tcg_debug_assert(vece <= MO_64); 1721 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1722 } 1723 1724 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1725 uint32_t oprsz, uint32_t maxsz) 1726 { 1727 check_size_align(oprsz, maxsz, dofs); 1728 if (vece <= MO_64) { 1729 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1730 if (type != 0) { 1731 TCGv_vec t_vec = tcg_temp_new_vec(type); 1732 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1733 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1734 tcg_temp_free_vec(t_vec); 1735 } else if (vece <= MO_32) { 1736 TCGv_i32 in = tcg_temp_ebb_new_i32(); 1737 switch (vece) { 1738 case MO_8: 1739 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1740 break; 1741 case MO_16: 1742 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1743 break; 1744 default: 1745 tcg_gen_ld_i32(in, cpu_env, aofs); 1746 break; 1747 } 1748 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1749 tcg_temp_free_i32(in); 1750 } else { 1751 TCGv_i64 in = tcg_temp_ebb_new_i64(); 1752 tcg_gen_ld_i64(in, cpu_env, aofs); 1753 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1754 tcg_temp_free_i64(in); 1755 } 1756 } else if (vece == 4) { 1757 /* 128-bit duplicate. 
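           Replicate the 16 bytes at AOFS across the destination.  When
           AOFS == DOFS the first 16 bytes already hold the value, which is
           why the copy loops below start at (aofs == dofs) * 16.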
*/ 1758 int i; 1759 1760 tcg_debug_assert(oprsz >= 16); 1761 if (TCG_TARGET_HAS_v128) { 1762 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1763 1764 tcg_gen_ld_vec(in, cpu_env, aofs); 1765 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1766 tcg_gen_st_vec(in, cpu_env, dofs + i); 1767 } 1768 tcg_temp_free_vec(in); 1769 } else { 1770 TCGv_i64 in0 = tcg_temp_ebb_new_i64(); 1771 TCGv_i64 in1 = tcg_temp_ebb_new_i64(); 1772 1773 tcg_gen_ld_i64(in0, cpu_env, aofs); 1774 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1775 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1776 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1777 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1778 } 1779 tcg_temp_free_i64(in0); 1780 tcg_temp_free_i64(in1); 1781 } 1782 if (oprsz < maxsz) { 1783 expand_clr(dofs + oprsz, maxsz - oprsz); 1784 } 1785 } else if (vece == 5) { 1786 /* 256-bit duplicate. */ 1787 int i; 1788 1789 tcg_debug_assert(oprsz >= 32); 1790 tcg_debug_assert(oprsz % 32 == 0); 1791 if (TCG_TARGET_HAS_v256) { 1792 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256); 1793 1794 tcg_gen_ld_vec(in, cpu_env, aofs); 1795 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1796 tcg_gen_st_vec(in, cpu_env, dofs + i); 1797 } 1798 tcg_temp_free_vec(in); 1799 } else if (TCG_TARGET_HAS_v128) { 1800 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128); 1801 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128); 1802 1803 tcg_gen_ld_vec(in0, cpu_env, aofs); 1804 tcg_gen_ld_vec(in1, cpu_env, aofs + 16); 1805 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1806 tcg_gen_st_vec(in0, cpu_env, dofs + i); 1807 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16); 1808 } 1809 tcg_temp_free_vec(in0); 1810 tcg_temp_free_vec(in1); 1811 } else { 1812 TCGv_i64 in[4]; 1813 int j; 1814 1815 for (j = 0; j < 4; ++j) { 1816 in[j] = tcg_temp_ebb_new_i64(); 1817 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8); 1818 } 1819 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1820 for (j = 0; j < 4; ++j) { 1821 tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8); 1822 } 1823 } 1824 for (j = 0; j < 4; ++j) { 1825 tcg_temp_free_i64(in[j]); 1826 } 1827 } 1828 if (oprsz < maxsz) { 1829 expand_clr(dofs + oprsz, maxsz - oprsz); 1830 } 1831 } else { 1832 g_assert_not_reached(); 1833 } 1834 } 1835 1836 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1837 uint32_t maxsz, uint64_t x) 1838 { 1839 check_size_align(oprsz, maxsz, dofs); 1840 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x); 1841 } 1842 1843 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1844 uint32_t oprsz, uint32_t maxsz) 1845 { 1846 static const GVecGen2 g = { 1847 .fni8 = tcg_gen_not_i64, 1848 .fniv = tcg_gen_not_vec, 1849 .fno = gen_helper_gvec_not, 1850 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1851 }; 1852 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1853 } 1854 1855 /* Perform a vector addition using normal addition and a mask. The mask 1856 should be the sign bit of each lane. This 6-operation form is more 1857 efficient than separate additions when there are 4 or more lanes in 1858 the 64-bit operation. 
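   Writing m for the mask, the code below computes

       d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)

   Clearing the sign bit of every lane before the addition guarantees that
   no carry can propagate from one lane into the next; the correct sign bit
   of each lane is then reconstructed by xor-ing in the sign of a ^ b.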
*/ 1859 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1860 { 1861 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1862 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1863 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 1864 1865 tcg_gen_andc_i64(t1, a, m); 1866 tcg_gen_andc_i64(t2, b, m); 1867 tcg_gen_xor_i64(t3, a, b); 1868 tcg_gen_add_i64(d, t1, t2); 1869 tcg_gen_and_i64(t3, t3, m); 1870 tcg_gen_xor_i64(d, d, t3); 1871 1872 tcg_temp_free_i64(t1); 1873 tcg_temp_free_i64(t2); 1874 tcg_temp_free_i64(t3); 1875 } 1876 1877 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1878 { 1879 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 1880 gen_addv_mask(d, a, b, m); 1881 } 1882 1883 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1884 { 1885 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 1886 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1887 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1888 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 1889 1890 tcg_gen_andc_i32(t1, a, m); 1891 tcg_gen_andc_i32(t2, b, m); 1892 tcg_gen_xor_i32(t3, a, b); 1893 tcg_gen_add_i32(d, t1, t2); 1894 tcg_gen_and_i32(t3, t3, m); 1895 tcg_gen_xor_i32(d, d, t3); 1896 1897 tcg_temp_free_i32(t1); 1898 tcg_temp_free_i32(t2); 1899 tcg_temp_free_i32(t3); 1900 } 1901 1902 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1903 { 1904 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 1905 gen_addv_mask(d, a, b, m); 1906 } 1907 1908 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1909 { 1910 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1911 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1912 1913 tcg_gen_andi_i32(t1, a, ~0xffff); 1914 tcg_gen_add_i32(t2, a, b); 1915 tcg_gen_add_i32(t1, t1, b); 1916 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 1917 1918 tcg_temp_free_i32(t1); 1919 tcg_temp_free_i32(t2); 1920 } 1921 1922 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1923 { 1924 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1925 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1926 1927 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1928 tcg_gen_add_i64(t2, a, b); 1929 tcg_gen_add_i64(t1, t1, b); 1930 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1931 1932 tcg_temp_free_i64(t1); 1933 tcg_temp_free_i64(t2); 1934 } 1935 1936 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1937 1938 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1939 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1940 { 1941 static const GVecGen3 g[4] = { 1942 { .fni8 = tcg_gen_vec_add8_i64, 1943 .fniv = tcg_gen_add_vec, 1944 .fno = gen_helper_gvec_add8, 1945 .opt_opc = vecop_list_add, 1946 .vece = MO_8 }, 1947 { .fni8 = tcg_gen_vec_add16_i64, 1948 .fniv = tcg_gen_add_vec, 1949 .fno = gen_helper_gvec_add16, 1950 .opt_opc = vecop_list_add, 1951 .vece = MO_16 }, 1952 { .fni4 = tcg_gen_add_i32, 1953 .fniv = tcg_gen_add_vec, 1954 .fno = gen_helper_gvec_add32, 1955 .opt_opc = vecop_list_add, 1956 .vece = MO_32 }, 1957 { .fni8 = tcg_gen_add_i64, 1958 .fniv = tcg_gen_add_vec, 1959 .fno = gen_helper_gvec_add64, 1960 .opt_opc = vecop_list_add, 1961 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1962 .vece = MO_64 }, 1963 }; 1964 1965 tcg_debug_assert(vece <= MO_64); 1966 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1967 } 1968 1969 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1970 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1971 { 1972 static const GVecGen2s g[4] = { 1973 { .fni8 = tcg_gen_vec_add8_i64, 1974 .fniv = tcg_gen_add_vec, 1975 .fno = gen_helper_gvec_adds8, 1976 .opt_opc = 
vecop_list_add, 1977 .vece = MO_8 }, 1978 { .fni8 = tcg_gen_vec_add16_i64, 1979 .fniv = tcg_gen_add_vec, 1980 .fno = gen_helper_gvec_adds16, 1981 .opt_opc = vecop_list_add, 1982 .vece = MO_16 }, 1983 { .fni4 = tcg_gen_add_i32, 1984 .fniv = tcg_gen_add_vec, 1985 .fno = gen_helper_gvec_adds32, 1986 .opt_opc = vecop_list_add, 1987 .vece = MO_32 }, 1988 { .fni8 = tcg_gen_add_i64, 1989 .fniv = tcg_gen_add_vec, 1990 .fno = gen_helper_gvec_adds64, 1991 .opt_opc = vecop_list_add, 1992 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1993 .vece = MO_64 }, 1994 }; 1995 1996 tcg_debug_assert(vece <= MO_64); 1997 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1998 } 1999 2000 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 2001 int64_t c, uint32_t oprsz, uint32_t maxsz) 2002 { 2003 TCGv_i64 tmp = tcg_constant_i64(c); 2004 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 2005 } 2006 2007 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 2008 2009 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 2010 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2011 { 2012 static const GVecGen2s g[4] = { 2013 { .fni8 = tcg_gen_vec_sub8_i64, 2014 .fniv = tcg_gen_sub_vec, 2015 .fno = gen_helper_gvec_subs8, 2016 .opt_opc = vecop_list_sub, 2017 .vece = MO_8 }, 2018 { .fni8 = tcg_gen_vec_sub16_i64, 2019 .fniv = tcg_gen_sub_vec, 2020 .fno = gen_helper_gvec_subs16, 2021 .opt_opc = vecop_list_sub, 2022 .vece = MO_16 }, 2023 { .fni4 = tcg_gen_sub_i32, 2024 .fniv = tcg_gen_sub_vec, 2025 .fno = gen_helper_gvec_subs32, 2026 .opt_opc = vecop_list_sub, 2027 .vece = MO_32 }, 2028 { .fni8 = tcg_gen_sub_i64, 2029 .fniv = tcg_gen_sub_vec, 2030 .fno = gen_helper_gvec_subs64, 2031 .opt_opc = vecop_list_sub, 2032 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2033 .vece = MO_64 }, 2034 }; 2035 2036 tcg_debug_assert(vece <= MO_64); 2037 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2038 } 2039 2040 /* Perform a vector subtraction using normal subtraction and a mask. 2041 Compare gen_addv_mask above. 
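   Forcing the sign bit of each lane on in the minuend and off in the
   subtrahend ensures that no borrow can propagate out of a lane, i.e.
   d = ((a | m) - (b & ~m)) ^ (~(a ^ b) & m).  The provisional sign bit
   of each lane differs from the true one, a ^ b ^ borrow-in, by exactly
   ~(a ^ b), which the final xor with eqv(a, b) & m corrects.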
*/ 2042 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 2043 { 2044 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2045 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2046 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2047 2048 tcg_gen_or_i64(t1, a, m); 2049 tcg_gen_andc_i64(t2, b, m); 2050 tcg_gen_eqv_i64(t3, a, b); 2051 tcg_gen_sub_i64(d, t1, t2); 2052 tcg_gen_and_i64(t3, t3, m); 2053 tcg_gen_xor_i64(d, d, t3); 2054 2055 tcg_temp_free_i64(t1); 2056 tcg_temp_free_i64(t2); 2057 tcg_temp_free_i64(t3); 2058 } 2059 2060 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2061 { 2062 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2063 gen_subv_mask(d, a, b, m); 2064 } 2065 2066 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2067 { 2068 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 2069 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2070 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2071 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 2072 2073 tcg_gen_or_i32(t1, a, m); 2074 tcg_gen_andc_i32(t2, b, m); 2075 tcg_gen_eqv_i32(t3, a, b); 2076 tcg_gen_sub_i32(d, t1, t2); 2077 tcg_gen_and_i32(t3, t3, m); 2078 tcg_gen_xor_i32(d, d, t3); 2079 2080 tcg_temp_free_i32(t1); 2081 tcg_temp_free_i32(t2); 2082 tcg_temp_free_i32(t3); 2083 } 2084 2085 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2086 { 2087 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2088 gen_subv_mask(d, a, b, m); 2089 } 2090 2091 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2092 { 2093 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2094 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2095 2096 tcg_gen_andi_i32(t1, b, ~0xffff); 2097 tcg_gen_sub_i32(t2, a, b); 2098 tcg_gen_sub_i32(t1, a, t1); 2099 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 2100 2101 tcg_temp_free_i32(t1); 2102 tcg_temp_free_i32(t2); 2103 } 2104 2105 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2106 { 2107 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2108 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2109 2110 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2111 tcg_gen_sub_i64(t2, a, b); 2112 tcg_gen_sub_i64(t1, a, t1); 2113 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2114 2115 tcg_temp_free_i64(t1); 2116 tcg_temp_free_i64(t2); 2117 } 2118 2119 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 2120 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2121 { 2122 static const GVecGen3 g[4] = { 2123 { .fni8 = tcg_gen_vec_sub8_i64, 2124 .fniv = tcg_gen_sub_vec, 2125 .fno = gen_helper_gvec_sub8, 2126 .opt_opc = vecop_list_sub, 2127 .vece = MO_8 }, 2128 { .fni8 = tcg_gen_vec_sub16_i64, 2129 .fniv = tcg_gen_sub_vec, 2130 .fno = gen_helper_gvec_sub16, 2131 .opt_opc = vecop_list_sub, 2132 .vece = MO_16 }, 2133 { .fni4 = tcg_gen_sub_i32, 2134 .fniv = tcg_gen_sub_vec, 2135 .fno = gen_helper_gvec_sub32, 2136 .opt_opc = vecop_list_sub, 2137 .vece = MO_32 }, 2138 { .fni8 = tcg_gen_sub_i64, 2139 .fniv = tcg_gen_sub_vec, 2140 .fno = gen_helper_gvec_sub64, 2141 .opt_opc = vecop_list_sub, 2142 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2143 .vece = MO_64 }, 2144 }; 2145 2146 tcg_debug_assert(vece <= MO_64); 2147 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2148 } 2149 2150 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 2151 2152 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 2153 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2154 { 2155 static const GVecGen3 g[4] = { 2156 { .fniv = tcg_gen_mul_vec, 2157 .fno = gen_helper_gvec_mul8, 2158 .opt_opc = vecop_list_mul, 2159 .vece = MO_8 }, 2160 { 
.fniv = tcg_gen_mul_vec, 2161 .fno = gen_helper_gvec_mul16, 2162 .opt_opc = vecop_list_mul, 2163 .vece = MO_16 }, 2164 { .fni4 = tcg_gen_mul_i32, 2165 .fniv = tcg_gen_mul_vec, 2166 .fno = gen_helper_gvec_mul32, 2167 .opt_opc = vecop_list_mul, 2168 .vece = MO_32 }, 2169 { .fni8 = tcg_gen_mul_i64, 2170 .fniv = tcg_gen_mul_vec, 2171 .fno = gen_helper_gvec_mul64, 2172 .opt_opc = vecop_list_mul, 2173 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2174 .vece = MO_64 }, 2175 }; 2176 2177 tcg_debug_assert(vece <= MO_64); 2178 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2179 } 2180 2181 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 2182 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2183 { 2184 static const GVecGen2s g[4] = { 2185 { .fniv = tcg_gen_mul_vec, 2186 .fno = gen_helper_gvec_muls8, 2187 .opt_opc = vecop_list_mul, 2188 .vece = MO_8 }, 2189 { .fniv = tcg_gen_mul_vec, 2190 .fno = gen_helper_gvec_muls16, 2191 .opt_opc = vecop_list_mul, 2192 .vece = MO_16 }, 2193 { .fni4 = tcg_gen_mul_i32, 2194 .fniv = tcg_gen_mul_vec, 2195 .fno = gen_helper_gvec_muls32, 2196 .opt_opc = vecop_list_mul, 2197 .vece = MO_32 }, 2198 { .fni8 = tcg_gen_mul_i64, 2199 .fniv = tcg_gen_mul_vec, 2200 .fno = gen_helper_gvec_muls64, 2201 .opt_opc = vecop_list_mul, 2202 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2203 .vece = MO_64 }, 2204 }; 2205 2206 tcg_debug_assert(vece <= MO_64); 2207 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2208 } 2209 2210 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 2211 int64_t c, uint32_t oprsz, uint32_t maxsz) 2212 { 2213 TCGv_i64 tmp = tcg_constant_i64(c); 2214 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 2215 } 2216 2217 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2218 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2219 { 2220 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 2221 static const GVecGen3 g[4] = { 2222 { .fniv = tcg_gen_ssadd_vec, 2223 .fno = gen_helper_gvec_ssadd8, 2224 .opt_opc = vecop_list, 2225 .vece = MO_8 }, 2226 { .fniv = tcg_gen_ssadd_vec, 2227 .fno = gen_helper_gvec_ssadd16, 2228 .opt_opc = vecop_list, 2229 .vece = MO_16 }, 2230 { .fniv = tcg_gen_ssadd_vec, 2231 .fno = gen_helper_gvec_ssadd32, 2232 .opt_opc = vecop_list, 2233 .vece = MO_32 }, 2234 { .fniv = tcg_gen_ssadd_vec, 2235 .fno = gen_helper_gvec_ssadd64, 2236 .opt_opc = vecop_list, 2237 .vece = MO_64 }, 2238 }; 2239 tcg_debug_assert(vece <= MO_64); 2240 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2241 } 2242 2243 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2244 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2245 { 2246 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2247 static const GVecGen3 g[4] = { 2248 { .fniv = tcg_gen_sssub_vec, 2249 .fno = gen_helper_gvec_sssub8, 2250 .opt_opc = vecop_list, 2251 .vece = MO_8 }, 2252 { .fniv = tcg_gen_sssub_vec, 2253 .fno = gen_helper_gvec_sssub16, 2254 .opt_opc = vecop_list, 2255 .vece = MO_16 }, 2256 { .fniv = tcg_gen_sssub_vec, 2257 .fno = gen_helper_gvec_sssub32, 2258 .opt_opc = vecop_list, 2259 .vece = MO_32 }, 2260 { .fniv = tcg_gen_sssub_vec, 2261 .fno = gen_helper_gvec_sssub64, 2262 .opt_opc = vecop_list, 2263 .vece = MO_64 }, 2264 }; 2265 tcg_debug_assert(vece <= MO_64); 2266 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2267 } 2268 2269 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2270 { 2271 TCGv_i32 max = tcg_constant_i32(-1); 2272 tcg_gen_add_i32(d, a, b); 2273 
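    /* The add overflowed iff the result wrapped below A; saturate to ~0.  */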
tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2274 } 2275 2276 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2277 { 2278 TCGv_i64 max = tcg_constant_i64(-1); 2279 tcg_gen_add_i64(d, a, b); 2280 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2281 } 2282 2283 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2284 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2285 { 2286 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2287 static const GVecGen3 g[4] = { 2288 { .fniv = tcg_gen_usadd_vec, 2289 .fno = gen_helper_gvec_usadd8, 2290 .opt_opc = vecop_list, 2291 .vece = MO_8 }, 2292 { .fniv = tcg_gen_usadd_vec, 2293 .fno = gen_helper_gvec_usadd16, 2294 .opt_opc = vecop_list, 2295 .vece = MO_16 }, 2296 { .fni4 = tcg_gen_usadd_i32, 2297 .fniv = tcg_gen_usadd_vec, 2298 .fno = gen_helper_gvec_usadd32, 2299 .opt_opc = vecop_list, 2300 .vece = MO_32 }, 2301 { .fni8 = tcg_gen_usadd_i64, 2302 .fniv = tcg_gen_usadd_vec, 2303 .fno = gen_helper_gvec_usadd64, 2304 .opt_opc = vecop_list, 2305 .vece = MO_64 } 2306 }; 2307 tcg_debug_assert(vece <= MO_64); 2308 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2309 } 2310 2311 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2312 { 2313 TCGv_i32 min = tcg_constant_i32(0); 2314 tcg_gen_sub_i32(d, a, b); 2315 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2316 } 2317 2318 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2319 { 2320 TCGv_i64 min = tcg_constant_i64(0); 2321 tcg_gen_sub_i64(d, a, b); 2322 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2323 } 2324 2325 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2326 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2327 { 2328 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2329 static const GVecGen3 g[4] = { 2330 { .fniv = tcg_gen_ussub_vec, 2331 .fno = gen_helper_gvec_ussub8, 2332 .opt_opc = vecop_list, 2333 .vece = MO_8 }, 2334 { .fniv = tcg_gen_ussub_vec, 2335 .fno = gen_helper_gvec_ussub16, 2336 .opt_opc = vecop_list, 2337 .vece = MO_16 }, 2338 { .fni4 = tcg_gen_ussub_i32, 2339 .fniv = tcg_gen_ussub_vec, 2340 .fno = gen_helper_gvec_ussub32, 2341 .opt_opc = vecop_list, 2342 .vece = MO_32 }, 2343 { .fni8 = tcg_gen_ussub_i64, 2344 .fniv = tcg_gen_ussub_vec, 2345 .fno = gen_helper_gvec_ussub64, 2346 .opt_opc = vecop_list, 2347 .vece = MO_64 } 2348 }; 2349 tcg_debug_assert(vece <= MO_64); 2350 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2351 } 2352 2353 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2354 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2355 { 2356 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2357 static const GVecGen3 g[4] = { 2358 { .fniv = tcg_gen_smin_vec, 2359 .fno = gen_helper_gvec_smin8, 2360 .opt_opc = vecop_list, 2361 .vece = MO_8 }, 2362 { .fniv = tcg_gen_smin_vec, 2363 .fno = gen_helper_gvec_smin16, 2364 .opt_opc = vecop_list, 2365 .vece = MO_16 }, 2366 { .fni4 = tcg_gen_smin_i32, 2367 .fniv = tcg_gen_smin_vec, 2368 .fno = gen_helper_gvec_smin32, 2369 .opt_opc = vecop_list, 2370 .vece = MO_32 }, 2371 { .fni8 = tcg_gen_smin_i64, 2372 .fniv = tcg_gen_smin_vec, 2373 .fno = gen_helper_gvec_smin64, 2374 .opt_opc = vecop_list, 2375 .vece = MO_64 } 2376 }; 2377 tcg_debug_assert(vece <= MO_64); 2378 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2379 } 2380 2381 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2382 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2383 
{ 2384 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2385 static const GVecGen3 g[4] = { 2386 { .fniv = tcg_gen_umin_vec, 2387 .fno = gen_helper_gvec_umin8, 2388 .opt_opc = vecop_list, 2389 .vece = MO_8 }, 2390 { .fniv = tcg_gen_umin_vec, 2391 .fno = gen_helper_gvec_umin16, 2392 .opt_opc = vecop_list, 2393 .vece = MO_16 }, 2394 { .fni4 = tcg_gen_umin_i32, 2395 .fniv = tcg_gen_umin_vec, 2396 .fno = gen_helper_gvec_umin32, 2397 .opt_opc = vecop_list, 2398 .vece = MO_32 }, 2399 { .fni8 = tcg_gen_umin_i64, 2400 .fniv = tcg_gen_umin_vec, 2401 .fno = gen_helper_gvec_umin64, 2402 .opt_opc = vecop_list, 2403 .vece = MO_64 } 2404 }; 2405 tcg_debug_assert(vece <= MO_64); 2406 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2407 } 2408 2409 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2410 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2411 { 2412 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2413 static const GVecGen3 g[4] = { 2414 { .fniv = tcg_gen_smax_vec, 2415 .fno = gen_helper_gvec_smax8, 2416 .opt_opc = vecop_list, 2417 .vece = MO_8 }, 2418 { .fniv = tcg_gen_smax_vec, 2419 .fno = gen_helper_gvec_smax16, 2420 .opt_opc = vecop_list, 2421 .vece = MO_16 }, 2422 { .fni4 = tcg_gen_smax_i32, 2423 .fniv = tcg_gen_smax_vec, 2424 .fno = gen_helper_gvec_smax32, 2425 .opt_opc = vecop_list, 2426 .vece = MO_32 }, 2427 { .fni8 = tcg_gen_smax_i64, 2428 .fniv = tcg_gen_smax_vec, 2429 .fno = gen_helper_gvec_smax64, 2430 .opt_opc = vecop_list, 2431 .vece = MO_64 } 2432 }; 2433 tcg_debug_assert(vece <= MO_64); 2434 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2435 } 2436 2437 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2438 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2439 { 2440 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2441 static const GVecGen3 g[4] = { 2442 { .fniv = tcg_gen_umax_vec, 2443 .fno = gen_helper_gvec_umax8, 2444 .opt_opc = vecop_list, 2445 .vece = MO_8 }, 2446 { .fniv = tcg_gen_umax_vec, 2447 .fno = gen_helper_gvec_umax16, 2448 .opt_opc = vecop_list, 2449 .vece = MO_16 }, 2450 { .fni4 = tcg_gen_umax_i32, 2451 .fniv = tcg_gen_umax_vec, 2452 .fno = gen_helper_gvec_umax32, 2453 .opt_opc = vecop_list, 2454 .vece = MO_32 }, 2455 { .fni8 = tcg_gen_umax_i64, 2456 .fniv = tcg_gen_umax_vec, 2457 .fno = gen_helper_gvec_umax64, 2458 .opt_opc = vecop_list, 2459 .vece = MO_64 } 2460 }; 2461 tcg_debug_assert(vece <= MO_64); 2462 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2463 } 2464 2465 /* Perform a vector negation using normal negation and a mask. 2466 Compare gen_subv_mask above. 
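   Per lane, m - (b & ~m) yields -b with the sign bit inverted whenever
   b's sign bit is clear; the final xor with (m & ~b) undoes exactly
   those inversions.  Since the minuend of each lane is just the sign
   bit, no borrow can escape into the next lane.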
*/ 2467 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2468 { 2469 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2470 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2471 2472 tcg_gen_andc_i64(t3, m, b); 2473 tcg_gen_andc_i64(t2, b, m); 2474 tcg_gen_sub_i64(d, m, t2); 2475 tcg_gen_xor_i64(d, d, t3); 2476 2477 tcg_temp_free_i64(t2); 2478 tcg_temp_free_i64(t3); 2479 } 2480 2481 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2482 { 2483 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2484 gen_negv_mask(d, b, m); 2485 } 2486 2487 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2488 { 2489 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2490 gen_negv_mask(d, b, m); 2491 } 2492 2493 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2494 { 2495 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2496 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2497 2498 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2499 tcg_gen_neg_i64(t2, b); 2500 tcg_gen_neg_i64(t1, t1); 2501 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2502 2503 tcg_temp_free_i64(t1); 2504 tcg_temp_free_i64(t2); 2505 } 2506 2507 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2508 uint32_t oprsz, uint32_t maxsz) 2509 { 2510 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2511 static const GVecGen2 g[4] = { 2512 { .fni8 = tcg_gen_vec_neg8_i64, 2513 .fniv = tcg_gen_neg_vec, 2514 .fno = gen_helper_gvec_neg8, 2515 .opt_opc = vecop_list, 2516 .vece = MO_8 }, 2517 { .fni8 = tcg_gen_vec_neg16_i64, 2518 .fniv = tcg_gen_neg_vec, 2519 .fno = gen_helper_gvec_neg16, 2520 .opt_opc = vecop_list, 2521 .vece = MO_16 }, 2522 { .fni4 = tcg_gen_neg_i32, 2523 .fniv = tcg_gen_neg_vec, 2524 .fno = gen_helper_gvec_neg32, 2525 .opt_opc = vecop_list, 2526 .vece = MO_32 }, 2527 { .fni8 = tcg_gen_neg_i64, 2528 .fniv = tcg_gen_neg_vec, 2529 .fno = gen_helper_gvec_neg64, 2530 .opt_opc = vecop_list, 2531 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2532 .vece = MO_64 }, 2533 }; 2534 2535 tcg_debug_assert(vece <= MO_64); 2536 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2537 } 2538 2539 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2540 { 2541 TCGv_i64 t = tcg_temp_ebb_new_i64(); 2542 int nbit = 8 << vece; 2543 2544 /* Create -1 for each negative element. */ 2545 tcg_gen_shri_i64(t, b, nbit - 1); 2546 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2547 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2548 2549 /* 2550 * Invert (via xor -1) and add one. 2551 * Because of the ordering the msb is cleared, 2552 * so we never have carry into the next element. 
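     * E.g. for MO_8 and b = 0x85 (-123): t = 0xff, b ^ t = 0x7a, and
     * adding the low bit of t gives 0x7b (123); the msb of 0x7a is
     * already clear, so the increment cannot carry across lanes.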
2553 */ 2554 tcg_gen_xor_i64(d, b, t); 2555 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2556 tcg_gen_add_i64(d, d, t); 2557 2558 tcg_temp_free_i64(t); 2559 } 2560 2561 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2562 { 2563 gen_absv_mask(d, b, MO_8); 2564 } 2565 2566 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2567 { 2568 gen_absv_mask(d, b, MO_16); 2569 } 2570 2571 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2572 uint32_t oprsz, uint32_t maxsz) 2573 { 2574 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2575 static const GVecGen2 g[4] = { 2576 { .fni8 = tcg_gen_vec_abs8_i64, 2577 .fniv = tcg_gen_abs_vec, 2578 .fno = gen_helper_gvec_abs8, 2579 .opt_opc = vecop_list, 2580 .vece = MO_8 }, 2581 { .fni8 = tcg_gen_vec_abs16_i64, 2582 .fniv = tcg_gen_abs_vec, 2583 .fno = gen_helper_gvec_abs16, 2584 .opt_opc = vecop_list, 2585 .vece = MO_16 }, 2586 { .fni4 = tcg_gen_abs_i32, 2587 .fniv = tcg_gen_abs_vec, 2588 .fno = gen_helper_gvec_abs32, 2589 .opt_opc = vecop_list, 2590 .vece = MO_32 }, 2591 { .fni8 = tcg_gen_abs_i64, 2592 .fniv = tcg_gen_abs_vec, 2593 .fno = gen_helper_gvec_abs64, 2594 .opt_opc = vecop_list, 2595 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2596 .vece = MO_64 }, 2597 }; 2598 2599 tcg_debug_assert(vece <= MO_64); 2600 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2601 } 2602 2603 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2604 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2605 { 2606 static const GVecGen3 g = { 2607 .fni8 = tcg_gen_and_i64, 2608 .fniv = tcg_gen_and_vec, 2609 .fno = gen_helper_gvec_and, 2610 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2611 }; 2612 2613 if (aofs == bofs) { 2614 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2615 } else { 2616 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2617 } 2618 } 2619 2620 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2621 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2622 { 2623 static const GVecGen3 g = { 2624 .fni8 = tcg_gen_or_i64, 2625 .fniv = tcg_gen_or_vec, 2626 .fno = gen_helper_gvec_or, 2627 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2628 }; 2629 2630 if (aofs == bofs) { 2631 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2632 } else { 2633 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2634 } 2635 } 2636 2637 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2638 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2639 { 2640 static const GVecGen3 g = { 2641 .fni8 = tcg_gen_xor_i64, 2642 .fniv = tcg_gen_xor_vec, 2643 .fno = gen_helper_gvec_xor, 2644 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2645 }; 2646 2647 if (aofs == bofs) { 2648 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2649 } else { 2650 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2651 } 2652 } 2653 2654 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2655 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2656 { 2657 static const GVecGen3 g = { 2658 .fni8 = tcg_gen_andc_i64, 2659 .fniv = tcg_gen_andc_vec, 2660 .fno = gen_helper_gvec_andc, 2661 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2662 }; 2663 2664 if (aofs == bofs) { 2665 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2666 } else { 2667 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2668 } 2669 } 2670 2671 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2672 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2673 { 2674 static const GVecGen3 g = { 2675 .fni8 = tcg_gen_orc_i64, 2676 .fniv = tcg_gen_orc_vec, 2677 
.fno = gen_helper_gvec_orc, 2678 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2679 }; 2680 2681 if (aofs == bofs) { 2682 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2683 } else { 2684 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2685 } 2686 } 2687 2688 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2689 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2690 { 2691 static const GVecGen3 g = { 2692 .fni8 = tcg_gen_nand_i64, 2693 .fniv = tcg_gen_nand_vec, 2694 .fno = gen_helper_gvec_nand, 2695 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2696 }; 2697 2698 if (aofs == bofs) { 2699 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2700 } else { 2701 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2702 } 2703 } 2704 2705 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2706 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2707 { 2708 static const GVecGen3 g = { 2709 .fni8 = tcg_gen_nor_i64, 2710 .fniv = tcg_gen_nor_vec, 2711 .fno = gen_helper_gvec_nor, 2712 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2713 }; 2714 2715 if (aofs == bofs) { 2716 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2717 } else { 2718 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2719 } 2720 } 2721 2722 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2723 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2724 { 2725 static const GVecGen3 g = { 2726 .fni8 = tcg_gen_eqv_i64, 2727 .fniv = tcg_gen_eqv_vec, 2728 .fno = gen_helper_gvec_eqv, 2729 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2730 }; 2731 2732 if (aofs == bofs) { 2733 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2734 } else { 2735 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2736 } 2737 } 2738 2739 static const GVecGen2s gop_ands = { 2740 .fni8 = tcg_gen_and_i64, 2741 .fniv = tcg_gen_and_vec, 2742 .fno = gen_helper_gvec_ands, 2743 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2744 .vece = MO_64 2745 }; 2746 2747 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2748 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2749 { 2750 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2751 tcg_gen_dup_i64(vece, tmp, c); 2752 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2753 tcg_temp_free_i64(tmp); 2754 } 2755 2756 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2757 int64_t c, uint32_t oprsz, uint32_t maxsz) 2758 { 2759 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2760 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2761 } 2762 2763 static const GVecGen2s gop_xors = { 2764 .fni8 = tcg_gen_xor_i64, 2765 .fniv = tcg_gen_xor_vec, 2766 .fno = gen_helper_gvec_xors, 2767 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2768 .vece = MO_64 2769 }; 2770 2771 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2772 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2773 { 2774 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2775 tcg_gen_dup_i64(vece, tmp, c); 2776 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2777 tcg_temp_free_i64(tmp); 2778 } 2779 2780 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2781 int64_t c, uint32_t oprsz, uint32_t maxsz) 2782 { 2783 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2784 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2785 } 2786 2787 static const GVecGen2s gop_ors = { 2788 .fni8 = tcg_gen_or_i64, 2789 .fniv = tcg_gen_or_vec, 2790 .fno = gen_helper_gvec_ors, 2791 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2792 .vece = MO_64 2793 }; 2794 2795 void tcg_gen_gvec_ors(unsigned vece, 
uint32_t dofs, uint32_t aofs, 2796 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2797 { 2798 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2799 tcg_gen_dup_i64(vece, tmp, c); 2800 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2801 tcg_temp_free_i64(tmp); 2802 } 2803 2804 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2805 int64_t c, uint32_t oprsz, uint32_t maxsz) 2806 { 2807 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2808 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2809 } 2810 2811 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2812 { 2813 uint64_t mask = dup_const(MO_8, 0xff << c); 2814 tcg_gen_shli_i64(d, a, c); 2815 tcg_gen_andi_i64(d, d, mask); 2816 } 2817 2818 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2819 { 2820 uint64_t mask = dup_const(MO_16, 0xffff << c); 2821 tcg_gen_shli_i64(d, a, c); 2822 tcg_gen_andi_i64(d, d, mask); 2823 } 2824 2825 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2826 { 2827 uint32_t mask = dup_const(MO_8, 0xff << c); 2828 tcg_gen_shli_i32(d, a, c); 2829 tcg_gen_andi_i32(d, d, mask); 2830 } 2831 2832 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2833 { 2834 uint32_t mask = dup_const(MO_16, 0xffff << c); 2835 tcg_gen_shli_i32(d, a, c); 2836 tcg_gen_andi_i32(d, d, mask); 2837 } 2838 2839 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2840 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2841 { 2842 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2843 static const GVecGen2i g[4] = { 2844 { .fni8 = tcg_gen_vec_shl8i_i64, 2845 .fniv = tcg_gen_shli_vec, 2846 .fno = gen_helper_gvec_shl8i, 2847 .opt_opc = vecop_list, 2848 .vece = MO_8 }, 2849 { .fni8 = tcg_gen_vec_shl16i_i64, 2850 .fniv = tcg_gen_shli_vec, 2851 .fno = gen_helper_gvec_shl16i, 2852 .opt_opc = vecop_list, 2853 .vece = MO_16 }, 2854 { .fni4 = tcg_gen_shli_i32, 2855 .fniv = tcg_gen_shli_vec, 2856 .fno = gen_helper_gvec_shl32i, 2857 .opt_opc = vecop_list, 2858 .vece = MO_32 }, 2859 { .fni8 = tcg_gen_shli_i64, 2860 .fniv = tcg_gen_shli_vec, 2861 .fno = gen_helper_gvec_shl64i, 2862 .opt_opc = vecop_list, 2863 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2864 .vece = MO_64 }, 2865 }; 2866 2867 tcg_debug_assert(vece <= MO_64); 2868 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2869 if (shift == 0) { 2870 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2871 } else { 2872 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2873 } 2874 } 2875 2876 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2877 { 2878 uint64_t mask = dup_const(MO_8, 0xff >> c); 2879 tcg_gen_shri_i64(d, a, c); 2880 tcg_gen_andi_i64(d, d, mask); 2881 } 2882 2883 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2884 { 2885 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2886 tcg_gen_shri_i64(d, a, c); 2887 tcg_gen_andi_i64(d, d, mask); 2888 } 2889 2890 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2891 { 2892 uint32_t mask = dup_const(MO_8, 0xff >> c); 2893 tcg_gen_shri_i32(d, a, c); 2894 tcg_gen_andi_i32(d, d, mask); 2895 } 2896 2897 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2898 { 2899 uint32_t mask = dup_const(MO_16, 0xffff >> c); 2900 tcg_gen_shri_i32(d, a, c); 2901 tcg_gen_andi_i32(d, d, mask); 2902 } 2903 2904 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2905 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2906 { 2907 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2908 static 
const GVecGen2i g[4] = { 2909 { .fni8 = tcg_gen_vec_shr8i_i64, 2910 .fniv = tcg_gen_shri_vec, 2911 .fno = gen_helper_gvec_shr8i, 2912 .opt_opc = vecop_list, 2913 .vece = MO_8 }, 2914 { .fni8 = tcg_gen_vec_shr16i_i64, 2915 .fniv = tcg_gen_shri_vec, 2916 .fno = gen_helper_gvec_shr16i, 2917 .opt_opc = vecop_list, 2918 .vece = MO_16 }, 2919 { .fni4 = tcg_gen_shri_i32, 2920 .fniv = tcg_gen_shri_vec, 2921 .fno = gen_helper_gvec_shr32i, 2922 .opt_opc = vecop_list, 2923 .vece = MO_32 }, 2924 { .fni8 = tcg_gen_shri_i64, 2925 .fniv = tcg_gen_shri_vec, 2926 .fno = gen_helper_gvec_shr64i, 2927 .opt_opc = vecop_list, 2928 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2929 .vece = MO_64 }, 2930 }; 2931 2932 tcg_debug_assert(vece <= MO_64); 2933 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2934 if (shift == 0) { 2935 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2936 } else { 2937 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2938 } 2939 } 2940 2941 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2942 { 2943 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2944 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2945 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2946 2947 tcg_gen_shri_i64(d, a, c); 2948 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2949 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2950 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2951 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2952 tcg_temp_free_i64(s); 2953 } 2954 2955 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2956 { 2957 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2958 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2959 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2960 2961 tcg_gen_shri_i64(d, a, c); 2962 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2963 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2964 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2965 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2966 tcg_temp_free_i64(s); 2967 } 2968 2969 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2970 { 2971 uint32_t s_mask = dup_const(MO_8, 0x80 >> c); 2972 uint32_t c_mask = dup_const(MO_8, 0xff >> c); 2973 TCGv_i32 s = tcg_temp_ebb_new_i32(); 2974 2975 tcg_gen_shri_i32(d, a, c); 2976 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 2977 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 2978 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */ 2979 tcg_gen_or_i32(d, d, s); /* include sign extension */ 2980 tcg_temp_free_i32(s); 2981 } 2982 2983 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2984 { 2985 uint32_t s_mask = dup_const(MO_16, 0x8000 >> c); 2986 uint32_t c_mask = dup_const(MO_16, 0xffff >> c); 2987 TCGv_i32 s = tcg_temp_ebb_new_i32(); 2988 2989 tcg_gen_shri_i32(d, a, c); 2990 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 2991 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */ 2992 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 2993 tcg_gen_or_i32(d, d, s); /* include sign extension */ 2994 tcg_temp_free_i32(s); 2995 } 2996 2997 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2998 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2999 { 3000 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 3001 static const GVecGen2i g[4] = { 3002 { .fni8 = tcg_gen_vec_sar8i_i64, 3003 .fniv = tcg_gen_sari_vec, 3004 
.fno = gen_helper_gvec_sar8i, 3005 .opt_opc = vecop_list, 3006 .vece = MO_8 }, 3007 { .fni8 = tcg_gen_vec_sar16i_i64, 3008 .fniv = tcg_gen_sari_vec, 3009 .fno = gen_helper_gvec_sar16i, 3010 .opt_opc = vecop_list, 3011 .vece = MO_16 }, 3012 { .fni4 = tcg_gen_sari_i32, 3013 .fniv = tcg_gen_sari_vec, 3014 .fno = gen_helper_gvec_sar32i, 3015 .opt_opc = vecop_list, 3016 .vece = MO_32 }, 3017 { .fni8 = tcg_gen_sari_i64, 3018 .fniv = tcg_gen_sari_vec, 3019 .fno = gen_helper_gvec_sar64i, 3020 .opt_opc = vecop_list, 3021 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3022 .vece = MO_64 }, 3023 }; 3024 3025 tcg_debug_assert(vece <= MO_64); 3026 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3027 if (shift == 0) { 3028 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3029 } else { 3030 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3031 } 3032 } 3033 3034 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3035 { 3036 uint64_t mask = dup_const(MO_8, 0xff << c); 3037 3038 tcg_gen_shli_i64(d, a, c); 3039 tcg_gen_shri_i64(a, a, 8 - c); 3040 tcg_gen_andi_i64(d, d, mask); 3041 tcg_gen_andi_i64(a, a, ~mask); 3042 tcg_gen_or_i64(d, d, a); 3043 } 3044 3045 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3046 { 3047 uint64_t mask = dup_const(MO_16, 0xffff << c); 3048 3049 tcg_gen_shli_i64(d, a, c); 3050 tcg_gen_shri_i64(a, a, 16 - c); 3051 tcg_gen_andi_i64(d, d, mask); 3052 tcg_gen_andi_i64(a, a, ~mask); 3053 tcg_gen_or_i64(d, d, a); 3054 } 3055 3056 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 3057 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3058 { 3059 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 3060 static const GVecGen2i g[4] = { 3061 { .fni8 = tcg_gen_vec_rotl8i_i64, 3062 .fniv = tcg_gen_rotli_vec, 3063 .fno = gen_helper_gvec_rotl8i, 3064 .opt_opc = vecop_list, 3065 .vece = MO_8 }, 3066 { .fni8 = tcg_gen_vec_rotl16i_i64, 3067 .fniv = tcg_gen_rotli_vec, 3068 .fno = gen_helper_gvec_rotl16i, 3069 .opt_opc = vecop_list, 3070 .vece = MO_16 }, 3071 { .fni4 = tcg_gen_rotli_i32, 3072 .fniv = tcg_gen_rotli_vec, 3073 .fno = gen_helper_gvec_rotl32i, 3074 .opt_opc = vecop_list, 3075 .vece = MO_32 }, 3076 { .fni8 = tcg_gen_rotli_i64, 3077 .fniv = tcg_gen_rotli_vec, 3078 .fno = gen_helper_gvec_rotl64i, 3079 .opt_opc = vecop_list, 3080 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3081 .vece = MO_64 }, 3082 }; 3083 3084 tcg_debug_assert(vece <= MO_64); 3085 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3086 if (shift == 0) { 3087 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3088 } else { 3089 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3090 } 3091 } 3092 3093 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 3094 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3095 { 3096 tcg_debug_assert(vece <= MO_64); 3097 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3098 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 3099 oprsz, maxsz); 3100 } 3101 3102 /* 3103 * Specialized generation vector shifts by a non-constant scalar. 
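 *
 * The shift count arrives at translation time in a TCGv_i32 rather than
 * as a constant.  do_gvec_shifts below tries, in order: a backend op that
 * shifts every lane by one scalar (s_list), a per-lane variable shift
 * with the scalar broadcast into a vector (v_list), an integral expansion
 * for MO_32/MO_64, and finally an out-of-line helper that receives the
 * shift via the descriptor.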
3104 */ 3105 3106 typedef struct { 3107 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 3108 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 3109 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 3110 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 3111 gen_helper_gvec_2 *fno[4]; 3112 TCGOpcode s_list[2]; 3113 TCGOpcode v_list[2]; 3114 } GVecGen2sh; 3115 3116 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3117 uint32_t oprsz, uint32_t tysz, TCGType type, 3118 TCGv_i32 shift, 3119 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 3120 { 3121 TCGv_vec t0 = tcg_temp_new_vec(type); 3122 uint32_t i; 3123 3124 for (i = 0; i < oprsz; i += tysz) { 3125 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3126 fni(vece, t0, t0, shift); 3127 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3128 } 3129 tcg_temp_free_vec(t0); 3130 } 3131 3132 static void 3133 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 3134 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 3135 { 3136 TCGType type; 3137 uint32_t some; 3138 3139 check_size_align(oprsz, maxsz, dofs | aofs); 3140 check_overlap_2(dofs, aofs, maxsz); 3141 3142 /* If the backend has a scalar expansion, great. */ 3143 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 3144 if (type) { 3145 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3146 switch (type) { 3147 case TCG_TYPE_V256: 3148 some = QEMU_ALIGN_DOWN(oprsz, 32); 3149 expand_2sh_vec(vece, dofs, aofs, some, 32, 3150 TCG_TYPE_V256, shift, g->fniv_s); 3151 if (some == oprsz) { 3152 break; 3153 } 3154 dofs += some; 3155 aofs += some; 3156 oprsz -= some; 3157 maxsz -= some; 3158 /* fallthru */ 3159 case TCG_TYPE_V128: 3160 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 3161 TCG_TYPE_V128, shift, g->fniv_s); 3162 break; 3163 case TCG_TYPE_V64: 3164 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 3165 TCG_TYPE_V64, shift, g->fniv_s); 3166 break; 3167 default: 3168 g_assert_not_reached(); 3169 } 3170 tcg_swap_vecop_list(hold_list); 3171 goto clear_tail; 3172 } 3173 3174 /* If the backend supports variable vector shifts, also cool. */ 3175 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 3176 if (type) { 3177 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3178 TCGv_vec v_shift = tcg_temp_new_vec(type); 3179 3180 if (vece == MO_64) { 3181 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3182 tcg_gen_extu_i32_i64(sh64, shift); 3183 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 3184 tcg_temp_free_i64(sh64); 3185 } else { 3186 tcg_gen_dup_i32_vec(vece, v_shift, shift); 3187 } 3188 3189 switch (type) { 3190 case TCG_TYPE_V256: 3191 some = QEMU_ALIGN_DOWN(oprsz, 32); 3192 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 3193 v_shift, false, g->fniv_v); 3194 if (some == oprsz) { 3195 break; 3196 } 3197 dofs += some; 3198 aofs += some; 3199 oprsz -= some; 3200 maxsz -= some; 3201 /* fallthru */ 3202 case TCG_TYPE_V128: 3203 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 3204 v_shift, false, g->fniv_v); 3205 break; 3206 case TCG_TYPE_V64: 3207 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 3208 v_shift, false, g->fniv_v); 3209 break; 3210 default: 3211 g_assert_not_reached(); 3212 } 3213 tcg_temp_free_vec(v_shift); 3214 tcg_swap_vecop_list(hold_list); 3215 goto clear_tail; 3216 } 3217 3218 /* Otherwise fall back to integral... 
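       For MO_8 and MO_16 there is no integral expansion, so the
       out-of-line helper receives the runtime shift folded into the
       SIMD data field of the descriptor.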
*/ 3219 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3220 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 3221 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3222 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3223 tcg_gen_extu_i32_i64(sh64, shift); 3224 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 3225 tcg_temp_free_i64(sh64); 3226 } else { 3227 TCGv_ptr a0 = tcg_temp_ebb_new_ptr(); 3228 TCGv_ptr a1 = tcg_temp_ebb_new_ptr(); 3229 TCGv_i32 desc = tcg_temp_ebb_new_i32(); 3230 3231 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 3232 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 3233 tcg_gen_addi_ptr(a0, cpu_env, dofs); 3234 tcg_gen_addi_ptr(a1, cpu_env, aofs); 3235 3236 g->fno[vece](a0, a1, desc); 3237 3238 tcg_temp_free_ptr(a0); 3239 tcg_temp_free_ptr(a1); 3240 tcg_temp_free_i32(desc); 3241 return; 3242 } 3243 3244 clear_tail: 3245 if (oprsz < maxsz) { 3246 expand_clr(dofs + oprsz, maxsz - oprsz); 3247 } 3248 } 3249 3250 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 3251 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3252 { 3253 static const GVecGen2sh g = { 3254 .fni4 = tcg_gen_shl_i32, 3255 .fni8 = tcg_gen_shl_i64, 3256 .fniv_s = tcg_gen_shls_vec, 3257 .fniv_v = tcg_gen_shlv_vec, 3258 .fno = { 3259 gen_helper_gvec_shl8i, 3260 gen_helper_gvec_shl16i, 3261 gen_helper_gvec_shl32i, 3262 gen_helper_gvec_shl64i, 3263 }, 3264 .s_list = { INDEX_op_shls_vec, 0 }, 3265 .v_list = { INDEX_op_shlv_vec, 0 }, 3266 }; 3267 3268 tcg_debug_assert(vece <= MO_64); 3269 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3270 } 3271 3272 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3273 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3274 { 3275 static const GVecGen2sh g = { 3276 .fni4 = tcg_gen_shr_i32, 3277 .fni8 = tcg_gen_shr_i64, 3278 .fniv_s = tcg_gen_shrs_vec, 3279 .fniv_v = tcg_gen_shrv_vec, 3280 .fno = { 3281 gen_helper_gvec_shr8i, 3282 gen_helper_gvec_shr16i, 3283 gen_helper_gvec_shr32i, 3284 gen_helper_gvec_shr64i, 3285 }, 3286 .s_list = { INDEX_op_shrs_vec, 0 }, 3287 .v_list = { INDEX_op_shrv_vec, 0 }, 3288 }; 3289 3290 tcg_debug_assert(vece <= MO_64); 3291 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3292 } 3293 3294 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 3295 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3296 { 3297 static const GVecGen2sh g = { 3298 .fni4 = tcg_gen_sar_i32, 3299 .fni8 = tcg_gen_sar_i64, 3300 .fniv_s = tcg_gen_sars_vec, 3301 .fniv_v = tcg_gen_sarv_vec, 3302 .fno = { 3303 gen_helper_gvec_sar8i, 3304 gen_helper_gvec_sar16i, 3305 gen_helper_gvec_sar32i, 3306 gen_helper_gvec_sar64i, 3307 }, 3308 .s_list = { INDEX_op_sars_vec, 0 }, 3309 .v_list = { INDEX_op_sarv_vec, 0 }, 3310 }; 3311 3312 tcg_debug_assert(vece <= MO_64); 3313 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3314 } 3315 3316 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, 3317 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3318 { 3319 static const GVecGen2sh g = { 3320 .fni4 = tcg_gen_rotl_i32, 3321 .fni8 = tcg_gen_rotl_i64, 3322 .fniv_s = tcg_gen_rotls_vec, 3323 .fniv_v = tcg_gen_rotlv_vec, 3324 .fno = { 3325 gen_helper_gvec_rotl8i, 3326 gen_helper_gvec_rotl16i, 3327 gen_helper_gvec_rotl32i, 3328 gen_helper_gvec_rotl64i, 3329 }, 3330 .s_list = { INDEX_op_rotls_vec, 0 }, 3331 .v_list = { INDEX_op_rotlv_vec, 0 }, 3332 }; 3333 3334 tcg_debug_assert(vece <= MO_64); 3335 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3336 } 3337 3338 /* 3339 * 
Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, vector shifts
 * cannot easily do so, so the modulo is applied here.  If the target
 * naturally includes the modulo as part of the operation, great!
 * If the target has some other behaviour for out-of-range shifts,
 * then it could not use this function anyway, and would need to
 * do its own expansion with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3409 */ 3410 3411 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3412 TCGv_vec a, TCGv_vec b) 3413 { 3414 TCGv_vec t = tcg_temp_new_vec_matching(d); 3415 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3416 3417 tcg_gen_and_vec(vece, t, b, m); 3418 tcg_gen_shrv_vec(vece, d, a, t); 3419 tcg_temp_free_vec(t); 3420 } 3421 3422 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3423 { 3424 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3425 3426 tcg_gen_andi_i32(t, b, 31); 3427 tcg_gen_shr_i32(d, a, t); 3428 tcg_temp_free_i32(t); 3429 } 3430 3431 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3432 { 3433 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3434 3435 tcg_gen_andi_i64(t, b, 63); 3436 tcg_gen_shr_i64(d, a, t); 3437 tcg_temp_free_i64(t); 3438 } 3439 3440 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3441 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3442 { 3443 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3444 static const GVecGen3 g[4] = { 3445 { .fniv = tcg_gen_shrv_mod_vec, 3446 .fno = gen_helper_gvec_shr8v, 3447 .opt_opc = vecop_list, 3448 .vece = MO_8 }, 3449 { .fniv = tcg_gen_shrv_mod_vec, 3450 .fno = gen_helper_gvec_shr16v, 3451 .opt_opc = vecop_list, 3452 .vece = MO_16 }, 3453 { .fni4 = tcg_gen_shr_mod_i32, 3454 .fniv = tcg_gen_shrv_mod_vec, 3455 .fno = gen_helper_gvec_shr32v, 3456 .opt_opc = vecop_list, 3457 .vece = MO_32 }, 3458 { .fni8 = tcg_gen_shr_mod_i64, 3459 .fniv = tcg_gen_shrv_mod_vec, 3460 .fno = gen_helper_gvec_shr64v, 3461 .opt_opc = vecop_list, 3462 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3463 .vece = MO_64 }, 3464 }; 3465 3466 tcg_debug_assert(vece <= MO_64); 3467 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3468 } 3469 3470 /* 3471 * Similarly for arithmetic right shifts. 
3472 */ 3473 3474 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3475 TCGv_vec a, TCGv_vec b) 3476 { 3477 TCGv_vec t = tcg_temp_new_vec_matching(d); 3478 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3479 3480 tcg_gen_and_vec(vece, t, b, m); 3481 tcg_gen_sarv_vec(vece, d, a, t); 3482 tcg_temp_free_vec(t); 3483 } 3484 3485 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3486 { 3487 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3488 3489 tcg_gen_andi_i32(t, b, 31); 3490 tcg_gen_sar_i32(d, a, t); 3491 tcg_temp_free_i32(t); 3492 } 3493 3494 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3495 { 3496 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3497 3498 tcg_gen_andi_i64(t, b, 63); 3499 tcg_gen_sar_i64(d, a, t); 3500 tcg_temp_free_i64(t); 3501 } 3502 3503 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3504 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3505 { 3506 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3507 static const GVecGen3 g[4] = { 3508 { .fniv = tcg_gen_sarv_mod_vec, 3509 .fno = gen_helper_gvec_sar8v, 3510 .opt_opc = vecop_list, 3511 .vece = MO_8 }, 3512 { .fniv = tcg_gen_sarv_mod_vec, 3513 .fno = gen_helper_gvec_sar16v, 3514 .opt_opc = vecop_list, 3515 .vece = MO_16 }, 3516 { .fni4 = tcg_gen_sar_mod_i32, 3517 .fniv = tcg_gen_sarv_mod_vec, 3518 .fno = gen_helper_gvec_sar32v, 3519 .opt_opc = vecop_list, 3520 .vece = MO_32 }, 3521 { .fni8 = tcg_gen_sar_mod_i64, 3522 .fniv = tcg_gen_sarv_mod_vec, 3523 .fno = gen_helper_gvec_sar64v, 3524 .opt_opc = vecop_list, 3525 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3526 .vece = MO_64 }, 3527 }; 3528 3529 tcg_debug_assert(vece <= MO_64); 3530 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3531 } 3532 3533 /* 3534 * Similarly for rotates. 
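 * Masking the count also makes a rotate by the full element width
 * behave as a rotate by zero.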
3535 */ 3536 3537 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3538 TCGv_vec a, TCGv_vec b) 3539 { 3540 TCGv_vec t = tcg_temp_new_vec_matching(d); 3541 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3542 3543 tcg_gen_and_vec(vece, t, b, m); 3544 tcg_gen_rotlv_vec(vece, d, a, t); 3545 tcg_temp_free_vec(t); 3546 } 3547 3548 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3549 { 3550 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3551 3552 tcg_gen_andi_i32(t, b, 31); 3553 tcg_gen_rotl_i32(d, a, t); 3554 tcg_temp_free_i32(t); 3555 } 3556 3557 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3558 { 3559 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3560 3561 tcg_gen_andi_i64(t, b, 63); 3562 tcg_gen_rotl_i64(d, a, t); 3563 tcg_temp_free_i64(t); 3564 } 3565 3566 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3567 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3568 { 3569 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3570 static const GVecGen3 g[4] = { 3571 { .fniv = tcg_gen_rotlv_mod_vec, 3572 .fno = gen_helper_gvec_rotl8v, 3573 .opt_opc = vecop_list, 3574 .vece = MO_8 }, 3575 { .fniv = tcg_gen_rotlv_mod_vec, 3576 .fno = gen_helper_gvec_rotl16v, 3577 .opt_opc = vecop_list, 3578 .vece = MO_16 }, 3579 { .fni4 = tcg_gen_rotl_mod_i32, 3580 .fniv = tcg_gen_rotlv_mod_vec, 3581 .fno = gen_helper_gvec_rotl32v, 3582 .opt_opc = vecop_list, 3583 .vece = MO_32 }, 3584 { .fni8 = tcg_gen_rotl_mod_i64, 3585 .fniv = tcg_gen_rotlv_mod_vec, 3586 .fno = gen_helper_gvec_rotl64v, 3587 .opt_opc = vecop_list, 3588 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3589 .vece = MO_64 }, 3590 }; 3591 3592 tcg_debug_assert(vece <= MO_64); 3593 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3594 } 3595 3596 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3597 TCGv_vec a, TCGv_vec b) 3598 { 3599 TCGv_vec t = tcg_temp_new_vec_matching(d); 3600 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3601 3602 tcg_gen_and_vec(vece, t, b, m); 3603 tcg_gen_rotrv_vec(vece, d, a, t); 3604 tcg_temp_free_vec(t); 3605 } 3606 3607 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3608 { 3609 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3610 3611 tcg_gen_andi_i32(t, b, 31); 3612 tcg_gen_rotr_i32(d, a, t); 3613 tcg_temp_free_i32(t); 3614 } 3615 3616 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3617 { 3618 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3619 3620 tcg_gen_andi_i64(t, b, 63); 3621 tcg_gen_rotr_i64(d, a, t); 3622 tcg_temp_free_i64(t); 3623 } 3624 3625 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3626 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3627 { 3628 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3629 static const GVecGen3 g[4] = { 3630 { .fniv = tcg_gen_rotrv_mod_vec, 3631 .fno = gen_helper_gvec_rotr8v, 3632 .opt_opc = vecop_list, 3633 .vece = MO_8 }, 3634 { .fniv = tcg_gen_rotrv_mod_vec, 3635 .fno = gen_helper_gvec_rotr16v, 3636 .opt_opc = vecop_list, 3637 .vece = MO_16 }, 3638 { .fni4 = tcg_gen_rotr_mod_i32, 3639 .fniv = tcg_gen_rotrv_mod_vec, 3640 .fno = gen_helper_gvec_rotr32v, 3641 .opt_opc = vecop_list, 3642 .vece = MO_32 }, 3643 { .fni8 = tcg_gen_rotr_mod_i64, 3644 .fniv = tcg_gen_rotrv_mod_vec, 3645 .fno = gen_helper_gvec_rotr64v, 3646 .opt_opc = vecop_list, 3647 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3648 .vece = MO_64 }, 3649 }; 3650 3651 tcg_debug_assert(vece <= MO_64); 3652 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, 
maxsz, &g[vece]); 3653 } 3654 3655 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 3656 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3657 uint32_t oprsz, TCGCond cond) 3658 { 3659 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3660 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3661 uint32_t i; 3662 3663 for (i = 0; i < oprsz; i += 4) { 3664 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3665 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3666 tcg_gen_setcond_i32(cond, t0, t0, t1); 3667 tcg_gen_neg_i32(t0, t0); 3668 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3669 } 3670 tcg_temp_free_i32(t1); 3671 tcg_temp_free_i32(t0); 3672 } 3673 3674 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3675 uint32_t oprsz, TCGCond cond) 3676 { 3677 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3678 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 3679 uint32_t i; 3680 3681 for (i = 0; i < oprsz; i += 8) { 3682 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3683 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3684 tcg_gen_setcond_i64(cond, t0, t0, t1); 3685 tcg_gen_neg_i64(t0, t0); 3686 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3687 } 3688 tcg_temp_free_i64(t1); 3689 tcg_temp_free_i64(t0); 3690 } 3691 3692 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3693 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3694 TCGType type, TCGCond cond) 3695 { 3696 TCGv_vec t0 = tcg_temp_new_vec(type); 3697 TCGv_vec t1 = tcg_temp_new_vec(type); 3698 uint32_t i; 3699 3700 for (i = 0; i < oprsz; i += tysz) { 3701 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3702 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3703 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3704 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3705 } 3706 tcg_temp_free_vec(t1); 3707 tcg_temp_free_vec(t0); 3708 } 3709 3710 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3711 uint32_t aofs, uint32_t bofs, 3712 uint32_t oprsz, uint32_t maxsz) 3713 { 3714 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3715 static gen_helper_gvec_3 * const eq_fn[4] = { 3716 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3717 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3718 }; 3719 static gen_helper_gvec_3 * const ne_fn[4] = { 3720 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3721 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3722 }; 3723 static gen_helper_gvec_3 * const lt_fn[4] = { 3724 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3725 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3726 }; 3727 static gen_helper_gvec_3 * const le_fn[4] = { 3728 gen_helper_gvec_le8, gen_helper_gvec_le16, 3729 gen_helper_gvec_le32, gen_helper_gvec_le64 3730 }; 3731 static gen_helper_gvec_3 * const ltu_fn[4] = { 3732 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3733 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3734 }; 3735 static gen_helper_gvec_3 * const leu_fn[4] = { 3736 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3737 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3738 }; 3739 static gen_helper_gvec_3 * const * const fns[16] = { 3740 [TCG_COND_EQ] = eq_fn, 3741 [TCG_COND_NE] = ne_fn, 3742 [TCG_COND_LT] = lt_fn, 3743 [TCG_COND_LE] = le_fn, 3744 [TCG_COND_LTU] = ltu_fn, 3745 [TCG_COND_LEU] = leu_fn, 3746 }; 3747 3748 const TCGOpcode *hold_list; 3749 TCGType type; 3750 uint32_t some; 3751 3752 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3753 check_overlap_3(dofs, aofs, bofs, maxsz); 3754 3755 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3756 do_dup(MO_8, dofs, oprsz, maxsz, 3757 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3758 return; 3759 } 3760 3761 /* 3762 * 
Implement inline with a vector type, if possible. 3763 * Prefer integer when 64-bit host and 64-bit comparison. 3764 */ 3765 hold_list = tcg_swap_vecop_list(cmp_list); 3766 type = choose_vector_type(cmp_list, vece, oprsz, 3767 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3768 switch (type) { 3769 case TCG_TYPE_V256: 3770 /* Recall that ARM SVE allows vector sizes that are not a 3771 * power of 2, but always a multiple of 16. The intent is 3772 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3773 */ 3774 some = QEMU_ALIGN_DOWN(oprsz, 32); 3775 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3776 if (some == oprsz) { 3777 break; 3778 } 3779 dofs += some; 3780 aofs += some; 3781 bofs += some; 3782 oprsz -= some; 3783 maxsz -= some; 3784 /* fallthru */ 3785 case TCG_TYPE_V128: 3786 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3787 break; 3788 case TCG_TYPE_V64: 3789 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3790 break; 3791 3792 case 0: 3793 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3794 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3795 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3796 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3797 } else { 3798 gen_helper_gvec_3 * const *fn = fns[cond]; 3799 3800 if (fn == NULL) { 3801 uint32_t tmp; 3802 tmp = aofs, aofs = bofs, bofs = tmp; 3803 cond = tcg_swap_cond(cond); 3804 fn = fns[cond]; 3805 assert(fn != NULL); 3806 } 3807 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3808 oprsz = maxsz; 3809 } 3810 break; 3811 3812 default: 3813 g_assert_not_reached(); 3814 } 3815 tcg_swap_vecop_list(hold_list); 3816 3817 if (oprsz < maxsz) { 3818 expand_clr(dofs + oprsz, maxsz - oprsz); 3819 } 3820 } 3821 3822 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3823 { 3824 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3825 3826 tcg_gen_and_i64(t, b, a); 3827 tcg_gen_andc_i64(d, c, a); 3828 tcg_gen_or_i64(d, d, t); 3829 tcg_temp_free_i64(t); 3830 } 3831 3832 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3833 uint32_t bofs, uint32_t cofs, 3834 uint32_t oprsz, uint32_t maxsz) 3835 { 3836 static const GVecGen4 g = { 3837 .fni8 = tcg_gen_bitsel_i64, 3838 .fniv = tcg_gen_bitsel_vec, 3839 .fno = gen_helper_gvec_bitsel, 3840 }; 3841 3842 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3843 } 3844