1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg/tcg.h" 22 #include "tcg/tcg-op.h" 23 #include "tcg/tcg-op-gvec.h" 24 #include "qemu/main-loop.h" 25 #include "tcg/tcg-gvec-desc.h" 26 27 #define MAX_UNROLL 4 28 29 #ifdef CONFIG_DEBUG_TCG 30 static const TCGOpcode vecop_list_empty[1] = { 0 }; 31 #else 32 #define vecop_list_empty NULL 33 #endif 34 35 36 /* Verify vector size and alignment rules. OFS should be the OR of all 37 of the operand offsets so that we can check them all at once. */ 38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 39 { 40 uint32_t max_align; 41 42 switch (oprsz) { 43 case 8: 44 case 16: 45 case 32: 46 tcg_debug_assert(oprsz <= maxsz); 47 break; 48 default: 49 tcg_debug_assert(oprsz == maxsz); 50 break; 51 } 52 tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS)); 53 54 max_align = maxsz >= 16 ? 15 : 7; 55 tcg_debug_assert((maxsz & max_align) == 0); 56 tcg_debug_assert((ofs & max_align) == 0); 57 } 58 59 /* Verify vector overlap rules for two operands. */ 60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 61 { 62 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 63 } 64 65 /* Verify vector overlap rules for three operands. */ 66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 67 { 68 check_overlap_2(d, a, s); 69 check_overlap_2(d, b, s); 70 check_overlap_2(a, b, s); 71 } 72 73 /* Verify vector overlap rules for four operands. */ 74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 75 uint32_t c, uint32_t s) 76 { 77 check_overlap_2(d, a, s); 78 check_overlap_2(d, b, s); 79 check_overlap_2(d, c, s); 80 check_overlap_2(a, b, s); 81 check_overlap_2(a, c, s); 82 check_overlap_2(b, c, s); 83 } 84 85 /* Create a descriptor from components. */ 86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 87 { 88 uint32_t desc = 0; 89 90 check_size_align(oprsz, maxsz, 0); 91 tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS)); 92 93 oprsz = (oprsz / 8) - 1; 94 maxsz = (maxsz / 8) - 1; 95 96 /* 97 * We have just asserted in check_size_align that either 98 * oprsz is {8,16,32} or matches maxsz. Encode the final 99 * case with '2', as that would otherwise map to 24. 100 */ 101 if (oprsz == maxsz) { 102 oprsz = 2; 103 } 104 105 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 106 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 107 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 108 109 return desc; 110 } 111 112 /* Generate a call to a gvec-style helper with two vector operands. 
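   For illustration only: a front end passes env-relative byte offsets of the
   destination and source registers, e.g.
     tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_foo);
   where gen_helper_foo is a hypothetical out-of-line helper whose C
   implementation has the shape void helper_foo(void *d, void *a, uint32_t desc).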
*/ 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 114 uint32_t oprsz, uint32_t maxsz, int32_t data, 115 gen_helper_gvec_2 *fn) 116 { 117 TCGv_ptr a0, a1; 118 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 119 120 a0 = tcg_temp_new_ptr(); 121 a1 = tcg_temp_new_ptr(); 122 123 tcg_gen_addi_ptr(a0, cpu_env, dofs); 124 tcg_gen_addi_ptr(a1, cpu_env, aofs); 125 126 fn(a0, a1, desc); 127 128 tcg_temp_free_ptr(a0); 129 tcg_temp_free_ptr(a1); 130 tcg_temp_free_i32(desc); 131 } 132 133 /* Generate a call to a gvec-style helper with two vector operands 134 and one scalar operand. */ 135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 136 uint32_t oprsz, uint32_t maxsz, int32_t data, 137 gen_helper_gvec_2i *fn) 138 { 139 TCGv_ptr a0, a1; 140 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 141 142 a0 = tcg_temp_new_ptr(); 143 a1 = tcg_temp_new_ptr(); 144 145 tcg_gen_addi_ptr(a0, cpu_env, dofs); 146 tcg_gen_addi_ptr(a1, cpu_env, aofs); 147 148 fn(a0, a1, c, desc); 149 150 tcg_temp_free_ptr(a0); 151 tcg_temp_free_ptr(a1); 152 tcg_temp_free_i32(desc); 153 } 154 155 /* Generate a call to a gvec-style helper with three vector operands. */ 156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 157 uint32_t oprsz, uint32_t maxsz, int32_t data, 158 gen_helper_gvec_3 *fn) 159 { 160 TCGv_ptr a0, a1, a2; 161 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 162 163 a0 = tcg_temp_new_ptr(); 164 a1 = tcg_temp_new_ptr(); 165 a2 = tcg_temp_new_ptr(); 166 167 tcg_gen_addi_ptr(a0, cpu_env, dofs); 168 tcg_gen_addi_ptr(a1, cpu_env, aofs); 169 tcg_gen_addi_ptr(a2, cpu_env, bofs); 170 171 fn(a0, a1, a2, desc); 172 173 tcg_temp_free_ptr(a0); 174 tcg_temp_free_ptr(a1); 175 tcg_temp_free_ptr(a2); 176 tcg_temp_free_i32(desc); 177 } 178 179 /* Generate a call to a gvec-style helper with four vector operands. */ 180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 181 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 182 int32_t data, gen_helper_gvec_4 *fn) 183 { 184 TCGv_ptr a0, a1, a2, a3; 185 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 186 187 a0 = tcg_temp_new_ptr(); 188 a1 = tcg_temp_new_ptr(); 189 a2 = tcg_temp_new_ptr(); 190 a3 = tcg_temp_new_ptr(); 191 192 tcg_gen_addi_ptr(a0, cpu_env, dofs); 193 tcg_gen_addi_ptr(a1, cpu_env, aofs); 194 tcg_gen_addi_ptr(a2, cpu_env, bofs); 195 tcg_gen_addi_ptr(a3, cpu_env, cofs); 196 197 fn(a0, a1, a2, a3, desc); 198 199 tcg_temp_free_ptr(a0); 200 tcg_temp_free_ptr(a1); 201 tcg_temp_free_ptr(a2); 202 tcg_temp_free_ptr(a3); 203 tcg_temp_free_i32(desc); 204 } 205 206 /* Generate a call to a gvec-style helper with five vector operands. 
*/ 207 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 208 uint32_t cofs, uint32_t xofs, uint32_t oprsz, 209 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) 210 { 211 TCGv_ptr a0, a1, a2, a3, a4; 212 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 213 214 a0 = tcg_temp_new_ptr(); 215 a1 = tcg_temp_new_ptr(); 216 a2 = tcg_temp_new_ptr(); 217 a3 = tcg_temp_new_ptr(); 218 a4 = tcg_temp_new_ptr(); 219 220 tcg_gen_addi_ptr(a0, cpu_env, dofs); 221 tcg_gen_addi_ptr(a1, cpu_env, aofs); 222 tcg_gen_addi_ptr(a2, cpu_env, bofs); 223 tcg_gen_addi_ptr(a3, cpu_env, cofs); 224 tcg_gen_addi_ptr(a4, cpu_env, xofs); 225 226 fn(a0, a1, a2, a3, a4, desc); 227 228 tcg_temp_free_ptr(a0); 229 tcg_temp_free_ptr(a1); 230 tcg_temp_free_ptr(a2); 231 tcg_temp_free_ptr(a3); 232 tcg_temp_free_ptr(a4); 233 tcg_temp_free_i32(desc); 234 } 235 236 /* Generate a call to a gvec-style helper with three vector operands 237 and an extra pointer operand. */ 238 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, 239 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 240 int32_t data, gen_helper_gvec_2_ptr *fn) 241 { 242 TCGv_ptr a0, a1; 243 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 244 245 a0 = tcg_temp_new_ptr(); 246 a1 = tcg_temp_new_ptr(); 247 248 tcg_gen_addi_ptr(a0, cpu_env, dofs); 249 tcg_gen_addi_ptr(a1, cpu_env, aofs); 250 251 fn(a0, a1, ptr, desc); 252 253 tcg_temp_free_ptr(a0); 254 tcg_temp_free_ptr(a1); 255 tcg_temp_free_i32(desc); 256 } 257 258 /* Generate a call to a gvec-style helper with three vector operands 259 and an extra pointer operand. */ 260 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 261 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 262 int32_t data, gen_helper_gvec_3_ptr *fn) 263 { 264 TCGv_ptr a0, a1, a2; 265 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 266 267 a0 = tcg_temp_new_ptr(); 268 a1 = tcg_temp_new_ptr(); 269 a2 = tcg_temp_new_ptr(); 270 271 tcg_gen_addi_ptr(a0, cpu_env, dofs); 272 tcg_gen_addi_ptr(a1, cpu_env, aofs); 273 tcg_gen_addi_ptr(a2, cpu_env, bofs); 274 275 fn(a0, a1, a2, ptr, desc); 276 277 tcg_temp_free_ptr(a0); 278 tcg_temp_free_ptr(a1); 279 tcg_temp_free_ptr(a2); 280 tcg_temp_free_i32(desc); 281 } 282 283 /* Generate a call to a gvec-style helper with four vector operands 284 and an extra pointer operand. */ 285 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 286 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, 287 uint32_t maxsz, int32_t data, 288 gen_helper_gvec_4_ptr *fn) 289 { 290 TCGv_ptr a0, a1, a2, a3; 291 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 292 293 a0 = tcg_temp_new_ptr(); 294 a1 = tcg_temp_new_ptr(); 295 a2 = tcg_temp_new_ptr(); 296 a3 = tcg_temp_new_ptr(); 297 298 tcg_gen_addi_ptr(a0, cpu_env, dofs); 299 tcg_gen_addi_ptr(a1, cpu_env, aofs); 300 tcg_gen_addi_ptr(a2, cpu_env, bofs); 301 tcg_gen_addi_ptr(a3, cpu_env, cofs); 302 303 fn(a0, a1, a2, a3, ptr, desc); 304 305 tcg_temp_free_ptr(a0); 306 tcg_temp_free_ptr(a1); 307 tcg_temp_free_ptr(a2); 308 tcg_temp_free_ptr(a3); 309 tcg_temp_free_i32(desc); 310 } 311 312 /* Generate a call to a gvec-style helper with five vector operands 313 and an extra pointer operand. 
*/ 314 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 315 uint32_t cofs, uint32_t eofs, TCGv_ptr ptr, 316 uint32_t oprsz, uint32_t maxsz, int32_t data, 317 gen_helper_gvec_5_ptr *fn) 318 { 319 TCGv_ptr a0, a1, a2, a3, a4; 320 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 321 322 a0 = tcg_temp_new_ptr(); 323 a1 = tcg_temp_new_ptr(); 324 a2 = tcg_temp_new_ptr(); 325 a3 = tcg_temp_new_ptr(); 326 a4 = tcg_temp_new_ptr(); 327 328 tcg_gen_addi_ptr(a0, cpu_env, dofs); 329 tcg_gen_addi_ptr(a1, cpu_env, aofs); 330 tcg_gen_addi_ptr(a2, cpu_env, bofs); 331 tcg_gen_addi_ptr(a3, cpu_env, cofs); 332 tcg_gen_addi_ptr(a4, cpu_env, eofs); 333 334 fn(a0, a1, a2, a3, a4, ptr, desc); 335 336 tcg_temp_free_ptr(a0); 337 tcg_temp_free_ptr(a1); 338 tcg_temp_free_ptr(a2); 339 tcg_temp_free_ptr(a3); 340 tcg_temp_free_ptr(a4); 341 tcg_temp_free_i32(desc); 342 } 343 344 /* Return true if we want to implement something of OPRSZ bytes 345 in units of LNSZ. This limits the expansion of inline code. */ 346 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) 347 { 348 uint32_t q, r; 349 350 if (oprsz < lnsz) { 351 return false; 352 } 353 354 q = oprsz / lnsz; 355 r = oprsz % lnsz; 356 tcg_debug_assert((r & 7) == 0); 357 358 if (lnsz < 16) { 359 /* For sizes below 16, accept no remainder. */ 360 if (r != 0) { 361 return false; 362 } 363 } else { 364 /* 365 * Recall that ARM SVE allows vector sizes that are not a 366 * power of 2, but always a multiple of 16. The intent is 367 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 368 * In addition, expand_clr needs to handle a multiple of 8. 369 * Thus we can handle the tail with one more operation per 370 * diminishing power of 2. 371 */ 372 q += ctpop32(r); 373 } 374 375 return q <= MAX_UNROLL; 376 } 377 378 static void expand_clr(uint32_t dofs, uint32_t maxsz); 379 380 /* Duplicate C as per VECE. */ 381 uint64_t (dup_const)(unsigned vece, uint64_t c) 382 { 383 switch (vece) { 384 case MO_8: 385 return 0x0101010101010101ull * (uint8_t)c; 386 case MO_16: 387 return 0x0001000100010001ull * (uint16_t)c; 388 case MO_32: 389 return 0x0000000100000001ull * (uint32_t)c; 390 case MO_64: 391 return c; 392 default: 393 g_assert_not_reached(); 394 } 395 } 396 397 /* Duplicate IN into OUT as per VECE. */ 398 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) 399 { 400 switch (vece) { 401 case MO_8: 402 tcg_gen_ext8u_i32(out, in); 403 tcg_gen_muli_i32(out, out, 0x01010101); 404 break; 405 case MO_16: 406 tcg_gen_deposit_i32(out, in, in, 16, 16); 407 break; 408 case MO_32: 409 tcg_gen_mov_i32(out, in); 410 break; 411 default: 412 g_assert_not_reached(); 413 } 414 } 415 416 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) 417 { 418 switch (vece) { 419 case MO_8: 420 tcg_gen_ext8u_i64(out, in); 421 tcg_gen_muli_i64(out, out, 0x0101010101010101ull); 422 break; 423 case MO_16: 424 tcg_gen_ext16u_i64(out, in); 425 tcg_gen_muli_i64(out, out, 0x0001000100010001ull); 426 break; 427 case MO_32: 428 tcg_gen_deposit_i64(out, in, in, 32, 32); 429 break; 430 case MO_64: 431 tcg_gen_mov_i64(out, in); 432 break; 433 default: 434 g_assert_not_reached(); 435 } 436 } 437 438 /* Select a supported vector type for implementing an operation on SIZE 439 * bytes. If OP is 0, assume that the real operation to be performed is 440 * required by all backends. Otherwise, make sure than OP can be performed 441 * on elements of size VECE in the selected type. Do not select V64 if 442 * PREFER_I64 is true. 
Return 0 if no vector type is selected. 443 */ 444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 445 uint32_t size, bool prefer_i64) 446 { 447 /* 448 * Recall that ARM SVE allows vector sizes that are not a 449 * power of 2, but always a multiple of 16. The intent is 450 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 451 * It is hard to imagine a case in which v256 is supported 452 * but v128 is not, but check anyway. 453 * In addition, expand_clr needs to handle a multiple of 8. 454 */ 455 if (TCG_TARGET_HAS_v256 && 456 check_size_impl(size, 32) && 457 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 458 (!(size & 16) || 459 (TCG_TARGET_HAS_v128 && 460 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 461 (!(size & 8) || 462 (TCG_TARGET_HAS_v64 && 463 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 464 return TCG_TYPE_V256; 465 } 466 if (TCG_TARGET_HAS_v128 && 467 check_size_impl(size, 16) && 468 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 469 (!(size & 8) || 470 (TCG_TARGET_HAS_v64 && 471 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 472 return TCG_TYPE_V128; 473 } 474 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 475 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 476 return TCG_TYPE_V64; 477 } 478 return 0; 479 } 480 481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 482 uint32_t maxsz, TCGv_vec t_vec) 483 { 484 uint32_t i = 0; 485 486 tcg_debug_assert(oprsz >= 8); 487 488 /* 489 * This may be expand_clr for the tail of an operation, e.g. 490 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 491 * are misaligned wrt the maximum vector size, so do that first. 492 */ 493 if (dofs & 8) { 494 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 495 i += 8; 496 } 497 498 switch (type) { 499 case TCG_TYPE_V256: 500 /* 501 * Recall that ARM SVE allows vector sizes that are not a 502 * power of 2, but always a multiple of 16. The intent is 503 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 504 */ 505 for (; i + 32 <= oprsz; i += 32) { 506 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 507 } 508 /* fallthru */ 509 case TCG_TYPE_V128: 510 for (; i + 16 <= oprsz; i += 16) { 511 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 512 } 513 break; 514 case TCG_TYPE_V64: 515 for (; i < oprsz; i += 8) { 516 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 517 } 518 break; 519 default: 520 g_assert_not_reached(); 521 } 522 523 if (oprsz < maxsz) { 524 expand_clr(dofs + oprsz, maxsz - oprsz); 525 } 526 } 527 528 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 529 * Only one of IN_32 or IN_64 may be set; 530 * IN_C is used if IN_32 and IN_64 are unset. 531 */ 532 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 533 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 534 uint64_t in_c) 535 { 536 TCGType type; 537 TCGv_i64 t_64; 538 TCGv_i32 t_32, t_desc; 539 TCGv_ptr t_ptr; 540 uint32_t i; 541 542 assert(vece <= (in_32 ? MO_32 : MO_64)); 543 assert(in_32 == NULL || in_64 == NULL); 544 545 /* If we're storing 0, expand oprsz to maxsz. */ 546 if (in_32 == NULL && in_64 == NULL) { 547 in_c = dup_const(vece, in_c); 548 if (in_c == 0) { 549 oprsz = maxsz; 550 vece = MO_8; 551 } else if (in_c == dup_const(MO_8, in_c)) { 552 vece = MO_8; 553 } 554 } 555 556 /* Implement inline with a vector type, if possible. 557 * Prefer integer when 64-bit host and no variable dup. 
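 * (Preferring integers here only rules out the 8-byte host vector type:
 * e.g. a constant splat into an 8-byte slot is then done with a single
 * 64-bit integer store below rather than via a V64 temporary.)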
558 */ 559 type = choose_vector_type(NULL, vece, oprsz, 560 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 561 && (in_64 == NULL || vece == MO_64))); 562 if (type != 0) { 563 TCGv_vec t_vec = tcg_temp_new_vec(type); 564 565 if (in_32) { 566 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 567 } else if (in_64) { 568 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 569 } else { 570 tcg_gen_dupi_vec(vece, t_vec, in_c); 571 } 572 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 573 tcg_temp_free_vec(t_vec); 574 return; 575 } 576 577 /* Otherwise, inline with an integer type, unless "large". */ 578 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 579 t_64 = NULL; 580 t_32 = NULL; 581 582 if (in_32) { 583 /* We are given a 32-bit variable input. For a 64-bit host, 584 use a 64-bit operation unless the 32-bit operation would 585 be simple enough. */ 586 if (TCG_TARGET_REG_BITS == 64 587 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 588 t_64 = tcg_temp_new_i64(); 589 tcg_gen_extu_i32_i64(t_64, in_32); 590 gen_dup_i64(vece, t_64, t_64); 591 } else { 592 t_32 = tcg_temp_new_i32(); 593 gen_dup_i32(vece, t_32, in_32); 594 } 595 } else if (in_64) { 596 /* We are given a 64-bit variable input. */ 597 t_64 = tcg_temp_new_i64(); 598 gen_dup_i64(vece, t_64, in_64); 599 } else { 600 /* We are given a constant input. */ 601 /* For 64-bit hosts, use 64-bit constants for "simple" constants 602 or when we'd need too many 32-bit stores, or when a 64-bit 603 constant is really required. */ 604 if (vece == MO_64 605 || (TCG_TARGET_REG_BITS == 64 606 && (in_c == 0 || in_c == -1 607 || !check_size_impl(oprsz, 4)))) { 608 t_64 = tcg_const_i64(in_c); 609 } else { 610 t_32 = tcg_const_i32(in_c); 611 } 612 } 613 614 /* Implement inline if we picked an implementation size above. */ 615 if (t_32) { 616 for (i = 0; i < oprsz; i += 4) { 617 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 618 } 619 tcg_temp_free_i32(t_32); 620 goto done; 621 } 622 if (t_64) { 623 for (i = 0; i < oprsz; i += 8) { 624 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 625 } 626 tcg_temp_free_i64(t_64); 627 goto done; 628 } 629 } 630 631 /* Otherwise implement out of line. */ 632 t_ptr = tcg_temp_new_ptr(); 633 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 634 635 /* 636 * This may be expand_clr for the tail of an operation, e.g. 637 * oprsz == 8 && maxsz == 64. The size of the clear is misaligned 638 * wrt simd_desc and will assert. Simply pass all replicated byte 639 * stores through to memset. 
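 * (E.g. clearing a 64-byte register when no inline path applies becomes,
 * at run time, the equivalent of memset(env + dofs, 0, 64).)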
640 */ 641 if (oprsz == maxsz && vece == MO_8) { 642 TCGv_ptr t_size = tcg_const_ptr(oprsz); 643 TCGv_i32 t_val; 644 645 if (in_32) { 646 t_val = in_32; 647 } else if (in_64) { 648 t_val = tcg_temp_new_i32(); 649 tcg_gen_extrl_i64_i32(t_val, in_64); 650 } else { 651 t_val = tcg_const_i32(in_c); 652 } 653 gen_helper_memset(t_ptr, t_ptr, t_val, t_size); 654 655 if (!in_32) { 656 tcg_temp_free_i32(t_val); 657 } 658 tcg_temp_free_ptr(t_size); 659 tcg_temp_free_ptr(t_ptr); 660 return; 661 } 662 663 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 664 665 if (vece == MO_64) { 666 if (in_64) { 667 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 668 } else { 669 t_64 = tcg_const_i64(in_c); 670 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 671 tcg_temp_free_i64(t_64); 672 } 673 } else { 674 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 675 static dup_fn * const fns[3] = { 676 gen_helper_gvec_dup8, 677 gen_helper_gvec_dup16, 678 gen_helper_gvec_dup32 679 }; 680 681 if (in_32) { 682 fns[vece](t_ptr, t_desc, in_32); 683 } else { 684 t_32 = tcg_temp_new_i32(); 685 if (in_64) { 686 tcg_gen_extrl_i64_i32(t_32, in_64); 687 } else if (vece == MO_8) { 688 tcg_gen_movi_i32(t_32, in_c & 0xff); 689 } else if (vece == MO_16) { 690 tcg_gen_movi_i32(t_32, in_c & 0xffff); 691 } else { 692 tcg_gen_movi_i32(t_32, in_c); 693 } 694 fns[vece](t_ptr, t_desc, t_32); 695 tcg_temp_free_i32(t_32); 696 } 697 } 698 699 tcg_temp_free_ptr(t_ptr); 700 tcg_temp_free_i32(t_desc); 701 return; 702 703 done: 704 if (oprsz < maxsz) { 705 expand_clr(dofs + oprsz, maxsz - oprsz); 706 } 707 } 708 709 /* Likewise, but with zero. */ 710 static void expand_clr(uint32_t dofs, uint32_t maxsz) 711 { 712 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 713 } 714 715 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ 716 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 717 bool load_dest, void (*fni)(TCGv_i32, TCGv_i32)) 718 { 719 TCGv_i32 t0 = tcg_temp_new_i32(); 720 TCGv_i32 t1 = tcg_temp_new_i32(); 721 uint32_t i; 722 723 for (i = 0; i < oprsz; i += 4) { 724 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 725 if (load_dest) { 726 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 727 } 728 fni(t1, t0); 729 tcg_gen_st_i32(t1, cpu_env, dofs + i); 730 } 731 tcg_temp_free_i32(t0); 732 tcg_temp_free_i32(t1); 733 } 734 735 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 736 int32_t c, bool load_dest, 737 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 738 { 739 TCGv_i32 t0 = tcg_temp_new_i32(); 740 TCGv_i32 t1 = tcg_temp_new_i32(); 741 uint32_t i; 742 743 for (i = 0; i < oprsz; i += 4) { 744 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 745 if (load_dest) { 746 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 747 } 748 fni(t1, t0, c); 749 tcg_gen_st_i32(t1, cpu_env, dofs + i); 750 } 751 tcg_temp_free_i32(t0); 752 tcg_temp_free_i32(t1); 753 } 754 755 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 756 TCGv_i32 c, bool scalar_first, 757 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 758 { 759 TCGv_i32 t0 = tcg_temp_new_i32(); 760 TCGv_i32 t1 = tcg_temp_new_i32(); 761 uint32_t i; 762 763 for (i = 0; i < oprsz; i += 4) { 764 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 765 if (scalar_first) { 766 fni(t1, c, t0); 767 } else { 768 fni(t1, t0, c); 769 } 770 tcg_gen_st_i32(t1, cpu_env, dofs + i); 771 } 772 tcg_temp_free_i32(t0); 773 tcg_temp_free_i32(t1); 774 } 775 776 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
*/ 777 static void expand_3_i32(uint32_t dofs, uint32_t aofs, 778 uint32_t bofs, uint32_t oprsz, bool load_dest, 779 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 780 { 781 TCGv_i32 t0 = tcg_temp_new_i32(); 782 TCGv_i32 t1 = tcg_temp_new_i32(); 783 TCGv_i32 t2 = tcg_temp_new_i32(); 784 uint32_t i; 785 786 for (i = 0; i < oprsz; i += 4) { 787 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 788 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 789 if (load_dest) { 790 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 791 } 792 fni(t2, t0, t1); 793 tcg_gen_st_i32(t2, cpu_env, dofs + i); 794 } 795 tcg_temp_free_i32(t2); 796 tcg_temp_free_i32(t1); 797 tcg_temp_free_i32(t0); 798 } 799 800 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 801 uint32_t oprsz, int32_t c, bool load_dest, 802 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t)) 803 { 804 TCGv_i32 t0 = tcg_temp_new_i32(); 805 TCGv_i32 t1 = tcg_temp_new_i32(); 806 TCGv_i32 t2 = tcg_temp_new_i32(); 807 uint32_t i; 808 809 for (i = 0; i < oprsz; i += 4) { 810 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 811 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 812 if (load_dest) { 813 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 814 } 815 fni(t2, t0, t1, c); 816 tcg_gen_st_i32(t2, cpu_env, dofs + i); 817 } 818 tcg_temp_free_i32(t0); 819 tcg_temp_free_i32(t1); 820 tcg_temp_free_i32(t2); 821 } 822 823 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 824 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 825 uint32_t cofs, uint32_t oprsz, bool write_aofs, 826 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) 827 { 828 TCGv_i32 t0 = tcg_temp_new_i32(); 829 TCGv_i32 t1 = tcg_temp_new_i32(); 830 TCGv_i32 t2 = tcg_temp_new_i32(); 831 TCGv_i32 t3 = tcg_temp_new_i32(); 832 uint32_t i; 833 834 for (i = 0; i < oprsz; i += 4) { 835 tcg_gen_ld_i32(t1, cpu_env, aofs + i); 836 tcg_gen_ld_i32(t2, cpu_env, bofs + i); 837 tcg_gen_ld_i32(t3, cpu_env, cofs + i); 838 fni(t0, t1, t2, t3); 839 tcg_gen_st_i32(t0, cpu_env, dofs + i); 840 if (write_aofs) { 841 tcg_gen_st_i32(t1, cpu_env, aofs + i); 842 } 843 } 844 tcg_temp_free_i32(t3); 845 tcg_temp_free_i32(t2); 846 tcg_temp_free_i32(t1); 847 tcg_temp_free_i32(t0); 848 } 849 850 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. 
*/ 851 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 852 bool load_dest, void (*fni)(TCGv_i64, TCGv_i64)) 853 { 854 TCGv_i64 t0 = tcg_temp_new_i64(); 855 TCGv_i64 t1 = tcg_temp_new_i64(); 856 uint32_t i; 857 858 for (i = 0; i < oprsz; i += 8) { 859 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 860 if (load_dest) { 861 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 862 } 863 fni(t1, t0); 864 tcg_gen_st_i64(t1, cpu_env, dofs + i); 865 } 866 tcg_temp_free_i64(t0); 867 tcg_temp_free_i64(t1); 868 } 869 870 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 871 int64_t c, bool load_dest, 872 void (*fni)(TCGv_i64, TCGv_i64, int64_t)) 873 { 874 TCGv_i64 t0 = tcg_temp_new_i64(); 875 TCGv_i64 t1 = tcg_temp_new_i64(); 876 uint32_t i; 877 878 for (i = 0; i < oprsz; i += 8) { 879 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 880 if (load_dest) { 881 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 882 } 883 fni(t1, t0, c); 884 tcg_gen_st_i64(t1, cpu_env, dofs + i); 885 } 886 tcg_temp_free_i64(t0); 887 tcg_temp_free_i64(t1); 888 } 889 890 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 891 TCGv_i64 c, bool scalar_first, 892 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 893 { 894 TCGv_i64 t0 = tcg_temp_new_i64(); 895 TCGv_i64 t1 = tcg_temp_new_i64(); 896 uint32_t i; 897 898 for (i = 0; i < oprsz; i += 8) { 899 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 900 if (scalar_first) { 901 fni(t1, c, t0); 902 } else { 903 fni(t1, t0, c); 904 } 905 tcg_gen_st_i64(t1, cpu_env, dofs + i); 906 } 907 tcg_temp_free_i64(t0); 908 tcg_temp_free_i64(t1); 909 } 910 911 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ 912 static void expand_3_i64(uint32_t dofs, uint32_t aofs, 913 uint32_t bofs, uint32_t oprsz, bool load_dest, 914 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 915 { 916 TCGv_i64 t0 = tcg_temp_new_i64(); 917 TCGv_i64 t1 = tcg_temp_new_i64(); 918 TCGv_i64 t2 = tcg_temp_new_i64(); 919 uint32_t i; 920 921 for (i = 0; i < oprsz; i += 8) { 922 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 923 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 924 if (load_dest) { 925 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 926 } 927 fni(t2, t0, t1); 928 tcg_gen_st_i64(t2, cpu_env, dofs + i); 929 } 930 tcg_temp_free_i64(t2); 931 tcg_temp_free_i64(t1); 932 tcg_temp_free_i64(t0); 933 } 934 935 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 936 uint32_t oprsz, int64_t c, bool load_dest, 937 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t)) 938 { 939 TCGv_i64 t0 = tcg_temp_new_i64(); 940 TCGv_i64 t1 = tcg_temp_new_i64(); 941 TCGv_i64 t2 = tcg_temp_new_i64(); 942 uint32_t i; 943 944 for (i = 0; i < oprsz; i += 8) { 945 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 946 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 947 if (load_dest) { 948 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 949 } 950 fni(t2, t0, t1, c); 951 tcg_gen_st_i64(t2, cpu_env, dofs + i); 952 } 953 tcg_temp_free_i64(t0); 954 tcg_temp_free_i64(t1); 955 tcg_temp_free_i64(t2); 956 } 957 958 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. 
*/ 959 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 960 uint32_t cofs, uint32_t oprsz, bool write_aofs, 961 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 962 { 963 TCGv_i64 t0 = tcg_temp_new_i64(); 964 TCGv_i64 t1 = tcg_temp_new_i64(); 965 TCGv_i64 t2 = tcg_temp_new_i64(); 966 TCGv_i64 t3 = tcg_temp_new_i64(); 967 uint32_t i; 968 969 for (i = 0; i < oprsz; i += 8) { 970 tcg_gen_ld_i64(t1, cpu_env, aofs + i); 971 tcg_gen_ld_i64(t2, cpu_env, bofs + i); 972 tcg_gen_ld_i64(t3, cpu_env, cofs + i); 973 fni(t0, t1, t2, t3); 974 tcg_gen_st_i64(t0, cpu_env, dofs + i); 975 if (write_aofs) { 976 tcg_gen_st_i64(t1, cpu_env, aofs + i); 977 } 978 } 979 tcg_temp_free_i64(t3); 980 tcg_temp_free_i64(t2); 981 tcg_temp_free_i64(t1); 982 tcg_temp_free_i64(t0); 983 } 984 985 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */ 986 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 987 uint32_t oprsz, uint32_t tysz, TCGType type, 988 bool load_dest, 989 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 990 { 991 TCGv_vec t0 = tcg_temp_new_vec(type); 992 TCGv_vec t1 = tcg_temp_new_vec(type); 993 uint32_t i; 994 995 for (i = 0; i < oprsz; i += tysz) { 996 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 997 if (load_dest) { 998 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 999 } 1000 fni(vece, t1, t0); 1001 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1002 } 1003 tcg_temp_free_vec(t0); 1004 tcg_temp_free_vec(t1); 1005 } 1006 1007 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 1008 using host vectors. */ 1009 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1010 uint32_t oprsz, uint32_t tysz, TCGType type, 1011 int64_t c, bool load_dest, 1012 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 1013 { 1014 TCGv_vec t0 = tcg_temp_new_vec(type); 1015 TCGv_vec t1 = tcg_temp_new_vec(type); 1016 uint32_t i; 1017 1018 for (i = 0; i < oprsz; i += tysz) { 1019 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1020 if (load_dest) { 1021 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 1022 } 1023 fni(vece, t1, t0, c); 1024 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1025 } 1026 tcg_temp_free_vec(t0); 1027 tcg_temp_free_vec(t1); 1028 } 1029 1030 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1031 uint32_t oprsz, uint32_t tysz, TCGType type, 1032 TCGv_vec c, bool scalar_first, 1033 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1034 { 1035 TCGv_vec t0 = tcg_temp_new_vec(type); 1036 TCGv_vec t1 = tcg_temp_new_vec(type); 1037 uint32_t i; 1038 1039 for (i = 0; i < oprsz; i += tysz) { 1040 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1041 if (scalar_first) { 1042 fni(vece, t1, c, t0); 1043 } else { 1044 fni(vece, t1, t0, c); 1045 } 1046 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1047 } 1048 tcg_temp_free_vec(t0); 1049 tcg_temp_free_vec(t1); 1050 } 1051 1052 /* Expand OPSZ bytes worth of three-operand operations using host vectors. 
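   TYSZ is the byte width of the host vector TYPE; the loop steps through
   the register TYSZ bytes at a time.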
*/ 1053 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1054 uint32_t bofs, uint32_t oprsz, 1055 uint32_t tysz, TCGType type, bool load_dest, 1056 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1057 { 1058 TCGv_vec t0 = tcg_temp_new_vec(type); 1059 TCGv_vec t1 = tcg_temp_new_vec(type); 1060 TCGv_vec t2 = tcg_temp_new_vec(type); 1061 uint32_t i; 1062 1063 for (i = 0; i < oprsz; i += tysz) { 1064 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1065 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1066 if (load_dest) { 1067 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1068 } 1069 fni(vece, t2, t0, t1); 1070 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1071 } 1072 tcg_temp_free_vec(t2); 1073 tcg_temp_free_vec(t1); 1074 tcg_temp_free_vec(t0); 1075 } 1076 1077 /* 1078 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1079 * using host vectors. 1080 */ 1081 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1082 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1083 TCGType type, int64_t c, bool load_dest, 1084 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1085 int64_t)) 1086 { 1087 TCGv_vec t0 = tcg_temp_new_vec(type); 1088 TCGv_vec t1 = tcg_temp_new_vec(type); 1089 TCGv_vec t2 = tcg_temp_new_vec(type); 1090 uint32_t i; 1091 1092 for (i = 0; i < oprsz; i += tysz) { 1093 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1094 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1095 if (load_dest) { 1096 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1097 } 1098 fni(vece, t2, t0, t1, c); 1099 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1100 } 1101 tcg_temp_free_vec(t0); 1102 tcg_temp_free_vec(t1); 1103 tcg_temp_free_vec(t2); 1104 } 1105 1106 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */ 1107 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1108 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1109 uint32_t tysz, TCGType type, bool write_aofs, 1110 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1111 TCGv_vec, TCGv_vec)) 1112 { 1113 TCGv_vec t0 = tcg_temp_new_vec(type); 1114 TCGv_vec t1 = tcg_temp_new_vec(type); 1115 TCGv_vec t2 = tcg_temp_new_vec(type); 1116 TCGv_vec t3 = tcg_temp_new_vec(type); 1117 uint32_t i; 1118 1119 for (i = 0; i < oprsz; i += tysz) { 1120 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1121 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1122 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1123 fni(vece, t0, t1, t2, t3); 1124 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1125 if (write_aofs) { 1126 tcg_gen_st_vec(t1, cpu_env, aofs + i); 1127 } 1128 } 1129 tcg_temp_free_vec(t3); 1130 tcg_temp_free_vec(t2); 1131 tcg_temp_free_vec(t1); 1132 tcg_temp_free_vec(t0); 1133 } 1134 1135 /* Expand a vector two-operand operation. */ 1136 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1137 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1138 { 1139 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1140 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1141 TCGType type; 1142 uint32_t some; 1143 1144 check_size_align(oprsz, maxsz, dofs | aofs); 1145 check_overlap_2(dofs, aofs, maxsz); 1146 1147 type = 0; 1148 if (g->fniv) { 1149 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1150 } 1151 switch (type) { 1152 case TCG_TYPE_V256: 1153 /* Recall that ARM SVE allows vector sizes that are not a 1154 * power of 2, but always a multiple of 16. The intent is 1155 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1156 */ 1157 some = QEMU_ALIGN_DOWN(oprsz, 32); 1158 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1159 g->load_dest, g->fniv); 1160 if (some == oprsz) { 1161 break; 1162 } 1163 dofs += some; 1164 aofs += some; 1165 oprsz -= some; 1166 maxsz -= some; 1167 /* fallthru */ 1168 case TCG_TYPE_V128: 1169 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1170 g->load_dest, g->fniv); 1171 break; 1172 case TCG_TYPE_V64: 1173 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1174 g->load_dest, g->fniv); 1175 break; 1176 1177 case 0: 1178 if (g->fni8 && check_size_impl(oprsz, 8)) { 1179 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8); 1180 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1181 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4); 1182 } else { 1183 assert(g->fno != NULL); 1184 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1185 oprsz = maxsz; 1186 } 1187 break; 1188 1189 default: 1190 g_assert_not_reached(); 1191 } 1192 tcg_swap_vecop_list(hold_list); 1193 1194 if (oprsz < maxsz) { 1195 expand_clr(dofs + oprsz, maxsz - oprsz); 1196 } 1197 } 1198 1199 /* Expand a vector operation with two vectors and an immediate. */ 1200 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1201 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1202 { 1203 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1204 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1205 TCGType type; 1206 uint32_t some; 1207 1208 check_size_align(oprsz, maxsz, dofs | aofs); 1209 check_overlap_2(dofs, aofs, maxsz); 1210 1211 type = 0; 1212 if (g->fniv) { 1213 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1214 } 1215 switch (type) { 1216 case TCG_TYPE_V256: 1217 /* Recall that ARM SVE allows vector sizes that are not a 1218 * power of 2, but always a multiple of 16. The intent is 1219 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1220 */ 1221 some = QEMU_ALIGN_DOWN(oprsz, 32); 1222 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1223 c, g->load_dest, g->fniv); 1224 if (some == oprsz) { 1225 break; 1226 } 1227 dofs += some; 1228 aofs += some; 1229 oprsz -= some; 1230 maxsz -= some; 1231 /* fallthru */ 1232 case TCG_TYPE_V128: 1233 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1234 c, g->load_dest, g->fniv); 1235 break; 1236 case TCG_TYPE_V64: 1237 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1238 c, g->load_dest, g->fniv); 1239 break; 1240 1241 case 0: 1242 if (g->fni8 && check_size_impl(oprsz, 8)) { 1243 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1244 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1245 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1246 } else { 1247 if (g->fno) { 1248 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1249 } else { 1250 TCGv_i64 tcg_c = tcg_const_i64(c); 1251 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1252 maxsz, c, g->fnoi); 1253 tcg_temp_free_i64(tcg_c); 1254 } 1255 oprsz = maxsz; 1256 } 1257 break; 1258 1259 default: 1260 g_assert_not_reached(); 1261 } 1262 tcg_swap_vecop_list(hold_list); 1263 1264 if (oprsz < maxsz) { 1265 expand_clr(dofs + oprsz, maxsz - oprsz); 1266 } 1267 } 1268 1269 /* Expand a vector operation with two vectors and a scalar. 
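   The scalar C is first broadcast into a vector or integer temporary
   according to VECE, after which the expansion proceeds element-wise much
   like the two-vector-plus-immediate case above.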
*/ 1270 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1271 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1272 { 1273 TCGType type; 1274 1275 check_size_align(oprsz, maxsz, dofs | aofs); 1276 check_overlap_2(dofs, aofs, maxsz); 1277 1278 type = 0; 1279 if (g->fniv) { 1280 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1281 } 1282 if (type != 0) { 1283 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1284 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1285 TCGv_vec t_vec = tcg_temp_new_vec(type); 1286 uint32_t some; 1287 1288 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1289 1290 switch (type) { 1291 case TCG_TYPE_V256: 1292 /* Recall that ARM SVE allows vector sizes that are not a 1293 * power of 2, but always a multiple of 16. The intent is 1294 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1295 */ 1296 some = QEMU_ALIGN_DOWN(oprsz, 32); 1297 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1298 t_vec, g->scalar_first, g->fniv); 1299 if (some == oprsz) { 1300 break; 1301 } 1302 dofs += some; 1303 aofs += some; 1304 oprsz -= some; 1305 maxsz -= some; 1306 /* fallthru */ 1307 1308 case TCG_TYPE_V128: 1309 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1310 t_vec, g->scalar_first, g->fniv); 1311 break; 1312 1313 case TCG_TYPE_V64: 1314 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1315 t_vec, g->scalar_first, g->fniv); 1316 break; 1317 1318 default: 1319 g_assert_not_reached(); 1320 } 1321 tcg_temp_free_vec(t_vec); 1322 tcg_swap_vecop_list(hold_list); 1323 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1324 TCGv_i64 t64 = tcg_temp_new_i64(); 1325 1326 gen_dup_i64(g->vece, t64, c); 1327 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1328 tcg_temp_free_i64(t64); 1329 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1330 TCGv_i32 t32 = tcg_temp_new_i32(); 1331 1332 tcg_gen_extrl_i64_i32(t32, c); 1333 gen_dup_i32(g->vece, t32, t32); 1334 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1335 tcg_temp_free_i32(t32); 1336 } else { 1337 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1338 return; 1339 } 1340 1341 if (oprsz < maxsz) { 1342 expand_clr(dofs + oprsz, maxsz - oprsz); 1343 } 1344 } 1345 1346 /* Expand a vector three-operand operation. */ 1347 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1348 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1349 { 1350 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1351 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1352 TCGType type; 1353 uint32_t some; 1354 1355 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1356 check_overlap_3(dofs, aofs, bofs, maxsz); 1357 1358 type = 0; 1359 if (g->fniv) { 1360 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1361 } 1362 switch (type) { 1363 case TCG_TYPE_V256: 1364 /* Recall that ARM SVE allows vector sizes that are not a 1365 * power of 2, but always a multiple of 16. The intent is 1366 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1367 */ 1368 some = QEMU_ALIGN_DOWN(oprsz, 32); 1369 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1370 g->load_dest, g->fniv); 1371 if (some == oprsz) { 1372 break; 1373 } 1374 dofs += some; 1375 aofs += some; 1376 bofs += some; 1377 oprsz -= some; 1378 maxsz -= some; 1379 /* fallthru */ 1380 case TCG_TYPE_V128: 1381 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1382 g->load_dest, g->fniv); 1383 break; 1384 case TCG_TYPE_V64: 1385 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1386 g->load_dest, g->fniv); 1387 break; 1388 1389 case 0: 1390 if (g->fni8 && check_size_impl(oprsz, 8)) { 1391 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1392 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1393 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1394 } else { 1395 assert(g->fno != NULL); 1396 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1397 maxsz, g->data, g->fno); 1398 oprsz = maxsz; 1399 } 1400 break; 1401 1402 default: 1403 g_assert_not_reached(); 1404 } 1405 tcg_swap_vecop_list(hold_list); 1406 1407 if (oprsz < maxsz) { 1408 expand_clr(dofs + oprsz, maxsz - oprsz); 1409 } 1410 } 1411 1412 /* Expand a vector operation with three vectors and an immediate. */ 1413 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1414 uint32_t oprsz, uint32_t maxsz, int64_t c, 1415 const GVecGen3i *g) 1416 { 1417 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1418 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1419 TCGType type; 1420 uint32_t some; 1421 1422 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1423 check_overlap_3(dofs, aofs, bofs, maxsz); 1424 1425 type = 0; 1426 if (g->fniv) { 1427 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1428 } 1429 switch (type) { 1430 case TCG_TYPE_V256: 1431 /* 1432 * Recall that ARM SVE allows vector sizes that are not a 1433 * power of 2, but always a multiple of 16. The intent is 1434 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1435 */ 1436 some = QEMU_ALIGN_DOWN(oprsz, 32); 1437 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1438 c, g->load_dest, g->fniv); 1439 if (some == oprsz) { 1440 break; 1441 } 1442 dofs += some; 1443 aofs += some; 1444 bofs += some; 1445 oprsz -= some; 1446 maxsz -= some; 1447 /* fallthru */ 1448 case TCG_TYPE_V128: 1449 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1450 c, g->load_dest, g->fniv); 1451 break; 1452 case TCG_TYPE_V64: 1453 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1454 c, g->load_dest, g->fniv); 1455 break; 1456 1457 case 0: 1458 if (g->fni8 && check_size_impl(oprsz, 8)) { 1459 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1460 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1461 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1462 } else { 1463 assert(g->fno != NULL); 1464 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1465 oprsz = maxsz; 1466 } 1467 break; 1468 1469 default: 1470 g_assert_not_reached(); 1471 } 1472 tcg_swap_vecop_list(hold_list); 1473 1474 if (oprsz < maxsz) { 1475 expand_clr(dofs + oprsz, maxsz - oprsz); 1476 } 1477 } 1478 1479 /* Expand a vector four-operand operation. */ 1480 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1481 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1482 { 1483 const TCGOpcode *this_list = g->opt_opc ? 
: vecop_list_empty; 1484 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1485 TCGType type; 1486 uint32_t some; 1487 1488 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1489 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1490 1491 type = 0; 1492 if (g->fniv) { 1493 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1494 } 1495 switch (type) { 1496 case TCG_TYPE_V256: 1497 /* Recall that ARM SVE allows vector sizes that are not a 1498 * power of 2, but always a multiple of 16. The intent is 1499 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1500 */ 1501 some = QEMU_ALIGN_DOWN(oprsz, 32); 1502 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1503 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1504 if (some == oprsz) { 1505 break; 1506 } 1507 dofs += some; 1508 aofs += some; 1509 bofs += some; 1510 cofs += some; 1511 oprsz -= some; 1512 maxsz -= some; 1513 /* fallthru */ 1514 case TCG_TYPE_V128: 1515 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1516 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1517 break; 1518 case TCG_TYPE_V64: 1519 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1520 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1521 break; 1522 1523 case 0: 1524 if (g->fni8 && check_size_impl(oprsz, 8)) { 1525 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1526 g->write_aofs, g->fni8); 1527 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1528 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1529 g->write_aofs, g->fni4); 1530 } else { 1531 assert(g->fno != NULL); 1532 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1533 oprsz, maxsz, g->data, g->fno); 1534 oprsz = maxsz; 1535 } 1536 break; 1537 1538 default: 1539 g_assert_not_reached(); 1540 } 1541 tcg_swap_vecop_list(hold_list); 1542 1543 if (oprsz < maxsz) { 1544 expand_clr(dofs + oprsz, maxsz - oprsz); 1545 } 1546 } 1547 1548 /* 1549 * Expand specific vector operations. 
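 * Each tcg_gen_gvec_<op> below fills in a GVecGen descriptor with the
 * per-element-size implementations (fni4/fni8/fniv/fno) and hands it to
 * one of the generic expanders above.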
1550 */ 1551 1552 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1553 { 1554 tcg_gen_mov_vec(a, b); 1555 } 1556 1557 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1558 uint32_t oprsz, uint32_t maxsz) 1559 { 1560 static const GVecGen2 g = { 1561 .fni8 = tcg_gen_mov_i64, 1562 .fniv = vec_mov2, 1563 .fno = gen_helper_gvec_mov, 1564 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1565 }; 1566 if (dofs != aofs) { 1567 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1568 } else { 1569 check_size_align(oprsz, maxsz, dofs); 1570 if (oprsz < maxsz) { 1571 expand_clr(dofs + oprsz, maxsz - oprsz); 1572 } 1573 } 1574 } 1575 1576 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1577 uint32_t maxsz, TCGv_i32 in) 1578 { 1579 check_size_align(oprsz, maxsz, dofs); 1580 tcg_debug_assert(vece <= MO_32); 1581 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1582 } 1583 1584 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1585 uint32_t maxsz, TCGv_i64 in) 1586 { 1587 check_size_align(oprsz, maxsz, dofs); 1588 tcg_debug_assert(vece <= MO_64); 1589 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1590 } 1591 1592 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1593 uint32_t oprsz, uint32_t maxsz) 1594 { 1595 check_size_align(oprsz, maxsz, dofs); 1596 if (vece <= MO_64) { 1597 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1598 if (type != 0) { 1599 TCGv_vec t_vec = tcg_temp_new_vec(type); 1600 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1601 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1602 tcg_temp_free_vec(t_vec); 1603 } else if (vece <= MO_32) { 1604 TCGv_i32 in = tcg_temp_new_i32(); 1605 switch (vece) { 1606 case MO_8: 1607 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1608 break; 1609 case MO_16: 1610 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1611 break; 1612 default: 1613 tcg_gen_ld_i32(in, cpu_env, aofs); 1614 break; 1615 } 1616 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1617 tcg_temp_free_i32(in); 1618 } else { 1619 TCGv_i64 in = tcg_temp_new_i64(); 1620 tcg_gen_ld_i64(in, cpu_env, aofs); 1621 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1622 tcg_temp_free_i64(in); 1623 } 1624 } else if (vece == 4) { 1625 /* 128-bit duplicate. */ 1626 int i; 1627 1628 tcg_debug_assert(oprsz >= 16); 1629 if (TCG_TARGET_HAS_v128) { 1630 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1631 1632 tcg_gen_ld_vec(in, cpu_env, aofs); 1633 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1634 tcg_gen_st_vec(in, cpu_env, dofs + i); 1635 } 1636 tcg_temp_free_vec(in); 1637 } else { 1638 TCGv_i64 in0 = tcg_temp_new_i64(); 1639 TCGv_i64 in1 = tcg_temp_new_i64(); 1640 1641 tcg_gen_ld_i64(in0, cpu_env, aofs); 1642 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1643 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1644 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1645 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1646 } 1647 tcg_temp_free_i64(in0); 1648 tcg_temp_free_i64(in1); 1649 } 1650 if (oprsz < maxsz) { 1651 expand_clr(dofs + oprsz, maxsz - oprsz); 1652 } 1653 } else if (vece == 5) { 1654 /* 256-bit duplicate. 
*/ 1655 int i; 1656 1657 tcg_debug_assert(oprsz >= 32); 1658 tcg_debug_assert(oprsz % 32 == 0); 1659 if (TCG_TARGET_HAS_v256) { 1660 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256); 1661 1662 tcg_gen_ld_vec(in, cpu_env, aofs); 1663 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1664 tcg_gen_st_vec(in, cpu_env, dofs + i); 1665 } 1666 tcg_temp_free_vec(in); 1667 } else if (TCG_TARGET_HAS_v128) { 1668 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128); 1669 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128); 1670 1671 tcg_gen_ld_vec(in0, cpu_env, aofs); 1672 tcg_gen_ld_vec(in1, cpu_env, aofs + 16); 1673 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1674 tcg_gen_st_vec(in0, cpu_env, dofs + i); 1675 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16); 1676 } 1677 tcg_temp_free_vec(in0); 1678 tcg_temp_free_vec(in1); 1679 } else { 1680 TCGv_i64 in[4]; 1681 int j; 1682 1683 for (j = 0; j < 4; ++j) { 1684 in[j] = tcg_temp_new_i64(); 1685 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8); 1686 } 1687 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1688 for (j = 0; j < 4; ++j) { 1689 tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8); 1690 } 1691 } 1692 for (j = 0; j < 4; ++j) { 1693 tcg_temp_free_i64(in[j]); 1694 } 1695 } 1696 if (oprsz < maxsz) { 1697 expand_clr(dofs + oprsz, maxsz - oprsz); 1698 } 1699 } else { 1700 g_assert_not_reached(); 1701 } 1702 } 1703 1704 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1705 uint32_t maxsz, uint64_t x) 1706 { 1707 check_size_align(oprsz, maxsz, dofs); 1708 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x); 1709 } 1710 1711 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1712 uint32_t oprsz, uint32_t maxsz) 1713 { 1714 static const GVecGen2 g = { 1715 .fni8 = tcg_gen_not_i64, 1716 .fniv = tcg_gen_not_vec, 1717 .fno = gen_helper_gvec_not, 1718 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1719 }; 1720 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1721 } 1722 1723 /* Perform a vector addition using normal addition and a mask. The mask 1724 should be the sign bit of each lane. This 6-operation form is more 1725 efficient than separate additions when there are 4 or more lanes in 1726 the 64-bit operation. 
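   A sketch of why this works: with M holding the sign bit of every lane,
   (A & ~M) + (B & ~M) adds all lanes at once while guaranteeing that no
   carry crosses a lane boundary; the addition leaves the carry into each
   sign position in place, and xoring in ((A ^ B) & M) then restores the
   correct sign bit of every lane.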
*/ 1727 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1728 { 1729 TCGv_i64 t1 = tcg_temp_new_i64(); 1730 TCGv_i64 t2 = tcg_temp_new_i64(); 1731 TCGv_i64 t3 = tcg_temp_new_i64(); 1732 1733 tcg_gen_andc_i64(t1, a, m); 1734 tcg_gen_andc_i64(t2, b, m); 1735 tcg_gen_xor_i64(t3, a, b); 1736 tcg_gen_add_i64(d, t1, t2); 1737 tcg_gen_and_i64(t3, t3, m); 1738 tcg_gen_xor_i64(d, d, t3); 1739 1740 tcg_temp_free_i64(t1); 1741 tcg_temp_free_i64(t2); 1742 tcg_temp_free_i64(t3); 1743 } 1744 1745 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1746 { 1747 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1748 gen_addv_mask(d, a, b, m); 1749 tcg_temp_free_i64(m); 1750 } 1751 1752 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1753 { 1754 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1755 gen_addv_mask(d, a, b, m); 1756 tcg_temp_free_i64(m); 1757 } 1758 1759 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1760 { 1761 TCGv_i64 t1 = tcg_temp_new_i64(); 1762 TCGv_i64 t2 = tcg_temp_new_i64(); 1763 1764 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1765 tcg_gen_add_i64(t2, a, b); 1766 tcg_gen_add_i64(t1, t1, b); 1767 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1768 1769 tcg_temp_free_i64(t1); 1770 tcg_temp_free_i64(t2); 1771 } 1772 1773 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1774 1775 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1776 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1777 { 1778 static const GVecGen3 g[4] = { 1779 { .fni8 = tcg_gen_vec_add8_i64, 1780 .fniv = tcg_gen_add_vec, 1781 .fno = gen_helper_gvec_add8, 1782 .opt_opc = vecop_list_add, 1783 .vece = MO_8 }, 1784 { .fni8 = tcg_gen_vec_add16_i64, 1785 .fniv = tcg_gen_add_vec, 1786 .fno = gen_helper_gvec_add16, 1787 .opt_opc = vecop_list_add, 1788 .vece = MO_16 }, 1789 { .fni4 = tcg_gen_add_i32, 1790 .fniv = tcg_gen_add_vec, 1791 .fno = gen_helper_gvec_add32, 1792 .opt_opc = vecop_list_add, 1793 .vece = MO_32 }, 1794 { .fni8 = tcg_gen_add_i64, 1795 .fniv = tcg_gen_add_vec, 1796 .fno = gen_helper_gvec_add64, 1797 .opt_opc = vecop_list_add, 1798 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1799 .vece = MO_64 }, 1800 }; 1801 1802 tcg_debug_assert(vece <= MO_64); 1803 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1804 } 1805 1806 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1807 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1808 { 1809 static const GVecGen2s g[4] = { 1810 { .fni8 = tcg_gen_vec_add8_i64, 1811 .fniv = tcg_gen_add_vec, 1812 .fno = gen_helper_gvec_adds8, 1813 .opt_opc = vecop_list_add, 1814 .vece = MO_8 }, 1815 { .fni8 = tcg_gen_vec_add16_i64, 1816 .fniv = tcg_gen_add_vec, 1817 .fno = gen_helper_gvec_adds16, 1818 .opt_opc = vecop_list_add, 1819 .vece = MO_16 }, 1820 { .fni4 = tcg_gen_add_i32, 1821 .fniv = tcg_gen_add_vec, 1822 .fno = gen_helper_gvec_adds32, 1823 .opt_opc = vecop_list_add, 1824 .vece = MO_32 }, 1825 { .fni8 = tcg_gen_add_i64, 1826 .fniv = tcg_gen_add_vec, 1827 .fno = gen_helper_gvec_adds64, 1828 .opt_opc = vecop_list_add, 1829 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1830 .vece = MO_64 }, 1831 }; 1832 1833 tcg_debug_assert(vece <= MO_64); 1834 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1835 } 1836 1837 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1838 int64_t c, uint32_t oprsz, uint32_t maxsz) 1839 { 1840 TCGv_i64 tmp = tcg_const_i64(c); 1841 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1842 tcg_temp_free_i64(tmp); 1843 } 1844 1845 
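/*
 * Illustrative only: a target front end typically expands a SIMD add by
 * passing env-relative offsets of its vector registers, e.g.
 *   tcg_gen_gvec_add(MO_32, d_ofs, n_ofs, m_ofs, 16, vec_max);
 * where d_ofs/n_ofs/m_ofs and vec_max are hypothetical per-target values;
 * bytes between oprsz and maxsz in the destination are cleared.
 */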
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 1846 1847 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1848 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1849 { 1850 static const GVecGen2s g[4] = { 1851 { .fni8 = tcg_gen_vec_sub8_i64, 1852 .fniv = tcg_gen_sub_vec, 1853 .fno = gen_helper_gvec_subs8, 1854 .opt_opc = vecop_list_sub, 1855 .vece = MO_8 }, 1856 { .fni8 = tcg_gen_vec_sub16_i64, 1857 .fniv = tcg_gen_sub_vec, 1858 .fno = gen_helper_gvec_subs16, 1859 .opt_opc = vecop_list_sub, 1860 .vece = MO_16 }, 1861 { .fni4 = tcg_gen_sub_i32, 1862 .fniv = tcg_gen_sub_vec, 1863 .fno = gen_helper_gvec_subs32, 1864 .opt_opc = vecop_list_sub, 1865 .vece = MO_32 }, 1866 { .fni8 = tcg_gen_sub_i64, 1867 .fniv = tcg_gen_sub_vec, 1868 .fno = gen_helper_gvec_subs64, 1869 .opt_opc = vecop_list_sub, 1870 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1871 .vece = MO_64 }, 1872 }; 1873 1874 tcg_debug_assert(vece <= MO_64); 1875 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1876 } 1877 1878 /* Perform a vector subtraction using normal subtraction and a mask. 1879 Compare gen_addv_mask above. */ 1880 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1881 { 1882 TCGv_i64 t1 = tcg_temp_new_i64(); 1883 TCGv_i64 t2 = tcg_temp_new_i64(); 1884 TCGv_i64 t3 = tcg_temp_new_i64(); 1885 1886 tcg_gen_or_i64(t1, a, m); 1887 tcg_gen_andc_i64(t2, b, m); 1888 tcg_gen_eqv_i64(t3, a, b); 1889 tcg_gen_sub_i64(d, t1, t2); 1890 tcg_gen_and_i64(t3, t3, m); 1891 tcg_gen_xor_i64(d, d, t3); 1892 1893 tcg_temp_free_i64(t1); 1894 tcg_temp_free_i64(t2); 1895 tcg_temp_free_i64(t3); 1896 } 1897 1898 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1899 { 1900 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1901 gen_subv_mask(d, a, b, m); 1902 tcg_temp_free_i64(m); 1903 } 1904 1905 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1906 { 1907 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1908 gen_subv_mask(d, a, b, m); 1909 tcg_temp_free_i64(m); 1910 } 1911 1912 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1913 { 1914 TCGv_i64 t1 = tcg_temp_new_i64(); 1915 TCGv_i64 t2 = tcg_temp_new_i64(); 1916 1917 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1918 tcg_gen_sub_i64(t2, a, b); 1919 tcg_gen_sub_i64(t1, a, t1); 1920 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1921 1922 tcg_temp_free_i64(t1); 1923 tcg_temp_free_i64(t2); 1924 } 1925 1926 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1927 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1928 { 1929 static const GVecGen3 g[4] = { 1930 { .fni8 = tcg_gen_vec_sub8_i64, 1931 .fniv = tcg_gen_sub_vec, 1932 .fno = gen_helper_gvec_sub8, 1933 .opt_opc = vecop_list_sub, 1934 .vece = MO_8 }, 1935 { .fni8 = tcg_gen_vec_sub16_i64, 1936 .fniv = tcg_gen_sub_vec, 1937 .fno = gen_helper_gvec_sub16, 1938 .opt_opc = vecop_list_sub, 1939 .vece = MO_16 }, 1940 { .fni4 = tcg_gen_sub_i32, 1941 .fniv = tcg_gen_sub_vec, 1942 .fno = gen_helper_gvec_sub32, 1943 .opt_opc = vecop_list_sub, 1944 .vece = MO_32 }, 1945 { .fni8 = tcg_gen_sub_i64, 1946 .fniv = tcg_gen_sub_vec, 1947 .fno = gen_helper_gvec_sub64, 1948 .opt_opc = vecop_list_sub, 1949 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1950 .vece = MO_64 }, 1951 }; 1952 1953 tcg_debug_assert(vece <= MO_64); 1954 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1955 } 1956 1957 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 1958 1959 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1960 uint32_t bofs, 
uint32_t oprsz, uint32_t maxsz) 1961 { 1962 static const GVecGen3 g[4] = { 1963 { .fniv = tcg_gen_mul_vec, 1964 .fno = gen_helper_gvec_mul8, 1965 .opt_opc = vecop_list_mul, 1966 .vece = MO_8 }, 1967 { .fniv = tcg_gen_mul_vec, 1968 .fno = gen_helper_gvec_mul16, 1969 .opt_opc = vecop_list_mul, 1970 .vece = MO_16 }, 1971 { .fni4 = tcg_gen_mul_i32, 1972 .fniv = tcg_gen_mul_vec, 1973 .fno = gen_helper_gvec_mul32, 1974 .opt_opc = vecop_list_mul, 1975 .vece = MO_32 }, 1976 { .fni8 = tcg_gen_mul_i64, 1977 .fniv = tcg_gen_mul_vec, 1978 .fno = gen_helper_gvec_mul64, 1979 .opt_opc = vecop_list_mul, 1980 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1981 .vece = MO_64 }, 1982 }; 1983 1984 tcg_debug_assert(vece <= MO_64); 1985 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1986 } 1987 1988 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1989 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1990 { 1991 static const GVecGen2s g[4] = { 1992 { .fniv = tcg_gen_mul_vec, 1993 .fno = gen_helper_gvec_muls8, 1994 .opt_opc = vecop_list_mul, 1995 .vece = MO_8 }, 1996 { .fniv = tcg_gen_mul_vec, 1997 .fno = gen_helper_gvec_muls16, 1998 .opt_opc = vecop_list_mul, 1999 .vece = MO_16 }, 2000 { .fni4 = tcg_gen_mul_i32, 2001 .fniv = tcg_gen_mul_vec, 2002 .fno = gen_helper_gvec_muls32, 2003 .opt_opc = vecop_list_mul, 2004 .vece = MO_32 }, 2005 { .fni8 = tcg_gen_mul_i64, 2006 .fniv = tcg_gen_mul_vec, 2007 .fno = gen_helper_gvec_muls64, 2008 .opt_opc = vecop_list_mul, 2009 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2010 .vece = MO_64 }, 2011 }; 2012 2013 tcg_debug_assert(vece <= MO_64); 2014 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2015 } 2016 2017 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 2018 int64_t c, uint32_t oprsz, uint32_t maxsz) 2019 { 2020 TCGv_i64 tmp = tcg_const_i64(c); 2021 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 2022 tcg_temp_free_i64(tmp); 2023 } 2024 2025 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2026 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2027 { 2028 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 2029 static const GVecGen3 g[4] = { 2030 { .fniv = tcg_gen_ssadd_vec, 2031 .fno = gen_helper_gvec_ssadd8, 2032 .opt_opc = vecop_list, 2033 .vece = MO_8 }, 2034 { .fniv = tcg_gen_ssadd_vec, 2035 .fno = gen_helper_gvec_ssadd16, 2036 .opt_opc = vecop_list, 2037 .vece = MO_16 }, 2038 { .fniv = tcg_gen_ssadd_vec, 2039 .fno = gen_helper_gvec_ssadd32, 2040 .opt_opc = vecop_list, 2041 .vece = MO_32 }, 2042 { .fniv = tcg_gen_ssadd_vec, 2043 .fno = gen_helper_gvec_ssadd64, 2044 .opt_opc = vecop_list, 2045 .vece = MO_64 }, 2046 }; 2047 tcg_debug_assert(vece <= MO_64); 2048 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2049 } 2050 2051 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2052 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2053 { 2054 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2055 static const GVecGen3 g[4] = { 2056 { .fniv = tcg_gen_sssub_vec, 2057 .fno = gen_helper_gvec_sssub8, 2058 .opt_opc = vecop_list, 2059 .vece = MO_8 }, 2060 { .fniv = tcg_gen_sssub_vec, 2061 .fno = gen_helper_gvec_sssub16, 2062 .opt_opc = vecop_list, 2063 .vece = MO_16 }, 2064 { .fniv = tcg_gen_sssub_vec, 2065 .fno = gen_helper_gvec_sssub32, 2066 .opt_opc = vecop_list, 2067 .vece = MO_32 }, 2068 { .fniv = tcg_gen_sssub_vec, 2069 .fno = gen_helper_gvec_sssub64, 2070 .opt_opc = vecop_list, 2071 .vece = MO_64 }, 2072 }; 2073 tcg_debug_assert(vece <= MO_64); 2074 
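    /*
     * Note that, unlike the add/sub expansions earlier in this file, the
     * saturating tables (ssadd and sssub) provide no .fni4/.fni8 integer
     * fallback, so when the backend lacks the saturating vector opcodes
     * the expansion falls back directly to the out-of-line helpers.
     */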
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2075 } 2076 2077 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2078 { 2079 TCGv_i32 max = tcg_const_i32(-1); 2080 tcg_gen_add_i32(d, a, b); 2081 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2082 tcg_temp_free_i32(max); 2083 } 2084 2085 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2086 { 2087 TCGv_i64 max = tcg_const_i64(-1); 2088 tcg_gen_add_i64(d, a, b); 2089 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2090 tcg_temp_free_i64(max); 2091 } 2092 2093 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2094 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2095 { 2096 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2097 static const GVecGen3 g[4] = { 2098 { .fniv = tcg_gen_usadd_vec, 2099 .fno = gen_helper_gvec_usadd8, 2100 .opt_opc = vecop_list, 2101 .vece = MO_8 }, 2102 { .fniv = tcg_gen_usadd_vec, 2103 .fno = gen_helper_gvec_usadd16, 2104 .opt_opc = vecop_list, 2105 .vece = MO_16 }, 2106 { .fni4 = tcg_gen_usadd_i32, 2107 .fniv = tcg_gen_usadd_vec, 2108 .fno = gen_helper_gvec_usadd32, 2109 .opt_opc = vecop_list, 2110 .vece = MO_32 }, 2111 { .fni8 = tcg_gen_usadd_i64, 2112 .fniv = tcg_gen_usadd_vec, 2113 .fno = gen_helper_gvec_usadd64, 2114 .opt_opc = vecop_list, 2115 .vece = MO_64 } 2116 }; 2117 tcg_debug_assert(vece <= MO_64); 2118 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2119 } 2120 2121 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2122 { 2123 TCGv_i32 min = tcg_const_i32(0); 2124 tcg_gen_sub_i32(d, a, b); 2125 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2126 tcg_temp_free_i32(min); 2127 } 2128 2129 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2130 { 2131 TCGv_i64 min = tcg_const_i64(0); 2132 tcg_gen_sub_i64(d, a, b); 2133 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2134 tcg_temp_free_i64(min); 2135 } 2136 2137 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2138 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2139 { 2140 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2141 static const GVecGen3 g[4] = { 2142 { .fniv = tcg_gen_ussub_vec, 2143 .fno = gen_helper_gvec_ussub8, 2144 .opt_opc = vecop_list, 2145 .vece = MO_8 }, 2146 { .fniv = tcg_gen_ussub_vec, 2147 .fno = gen_helper_gvec_ussub16, 2148 .opt_opc = vecop_list, 2149 .vece = MO_16 }, 2150 { .fni4 = tcg_gen_ussub_i32, 2151 .fniv = tcg_gen_ussub_vec, 2152 .fno = gen_helper_gvec_ussub32, 2153 .opt_opc = vecop_list, 2154 .vece = MO_32 }, 2155 { .fni8 = tcg_gen_ussub_i64, 2156 .fniv = tcg_gen_ussub_vec, 2157 .fno = gen_helper_gvec_ussub64, 2158 .opt_opc = vecop_list, 2159 .vece = MO_64 } 2160 }; 2161 tcg_debug_assert(vece <= MO_64); 2162 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2163 } 2164 2165 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2166 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2167 { 2168 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2169 static const GVecGen3 g[4] = { 2170 { .fniv = tcg_gen_smin_vec, 2171 .fno = gen_helper_gvec_smin8, 2172 .opt_opc = vecop_list, 2173 .vece = MO_8 }, 2174 { .fniv = tcg_gen_smin_vec, 2175 .fno = gen_helper_gvec_smin16, 2176 .opt_opc = vecop_list, 2177 .vece = MO_16 }, 2178 { .fni4 = tcg_gen_smin_i32, 2179 .fniv = tcg_gen_smin_vec, 2180 .fno = gen_helper_gvec_smin32, 2181 .opt_opc = vecop_list, 2182 .vece = MO_32 }, 2183 { .fni8 = tcg_gen_smin_i64, 2184 .fniv = tcg_gen_smin_vec, 2185 
.fno = gen_helper_gvec_smin64, 2186 .opt_opc = vecop_list, 2187 .vece = MO_64 } 2188 }; 2189 tcg_debug_assert(vece <= MO_64); 2190 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2191 } 2192 2193 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2194 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2195 { 2196 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2197 static const GVecGen3 g[4] = { 2198 { .fniv = tcg_gen_umin_vec, 2199 .fno = gen_helper_gvec_umin8, 2200 .opt_opc = vecop_list, 2201 .vece = MO_8 }, 2202 { .fniv = tcg_gen_umin_vec, 2203 .fno = gen_helper_gvec_umin16, 2204 .opt_opc = vecop_list, 2205 .vece = MO_16 }, 2206 { .fni4 = tcg_gen_umin_i32, 2207 .fniv = tcg_gen_umin_vec, 2208 .fno = gen_helper_gvec_umin32, 2209 .opt_opc = vecop_list, 2210 .vece = MO_32 }, 2211 { .fni8 = tcg_gen_umin_i64, 2212 .fniv = tcg_gen_umin_vec, 2213 .fno = gen_helper_gvec_umin64, 2214 .opt_opc = vecop_list, 2215 .vece = MO_64 } 2216 }; 2217 tcg_debug_assert(vece <= MO_64); 2218 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2219 } 2220 2221 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2222 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2223 { 2224 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2225 static const GVecGen3 g[4] = { 2226 { .fniv = tcg_gen_smax_vec, 2227 .fno = gen_helper_gvec_smax8, 2228 .opt_opc = vecop_list, 2229 .vece = MO_8 }, 2230 { .fniv = tcg_gen_smax_vec, 2231 .fno = gen_helper_gvec_smax16, 2232 .opt_opc = vecop_list, 2233 .vece = MO_16 }, 2234 { .fni4 = tcg_gen_smax_i32, 2235 .fniv = tcg_gen_smax_vec, 2236 .fno = gen_helper_gvec_smax32, 2237 .opt_opc = vecop_list, 2238 .vece = MO_32 }, 2239 { .fni8 = tcg_gen_smax_i64, 2240 .fniv = tcg_gen_smax_vec, 2241 .fno = gen_helper_gvec_smax64, 2242 .opt_opc = vecop_list, 2243 .vece = MO_64 } 2244 }; 2245 tcg_debug_assert(vece <= MO_64); 2246 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2247 } 2248 2249 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2250 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2251 { 2252 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2253 static const GVecGen3 g[4] = { 2254 { .fniv = tcg_gen_umax_vec, 2255 .fno = gen_helper_gvec_umax8, 2256 .opt_opc = vecop_list, 2257 .vece = MO_8 }, 2258 { .fniv = tcg_gen_umax_vec, 2259 .fno = gen_helper_gvec_umax16, 2260 .opt_opc = vecop_list, 2261 .vece = MO_16 }, 2262 { .fni4 = tcg_gen_umax_i32, 2263 .fniv = tcg_gen_umax_vec, 2264 .fno = gen_helper_gvec_umax32, 2265 .opt_opc = vecop_list, 2266 .vece = MO_32 }, 2267 { .fni8 = tcg_gen_umax_i64, 2268 .fniv = tcg_gen_umax_vec, 2269 .fno = gen_helper_gvec_umax64, 2270 .opt_opc = vecop_list, 2271 .vece = MO_64 } 2272 }; 2273 tcg_debug_assert(vece <= MO_64); 2274 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2275 } 2276 2277 /* Perform a vector negation using normal negation and a mask. 2278 Compare gen_subv_mask above. 
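   As a concrete example for one byte lane with m = 0x80 and b = 0x01:
   t3 = m & ~b = 0x80, t2 = b & ~m = 0x01, d = m - t2 = 0x7f, and
   d ^ t3 = 0xff, i.e. -1 as required, with no borrow crossing lanes.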
*/ 2279 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2280 { 2281 TCGv_i64 t2 = tcg_temp_new_i64(); 2282 TCGv_i64 t3 = tcg_temp_new_i64(); 2283 2284 tcg_gen_andc_i64(t3, m, b); 2285 tcg_gen_andc_i64(t2, b, m); 2286 tcg_gen_sub_i64(d, m, t2); 2287 tcg_gen_xor_i64(d, d, t3); 2288 2289 tcg_temp_free_i64(t2); 2290 tcg_temp_free_i64(t3); 2291 } 2292 2293 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2294 { 2295 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2296 gen_negv_mask(d, b, m); 2297 tcg_temp_free_i64(m); 2298 } 2299 2300 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2301 { 2302 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2303 gen_negv_mask(d, b, m); 2304 tcg_temp_free_i64(m); 2305 } 2306 2307 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2308 { 2309 TCGv_i64 t1 = tcg_temp_new_i64(); 2310 TCGv_i64 t2 = tcg_temp_new_i64(); 2311 2312 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2313 tcg_gen_neg_i64(t2, b); 2314 tcg_gen_neg_i64(t1, t1); 2315 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2316 2317 tcg_temp_free_i64(t1); 2318 tcg_temp_free_i64(t2); 2319 } 2320 2321 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2322 uint32_t oprsz, uint32_t maxsz) 2323 { 2324 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2325 static const GVecGen2 g[4] = { 2326 { .fni8 = tcg_gen_vec_neg8_i64, 2327 .fniv = tcg_gen_neg_vec, 2328 .fno = gen_helper_gvec_neg8, 2329 .opt_opc = vecop_list, 2330 .vece = MO_8 }, 2331 { .fni8 = tcg_gen_vec_neg16_i64, 2332 .fniv = tcg_gen_neg_vec, 2333 .fno = gen_helper_gvec_neg16, 2334 .opt_opc = vecop_list, 2335 .vece = MO_16 }, 2336 { .fni4 = tcg_gen_neg_i32, 2337 .fniv = tcg_gen_neg_vec, 2338 .fno = gen_helper_gvec_neg32, 2339 .opt_opc = vecop_list, 2340 .vece = MO_32 }, 2341 { .fni8 = tcg_gen_neg_i64, 2342 .fniv = tcg_gen_neg_vec, 2343 .fno = gen_helper_gvec_neg64, 2344 .opt_opc = vecop_list, 2345 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2346 .vece = MO_64 }, 2347 }; 2348 2349 tcg_debug_assert(vece <= MO_64); 2350 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2351 } 2352 2353 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2354 { 2355 TCGv_i64 t = tcg_temp_new_i64(); 2356 int nbit = 8 << vece; 2357 2358 /* Create -1 for each negative element. */ 2359 tcg_gen_shri_i64(t, b, nbit - 1); 2360 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2361 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2362 2363 /* 2364 * Invert (via xor -1) and add one. 2365 * Because of the ordering the msb is cleared, 2366 * so we never have carry into the next element. 
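     * For example, with vece == MO_8 and a lane holding 0xfe (-2):
     * t = 0x01 after the shift and mask, then 0xff after the multiply;
     * d = 0xfe ^ 0xff = 0x01, and adding the re-isolated 0x01 gives 0x02.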
2367 */ 2368 tcg_gen_xor_i64(d, b, t); 2369 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2370 tcg_gen_add_i64(d, d, t); 2371 2372 tcg_temp_free_i64(t); 2373 } 2374 2375 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2376 { 2377 gen_absv_mask(d, b, MO_8); 2378 } 2379 2380 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2381 { 2382 gen_absv_mask(d, b, MO_16); 2383 } 2384 2385 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2386 uint32_t oprsz, uint32_t maxsz) 2387 { 2388 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2389 static const GVecGen2 g[4] = { 2390 { .fni8 = tcg_gen_vec_abs8_i64, 2391 .fniv = tcg_gen_abs_vec, 2392 .fno = gen_helper_gvec_abs8, 2393 .opt_opc = vecop_list, 2394 .vece = MO_8 }, 2395 { .fni8 = tcg_gen_vec_abs16_i64, 2396 .fniv = tcg_gen_abs_vec, 2397 .fno = gen_helper_gvec_abs16, 2398 .opt_opc = vecop_list, 2399 .vece = MO_16 }, 2400 { .fni4 = tcg_gen_abs_i32, 2401 .fniv = tcg_gen_abs_vec, 2402 .fno = gen_helper_gvec_abs32, 2403 .opt_opc = vecop_list, 2404 .vece = MO_32 }, 2405 { .fni8 = tcg_gen_abs_i64, 2406 .fniv = tcg_gen_abs_vec, 2407 .fno = gen_helper_gvec_abs64, 2408 .opt_opc = vecop_list, 2409 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2410 .vece = MO_64 }, 2411 }; 2412 2413 tcg_debug_assert(vece <= MO_64); 2414 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2415 } 2416 2417 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2418 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2419 { 2420 static const GVecGen3 g = { 2421 .fni8 = tcg_gen_and_i64, 2422 .fniv = tcg_gen_and_vec, 2423 .fno = gen_helper_gvec_and, 2424 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2425 }; 2426 2427 if (aofs == bofs) { 2428 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2429 } else { 2430 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2431 } 2432 } 2433 2434 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2435 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2436 { 2437 static const GVecGen3 g = { 2438 .fni8 = tcg_gen_or_i64, 2439 .fniv = tcg_gen_or_vec, 2440 .fno = gen_helper_gvec_or, 2441 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2442 }; 2443 2444 if (aofs == bofs) { 2445 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2446 } else { 2447 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2448 } 2449 } 2450 2451 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2452 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2453 { 2454 static const GVecGen3 g = { 2455 .fni8 = tcg_gen_xor_i64, 2456 .fniv = tcg_gen_xor_vec, 2457 .fno = gen_helper_gvec_xor, 2458 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2459 }; 2460 2461 if (aofs == bofs) { 2462 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2463 } else { 2464 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2465 } 2466 } 2467 2468 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2469 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2470 { 2471 static const GVecGen3 g = { 2472 .fni8 = tcg_gen_andc_i64, 2473 .fniv = tcg_gen_andc_vec, 2474 .fno = gen_helper_gvec_andc, 2475 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2476 }; 2477 2478 if (aofs == bofs) { 2479 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2480 } else { 2481 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2482 } 2483 } 2484 2485 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2486 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2487 { 2488 static const GVecGen3 g = { 2489 .fni8 = tcg_gen_orc_i64, 2490 .fniv = tcg_gen_orc_vec, 2491 
.fno = gen_helper_gvec_orc, 2492 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2493 }; 2494 2495 if (aofs == bofs) { 2496 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2497 } else { 2498 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2499 } 2500 } 2501 2502 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2503 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2504 { 2505 static const GVecGen3 g = { 2506 .fni8 = tcg_gen_nand_i64, 2507 .fniv = tcg_gen_nand_vec, 2508 .fno = gen_helper_gvec_nand, 2509 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2510 }; 2511 2512 if (aofs == bofs) { 2513 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2514 } else { 2515 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2516 } 2517 } 2518 2519 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2520 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2521 { 2522 static const GVecGen3 g = { 2523 .fni8 = tcg_gen_nor_i64, 2524 .fniv = tcg_gen_nor_vec, 2525 .fno = gen_helper_gvec_nor, 2526 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2527 }; 2528 2529 if (aofs == bofs) { 2530 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2531 } else { 2532 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2533 } 2534 } 2535 2536 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2537 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2538 { 2539 static const GVecGen3 g = { 2540 .fni8 = tcg_gen_eqv_i64, 2541 .fniv = tcg_gen_eqv_vec, 2542 .fno = gen_helper_gvec_eqv, 2543 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2544 }; 2545 2546 if (aofs == bofs) { 2547 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2548 } else { 2549 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2550 } 2551 } 2552 2553 static const GVecGen2s gop_ands = { 2554 .fni8 = tcg_gen_and_i64, 2555 .fniv = tcg_gen_and_vec, 2556 .fno = gen_helper_gvec_ands, 2557 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2558 .vece = MO_64 2559 }; 2560 2561 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2562 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2563 { 2564 TCGv_i64 tmp = tcg_temp_new_i64(); 2565 gen_dup_i64(vece, tmp, c); 2566 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2567 tcg_temp_free_i64(tmp); 2568 } 2569 2570 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2571 int64_t c, uint32_t oprsz, uint32_t maxsz) 2572 { 2573 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2574 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2575 tcg_temp_free_i64(tmp); 2576 } 2577 2578 static const GVecGen2s gop_xors = { 2579 .fni8 = tcg_gen_xor_i64, 2580 .fniv = tcg_gen_xor_vec, 2581 .fno = gen_helper_gvec_xors, 2582 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2583 .vece = MO_64 2584 }; 2585 2586 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2587 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2588 { 2589 TCGv_i64 tmp = tcg_temp_new_i64(); 2590 gen_dup_i64(vece, tmp, c); 2591 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2592 tcg_temp_free_i64(tmp); 2593 } 2594 2595 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2596 int64_t c, uint32_t oprsz, uint32_t maxsz) 2597 { 2598 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2599 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2600 tcg_temp_free_i64(tmp); 2601 } 2602 2603 static const GVecGen2s gop_ors = { 2604 .fni8 = tcg_gen_or_i64, 2605 .fniv = tcg_gen_or_vec, 2606 .fno = gen_helper_gvec_ors, 2607 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2608 .vece = MO_64 2609 }; 2610 2611 
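/*
 * The _s and _i variants here all follow the same pattern: the scalar
 * (or immediate) is first replicated across a 64-bit temporary with
 * gen_dup_i64() or dup_const(), and the replicated value is then handed
 * to the generic two-operand-plus-scalar expander tcg_gen_gvec_2s().
 * A hypothetical front-end use, clearing the high byte of every 16-bit
 * lane of a 16-byte register in place (offset name illustrative only):
 *
 *   tcg_gen_gvec_andi(MO_16, vofs, vofs, 0x00ff, 16, 16);
 */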
void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2612 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2613 { 2614 TCGv_i64 tmp = tcg_temp_new_i64(); 2615 gen_dup_i64(vece, tmp, c); 2616 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2617 tcg_temp_free_i64(tmp); 2618 } 2619 2620 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2621 int64_t c, uint32_t oprsz, uint32_t maxsz) 2622 { 2623 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2624 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2625 tcg_temp_free_i64(tmp); 2626 } 2627 2628 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2629 { 2630 uint64_t mask = dup_const(MO_8, 0xff << c); 2631 tcg_gen_shli_i64(d, a, c); 2632 tcg_gen_andi_i64(d, d, mask); 2633 } 2634 2635 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2636 { 2637 uint64_t mask = dup_const(MO_16, 0xffff << c); 2638 tcg_gen_shli_i64(d, a, c); 2639 tcg_gen_andi_i64(d, d, mask); 2640 } 2641 2642 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2643 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2644 { 2645 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2646 static const GVecGen2i g[4] = { 2647 { .fni8 = tcg_gen_vec_shl8i_i64, 2648 .fniv = tcg_gen_shli_vec, 2649 .fno = gen_helper_gvec_shl8i, 2650 .opt_opc = vecop_list, 2651 .vece = MO_8 }, 2652 { .fni8 = tcg_gen_vec_shl16i_i64, 2653 .fniv = tcg_gen_shli_vec, 2654 .fno = gen_helper_gvec_shl16i, 2655 .opt_opc = vecop_list, 2656 .vece = MO_16 }, 2657 { .fni4 = tcg_gen_shli_i32, 2658 .fniv = tcg_gen_shli_vec, 2659 .fno = gen_helper_gvec_shl32i, 2660 .opt_opc = vecop_list, 2661 .vece = MO_32 }, 2662 { .fni8 = tcg_gen_shli_i64, 2663 .fniv = tcg_gen_shli_vec, 2664 .fno = gen_helper_gvec_shl64i, 2665 .opt_opc = vecop_list, 2666 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2667 .vece = MO_64 }, 2668 }; 2669 2670 tcg_debug_assert(vece <= MO_64); 2671 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2672 if (shift == 0) { 2673 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2674 } else { 2675 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2676 } 2677 } 2678 2679 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2680 { 2681 uint64_t mask = dup_const(MO_8, 0xff >> c); 2682 tcg_gen_shri_i64(d, a, c); 2683 tcg_gen_andi_i64(d, d, mask); 2684 } 2685 2686 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2687 { 2688 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2689 tcg_gen_shri_i64(d, a, c); 2690 tcg_gen_andi_i64(d, d, mask); 2691 } 2692 2693 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2694 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2695 { 2696 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2697 static const GVecGen2i g[4] = { 2698 { .fni8 = tcg_gen_vec_shr8i_i64, 2699 .fniv = tcg_gen_shri_vec, 2700 .fno = gen_helper_gvec_shr8i, 2701 .opt_opc = vecop_list, 2702 .vece = MO_8 }, 2703 { .fni8 = tcg_gen_vec_shr16i_i64, 2704 .fniv = tcg_gen_shri_vec, 2705 .fno = gen_helper_gvec_shr16i, 2706 .opt_opc = vecop_list, 2707 .vece = MO_16 }, 2708 { .fni4 = tcg_gen_shri_i32, 2709 .fniv = tcg_gen_shri_vec, 2710 .fno = gen_helper_gvec_shr32i, 2711 .opt_opc = vecop_list, 2712 .vece = MO_32 }, 2713 { .fni8 = tcg_gen_shri_i64, 2714 .fniv = tcg_gen_shri_vec, 2715 .fno = gen_helper_gvec_shr64i, 2716 .opt_opc = vecop_list, 2717 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2718 .vece = MO_64 }, 2719 }; 2720 2721 tcg_debug_assert(vece <= MO_64); 2722 tcg_debug_assert(shift >= 
0 && shift < (8 << vece)); 2723 if (shift == 0) { 2724 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2725 } else { 2726 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2727 } 2728 } 2729 2730 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2731 { 2732 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2733 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2734 TCGv_i64 s = tcg_temp_new_i64(); 2735 2736 tcg_gen_shri_i64(d, a, c); 2737 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2738 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2739 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2740 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2741 tcg_temp_free_i64(s); 2742 } 2743 2744 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2745 { 2746 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2747 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2748 TCGv_i64 s = tcg_temp_new_i64(); 2749 2750 tcg_gen_shri_i64(d, a, c); 2751 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2752 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2753 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2754 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2755 tcg_temp_free_i64(s); 2756 } 2757 2758 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2759 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2760 { 2761 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2762 static const GVecGen2i g[4] = { 2763 { .fni8 = tcg_gen_vec_sar8i_i64, 2764 .fniv = tcg_gen_sari_vec, 2765 .fno = gen_helper_gvec_sar8i, 2766 .opt_opc = vecop_list, 2767 .vece = MO_8 }, 2768 { .fni8 = tcg_gen_vec_sar16i_i64, 2769 .fniv = tcg_gen_sari_vec, 2770 .fno = gen_helper_gvec_sar16i, 2771 .opt_opc = vecop_list, 2772 .vece = MO_16 }, 2773 { .fni4 = tcg_gen_sari_i32, 2774 .fniv = tcg_gen_sari_vec, 2775 .fno = gen_helper_gvec_sar32i, 2776 .opt_opc = vecop_list, 2777 .vece = MO_32 }, 2778 { .fni8 = tcg_gen_sari_i64, 2779 .fniv = tcg_gen_sari_vec, 2780 .fno = gen_helper_gvec_sar64i, 2781 .opt_opc = vecop_list, 2782 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2783 .vece = MO_64 }, 2784 }; 2785 2786 tcg_debug_assert(vece <= MO_64); 2787 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2788 if (shift == 0) { 2789 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2790 } else { 2791 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2792 } 2793 } 2794 2795 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2796 { 2797 uint64_t mask = dup_const(MO_8, 0xff << c); 2798 2799 tcg_gen_shli_i64(d, a, c); 2800 tcg_gen_shri_i64(a, a, 8 - c); 2801 tcg_gen_andi_i64(d, d, mask); 2802 tcg_gen_andi_i64(a, a, ~mask); 2803 tcg_gen_or_i64(d, d, a); 2804 } 2805 2806 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2807 { 2808 uint64_t mask = dup_const(MO_16, 0xffff << c); 2809 2810 tcg_gen_shli_i64(d, a, c); 2811 tcg_gen_shri_i64(a, a, 16 - c); 2812 tcg_gen_andi_i64(d, d, mask); 2813 tcg_gen_andi_i64(a, a, ~mask); 2814 tcg_gen_or_i64(d, d, a); 2815 } 2816 2817 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 2818 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2819 { 2820 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 2821 static const GVecGen2i g[4] = { 2822 { .fni8 = tcg_gen_vec_rotl8i_i64, 2823 .fniv = tcg_gen_rotli_vec, 2824 .fno = gen_helper_gvec_rotl8i, 2825 .opt_opc = vecop_list, 2826 .vece = MO_8 }, 2827 { .fni8 = 
tcg_gen_vec_rotl16i_i64, 2828 .fniv = tcg_gen_rotli_vec, 2829 .fno = gen_helper_gvec_rotl16i, 2830 .opt_opc = vecop_list, 2831 .vece = MO_16 }, 2832 { .fni4 = tcg_gen_rotli_i32, 2833 .fniv = tcg_gen_rotli_vec, 2834 .fno = gen_helper_gvec_rotl32i, 2835 .opt_opc = vecop_list, 2836 .vece = MO_32 }, 2837 { .fni8 = tcg_gen_rotli_i64, 2838 .fniv = tcg_gen_rotli_vec, 2839 .fno = gen_helper_gvec_rotl64i, 2840 .opt_opc = vecop_list, 2841 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2842 .vece = MO_64 }, 2843 }; 2844 2845 tcg_debug_assert(vece <= MO_64); 2846 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2847 if (shift == 0) { 2848 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2849 } else { 2850 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2851 } 2852 } 2853 2854 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 2855 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2856 { 2857 tcg_debug_assert(vece <= MO_64); 2858 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2859 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 2860 oprsz, maxsz); 2861 } 2862 2863 /* 2864 * Specialized generation vector shifts by a non-constant scalar. 2865 */ 2866 2867 typedef struct { 2868 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2869 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2870 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2871 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2872 gen_helper_gvec_2 *fno[4]; 2873 TCGOpcode s_list[2]; 2874 TCGOpcode v_list[2]; 2875 } GVecGen2sh; 2876 2877 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2878 uint32_t oprsz, uint32_t tysz, TCGType type, 2879 TCGv_i32 shift, 2880 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2881 { 2882 TCGv_vec t0 = tcg_temp_new_vec(type); 2883 uint32_t i; 2884 2885 for (i = 0; i < oprsz; i += tysz) { 2886 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2887 fni(vece, t0, t0, shift); 2888 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2889 } 2890 tcg_temp_free_vec(t0); 2891 } 2892 2893 static void 2894 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2895 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2896 { 2897 TCGType type; 2898 uint32_t some; 2899 2900 check_size_align(oprsz, maxsz, dofs | aofs); 2901 check_overlap_2(dofs, aofs, maxsz); 2902 2903 /* If the backend has a scalar expansion, great. */ 2904 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2905 if (type) { 2906 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2907 switch (type) { 2908 case TCG_TYPE_V256: 2909 some = QEMU_ALIGN_DOWN(oprsz, 32); 2910 expand_2sh_vec(vece, dofs, aofs, some, 32, 2911 TCG_TYPE_V256, shift, g->fniv_s); 2912 if (some == oprsz) { 2913 break; 2914 } 2915 dofs += some; 2916 aofs += some; 2917 oprsz -= some; 2918 maxsz -= some; 2919 /* fallthru */ 2920 case TCG_TYPE_V128: 2921 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2922 TCG_TYPE_V128, shift, g->fniv_s); 2923 break; 2924 case TCG_TYPE_V64: 2925 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2926 TCG_TYPE_V64, shift, g->fniv_s); 2927 break; 2928 default: 2929 g_assert_not_reached(); 2930 } 2931 tcg_swap_vecop_list(hold_list); 2932 goto clear_tail; 2933 } 2934 2935 /* If the backend supports variable vector shifts, also cool. 
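       In that case the scalar shift count is first broadcast to every
       lane of a vector (the dup below) and the ordinary per-element
       variable-shift expansion is reused with that vector.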
*/ 2936 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2937 if (type) { 2938 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2939 TCGv_vec v_shift = tcg_temp_new_vec(type); 2940 2941 if (vece == MO_64) { 2942 TCGv_i64 sh64 = tcg_temp_new_i64(); 2943 tcg_gen_extu_i32_i64(sh64, shift); 2944 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2945 tcg_temp_free_i64(sh64); 2946 } else { 2947 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2948 } 2949 2950 switch (type) { 2951 case TCG_TYPE_V256: 2952 some = QEMU_ALIGN_DOWN(oprsz, 32); 2953 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2954 v_shift, false, g->fniv_v); 2955 if (some == oprsz) { 2956 break; 2957 } 2958 dofs += some; 2959 aofs += some; 2960 oprsz -= some; 2961 maxsz -= some; 2962 /* fallthru */ 2963 case TCG_TYPE_V128: 2964 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2965 v_shift, false, g->fniv_v); 2966 break; 2967 case TCG_TYPE_V64: 2968 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2969 v_shift, false, g->fniv_v); 2970 break; 2971 default: 2972 g_assert_not_reached(); 2973 } 2974 tcg_temp_free_vec(v_shift); 2975 tcg_swap_vecop_list(hold_list); 2976 goto clear_tail; 2977 } 2978 2979 /* Otherwise fall back to integral... */ 2980 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2981 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2982 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2983 TCGv_i64 sh64 = tcg_temp_new_i64(); 2984 tcg_gen_extu_i32_i64(sh64, shift); 2985 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2986 tcg_temp_free_i64(sh64); 2987 } else { 2988 TCGv_ptr a0 = tcg_temp_new_ptr(); 2989 TCGv_ptr a1 = tcg_temp_new_ptr(); 2990 TCGv_i32 desc = tcg_temp_new_i32(); 2991 2992 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2993 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2994 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2995 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2996 2997 g->fno[vece](a0, a1, desc); 2998 2999 tcg_temp_free_ptr(a0); 3000 tcg_temp_free_ptr(a1); 3001 tcg_temp_free_i32(desc); 3002 return; 3003 } 3004 3005 clear_tail: 3006 if (oprsz < maxsz) { 3007 expand_clr(dofs + oprsz, maxsz - oprsz); 3008 } 3009 } 3010 3011 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 3012 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3013 { 3014 static const GVecGen2sh g = { 3015 .fni4 = tcg_gen_shl_i32, 3016 .fni8 = tcg_gen_shl_i64, 3017 .fniv_s = tcg_gen_shls_vec, 3018 .fniv_v = tcg_gen_shlv_vec, 3019 .fno = { 3020 gen_helper_gvec_shl8i, 3021 gen_helper_gvec_shl16i, 3022 gen_helper_gvec_shl32i, 3023 gen_helper_gvec_shl64i, 3024 }, 3025 .s_list = { INDEX_op_shls_vec, 0 }, 3026 .v_list = { INDEX_op_shlv_vec, 0 }, 3027 }; 3028 3029 tcg_debug_assert(vece <= MO_64); 3030 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3031 } 3032 3033 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3034 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3035 { 3036 static const GVecGen2sh g = { 3037 .fni4 = tcg_gen_shr_i32, 3038 .fni8 = tcg_gen_shr_i64, 3039 .fniv_s = tcg_gen_shrs_vec, 3040 .fniv_v = tcg_gen_shrv_vec, 3041 .fno = { 3042 gen_helper_gvec_shr8i, 3043 gen_helper_gvec_shr16i, 3044 gen_helper_gvec_shr32i, 3045 gen_helper_gvec_shr64i, 3046 }, 3047 .s_list = { INDEX_op_shrs_vec, 0 }, 3048 .v_list = { INDEX_op_shrv_vec, 0 }, 3049 }; 3050 3051 tcg_debug_assert(vece <= MO_64); 3052 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3053 } 3054 3055 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 
3056 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3057 { 3058 static const GVecGen2sh g = { 3059 .fni4 = tcg_gen_sar_i32, 3060 .fni8 = tcg_gen_sar_i64, 3061 .fniv_s = tcg_gen_sars_vec, 3062 .fniv_v = tcg_gen_sarv_vec, 3063 .fno = { 3064 gen_helper_gvec_sar8i, 3065 gen_helper_gvec_sar16i, 3066 gen_helper_gvec_sar32i, 3067 gen_helper_gvec_sar64i, 3068 }, 3069 .s_list = { INDEX_op_sars_vec, 0 }, 3070 .v_list = { INDEX_op_sarv_vec, 0 }, 3071 }; 3072 3073 tcg_debug_assert(vece <= MO_64); 3074 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3075 } 3076 3077 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, 3078 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3079 { 3080 static const GVecGen2sh g = { 3081 .fni4 = tcg_gen_rotl_i32, 3082 .fni8 = tcg_gen_rotl_i64, 3083 .fniv_s = tcg_gen_rotls_vec, 3084 .fniv_v = tcg_gen_rotlv_vec, 3085 .fno = { 3086 gen_helper_gvec_rotl8i, 3087 gen_helper_gvec_rotl16i, 3088 gen_helper_gvec_rotl32i, 3089 gen_helper_gvec_rotl64i, 3090 }, 3091 .s_list = { INDEX_op_rotls_vec, 0 }, 3092 .v_list = { INDEX_op_rotlv_vec, 0 }, 3093 }; 3094 3095 tcg_debug_assert(vece <= MO_64); 3096 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3097 } 3098 3099 /* 3100 * Expand D = A << (B % element bits) 3101 * 3102 * Unlike scalar shifts, where it is easy for the target front end 3103 * to include the modulo as part of the expansion. If the target 3104 * naturally includes the modulo as part of the operation, great! 3105 * If the target has some other behaviour from out-of-range shifts, 3106 * then it could not use this function anyway, and would need to 3107 * do it's own expansion with custom functions. 3108 */ 3109 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d, 3110 TCGv_vec a, TCGv_vec b) 3111 { 3112 TCGv_vec t = tcg_temp_new_vec_matching(d); 3113 3114 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3115 tcg_gen_and_vec(vece, t, t, b); 3116 tcg_gen_shlv_vec(vece, d, a, t); 3117 tcg_temp_free_vec(t); 3118 } 3119 3120 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3121 { 3122 TCGv_i32 t = tcg_temp_new_i32(); 3123 3124 tcg_gen_andi_i32(t, b, 31); 3125 tcg_gen_shl_i32(d, a, t); 3126 tcg_temp_free_i32(t); 3127 } 3128 3129 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3130 { 3131 TCGv_i64 t = tcg_temp_new_i64(); 3132 3133 tcg_gen_andi_i64(t, b, 63); 3134 tcg_gen_shl_i64(d, a, t); 3135 tcg_temp_free_i64(t); 3136 } 3137 3138 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3139 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3140 { 3141 static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 }; 3142 static const GVecGen3 g[4] = { 3143 { .fniv = tcg_gen_shlv_mod_vec, 3144 .fno = gen_helper_gvec_shl8v, 3145 .opt_opc = vecop_list, 3146 .vece = MO_8 }, 3147 { .fniv = tcg_gen_shlv_mod_vec, 3148 .fno = gen_helper_gvec_shl16v, 3149 .opt_opc = vecop_list, 3150 .vece = MO_16 }, 3151 { .fni4 = tcg_gen_shl_mod_i32, 3152 .fniv = tcg_gen_shlv_mod_vec, 3153 .fno = gen_helper_gvec_shl32v, 3154 .opt_opc = vecop_list, 3155 .vece = MO_32 }, 3156 { .fni8 = tcg_gen_shl_mod_i64, 3157 .fniv = tcg_gen_shlv_mod_vec, 3158 .fno = gen_helper_gvec_shl64v, 3159 .opt_opc = vecop_list, 3160 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3161 .vece = MO_64 }, 3162 }; 3163 3164 tcg_debug_assert(vece <= MO_64); 3165 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3166 } 3167 3168 /* 3169 * Similarly for logical right shifts. 
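 * The count is again reduced modulo the element width before the shift,
 * so e.g. a per-lane count of 35 on MO_32 elements shifts right by 3.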
3170 */ 3171 3172 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3173 TCGv_vec a, TCGv_vec b) 3174 { 3175 TCGv_vec t = tcg_temp_new_vec_matching(d); 3176 3177 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3178 tcg_gen_and_vec(vece, t, t, b); 3179 tcg_gen_shrv_vec(vece, d, a, t); 3180 tcg_temp_free_vec(t); 3181 } 3182 3183 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3184 { 3185 TCGv_i32 t = tcg_temp_new_i32(); 3186 3187 tcg_gen_andi_i32(t, b, 31); 3188 tcg_gen_shr_i32(d, a, t); 3189 tcg_temp_free_i32(t); 3190 } 3191 3192 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3193 { 3194 TCGv_i64 t = tcg_temp_new_i64(); 3195 3196 tcg_gen_andi_i64(t, b, 63); 3197 tcg_gen_shr_i64(d, a, t); 3198 tcg_temp_free_i64(t); 3199 } 3200 3201 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3202 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3203 { 3204 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3205 static const GVecGen3 g[4] = { 3206 { .fniv = tcg_gen_shrv_mod_vec, 3207 .fno = gen_helper_gvec_shr8v, 3208 .opt_opc = vecop_list, 3209 .vece = MO_8 }, 3210 { .fniv = tcg_gen_shrv_mod_vec, 3211 .fno = gen_helper_gvec_shr16v, 3212 .opt_opc = vecop_list, 3213 .vece = MO_16 }, 3214 { .fni4 = tcg_gen_shr_mod_i32, 3215 .fniv = tcg_gen_shrv_mod_vec, 3216 .fno = gen_helper_gvec_shr32v, 3217 .opt_opc = vecop_list, 3218 .vece = MO_32 }, 3219 { .fni8 = tcg_gen_shr_mod_i64, 3220 .fniv = tcg_gen_shrv_mod_vec, 3221 .fno = gen_helper_gvec_shr64v, 3222 .opt_opc = vecop_list, 3223 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3224 .vece = MO_64 }, 3225 }; 3226 3227 tcg_debug_assert(vece <= MO_64); 3228 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3229 } 3230 3231 /* 3232 * Similarly for arithmetic right shifts. 
3233 */ 3234 3235 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3236 TCGv_vec a, TCGv_vec b) 3237 { 3238 TCGv_vec t = tcg_temp_new_vec_matching(d); 3239 3240 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3241 tcg_gen_and_vec(vece, t, t, b); 3242 tcg_gen_sarv_vec(vece, d, a, t); 3243 tcg_temp_free_vec(t); 3244 } 3245 3246 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3247 { 3248 TCGv_i32 t = tcg_temp_new_i32(); 3249 3250 tcg_gen_andi_i32(t, b, 31); 3251 tcg_gen_sar_i32(d, a, t); 3252 tcg_temp_free_i32(t); 3253 } 3254 3255 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3256 { 3257 TCGv_i64 t = tcg_temp_new_i64(); 3258 3259 tcg_gen_andi_i64(t, b, 63); 3260 tcg_gen_sar_i64(d, a, t); 3261 tcg_temp_free_i64(t); 3262 } 3263 3264 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3265 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3266 { 3267 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3268 static const GVecGen3 g[4] = { 3269 { .fniv = tcg_gen_sarv_mod_vec, 3270 .fno = gen_helper_gvec_sar8v, 3271 .opt_opc = vecop_list, 3272 .vece = MO_8 }, 3273 { .fniv = tcg_gen_sarv_mod_vec, 3274 .fno = gen_helper_gvec_sar16v, 3275 .opt_opc = vecop_list, 3276 .vece = MO_16 }, 3277 { .fni4 = tcg_gen_sar_mod_i32, 3278 .fniv = tcg_gen_sarv_mod_vec, 3279 .fno = gen_helper_gvec_sar32v, 3280 .opt_opc = vecop_list, 3281 .vece = MO_32 }, 3282 { .fni8 = tcg_gen_sar_mod_i64, 3283 .fniv = tcg_gen_sarv_mod_vec, 3284 .fno = gen_helper_gvec_sar64v, 3285 .opt_opc = vecop_list, 3286 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3287 .vece = MO_64 }, 3288 }; 3289 3290 tcg_debug_assert(vece <= MO_64); 3291 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3292 } 3293 3294 /* 3295 * Similarly for rotates. 
3296 */ 3297 3298 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3299 TCGv_vec a, TCGv_vec b) 3300 { 3301 TCGv_vec t = tcg_temp_new_vec_matching(d); 3302 3303 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3304 tcg_gen_and_vec(vece, t, t, b); 3305 tcg_gen_rotlv_vec(vece, d, a, t); 3306 tcg_temp_free_vec(t); 3307 } 3308 3309 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3310 { 3311 TCGv_i32 t = tcg_temp_new_i32(); 3312 3313 tcg_gen_andi_i32(t, b, 31); 3314 tcg_gen_rotl_i32(d, a, t); 3315 tcg_temp_free_i32(t); 3316 } 3317 3318 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3319 { 3320 TCGv_i64 t = tcg_temp_new_i64(); 3321 3322 tcg_gen_andi_i64(t, b, 63); 3323 tcg_gen_rotl_i64(d, a, t); 3324 tcg_temp_free_i64(t); 3325 } 3326 3327 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3328 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3329 { 3330 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3331 static const GVecGen3 g[4] = { 3332 { .fniv = tcg_gen_rotlv_mod_vec, 3333 .fno = gen_helper_gvec_rotl8v, 3334 .opt_opc = vecop_list, 3335 .vece = MO_8 }, 3336 { .fniv = tcg_gen_rotlv_mod_vec, 3337 .fno = gen_helper_gvec_rotl16v, 3338 .opt_opc = vecop_list, 3339 .vece = MO_16 }, 3340 { .fni4 = tcg_gen_rotl_mod_i32, 3341 .fniv = tcg_gen_rotlv_mod_vec, 3342 .fno = gen_helper_gvec_rotl32v, 3343 .opt_opc = vecop_list, 3344 .vece = MO_32 }, 3345 { .fni8 = tcg_gen_rotl_mod_i64, 3346 .fniv = tcg_gen_rotlv_mod_vec, 3347 .fno = gen_helper_gvec_rotl64v, 3348 .opt_opc = vecop_list, 3349 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3350 .vece = MO_64 }, 3351 }; 3352 3353 tcg_debug_assert(vece <= MO_64); 3354 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3355 } 3356 3357 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3358 TCGv_vec a, TCGv_vec b) 3359 { 3360 TCGv_vec t = tcg_temp_new_vec_matching(d); 3361 3362 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3363 tcg_gen_and_vec(vece, t, t, b); 3364 tcg_gen_rotrv_vec(vece, d, a, t); 3365 tcg_temp_free_vec(t); 3366 } 3367 3368 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3369 { 3370 TCGv_i32 t = tcg_temp_new_i32(); 3371 3372 tcg_gen_andi_i32(t, b, 31); 3373 tcg_gen_rotr_i32(d, a, t); 3374 tcg_temp_free_i32(t); 3375 } 3376 3377 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3378 { 3379 TCGv_i64 t = tcg_temp_new_i64(); 3380 3381 tcg_gen_andi_i64(t, b, 63); 3382 tcg_gen_rotr_i64(d, a, t); 3383 tcg_temp_free_i64(t); 3384 } 3385 3386 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3387 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3388 { 3389 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3390 static const GVecGen3 g[4] = { 3391 { .fniv = tcg_gen_rotrv_mod_vec, 3392 .fno = gen_helper_gvec_rotr8v, 3393 .opt_opc = vecop_list, 3394 .vece = MO_8 }, 3395 { .fniv = tcg_gen_rotrv_mod_vec, 3396 .fno = gen_helper_gvec_rotr16v, 3397 .opt_opc = vecop_list, 3398 .vece = MO_16 }, 3399 { .fni4 = tcg_gen_rotr_mod_i32, 3400 .fniv = tcg_gen_rotrv_mod_vec, 3401 .fno = gen_helper_gvec_rotr32v, 3402 .opt_opc = vecop_list, 3403 .vece = MO_32 }, 3404 { .fni8 = tcg_gen_rotr_mod_i64, 3405 .fniv = tcg_gen_rotrv_mod_vec, 3406 .fno = gen_helper_gvec_rotr64v, 3407 .opt_opc = vecop_list, 3408 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3409 .vece = MO_64 }, 3410 }; 3411 3412 tcg_debug_assert(vece <= MO_64); 3413 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3414 } 3415 3416 /* Expand OPSZ bytes worth 
of three-operand operations using i32 elements. */ 3417 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3418 uint32_t oprsz, TCGCond cond) 3419 { 3420 TCGv_i32 t0 = tcg_temp_new_i32(); 3421 TCGv_i32 t1 = tcg_temp_new_i32(); 3422 uint32_t i; 3423 3424 for (i = 0; i < oprsz; i += 4) { 3425 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3426 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3427 tcg_gen_setcond_i32(cond, t0, t0, t1); 3428 tcg_gen_neg_i32(t0, t0); 3429 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3430 } 3431 tcg_temp_free_i32(t1); 3432 tcg_temp_free_i32(t0); 3433 } 3434 3435 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3436 uint32_t oprsz, TCGCond cond) 3437 { 3438 TCGv_i64 t0 = tcg_temp_new_i64(); 3439 TCGv_i64 t1 = tcg_temp_new_i64(); 3440 uint32_t i; 3441 3442 for (i = 0; i < oprsz; i += 8) { 3443 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3444 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3445 tcg_gen_setcond_i64(cond, t0, t0, t1); 3446 tcg_gen_neg_i64(t0, t0); 3447 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3448 } 3449 tcg_temp_free_i64(t1); 3450 tcg_temp_free_i64(t0); 3451 } 3452 3453 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3454 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3455 TCGType type, TCGCond cond) 3456 { 3457 TCGv_vec t0 = tcg_temp_new_vec(type); 3458 TCGv_vec t1 = tcg_temp_new_vec(type); 3459 uint32_t i; 3460 3461 for (i = 0; i < oprsz; i += tysz) { 3462 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3463 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3464 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3465 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3466 } 3467 tcg_temp_free_vec(t1); 3468 tcg_temp_free_vec(t0); 3469 } 3470 3471 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3472 uint32_t aofs, uint32_t bofs, 3473 uint32_t oprsz, uint32_t maxsz) 3474 { 3475 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3476 static gen_helper_gvec_3 * const eq_fn[4] = { 3477 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3478 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3479 }; 3480 static gen_helper_gvec_3 * const ne_fn[4] = { 3481 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3482 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3483 }; 3484 static gen_helper_gvec_3 * const lt_fn[4] = { 3485 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3486 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3487 }; 3488 static gen_helper_gvec_3 * const le_fn[4] = { 3489 gen_helper_gvec_le8, gen_helper_gvec_le16, 3490 gen_helper_gvec_le32, gen_helper_gvec_le64 3491 }; 3492 static gen_helper_gvec_3 * const ltu_fn[4] = { 3493 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3494 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3495 }; 3496 static gen_helper_gvec_3 * const leu_fn[4] = { 3497 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3498 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3499 }; 3500 static gen_helper_gvec_3 * const * const fns[16] = { 3501 [TCG_COND_EQ] = eq_fn, 3502 [TCG_COND_NE] = ne_fn, 3503 [TCG_COND_LT] = lt_fn, 3504 [TCG_COND_LE] = le_fn, 3505 [TCG_COND_LTU] = ltu_fn, 3506 [TCG_COND_LEU] = leu_fn, 3507 }; 3508 3509 const TCGOpcode *hold_list; 3510 TCGType type; 3511 uint32_t some; 3512 3513 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3514 check_overlap_3(dofs, aofs, bofs, maxsz); 3515 3516 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3517 do_dup(MO_8, dofs, oprsz, maxsz, 3518 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3519 return; 3520 } 3521 3522 /* 3523 * Implement inline with a vector type, if possible. 
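     * Each lane of a vector comparison result is all-ones when the
     * condition holds and zero otherwise, matching the setcond+neg
     * integer fallback below.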
3524 * Prefer integer when 64-bit host and 64-bit comparison. 3525 */ 3526 hold_list = tcg_swap_vecop_list(cmp_list); 3527 type = choose_vector_type(cmp_list, vece, oprsz, 3528 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3529 switch (type) { 3530 case TCG_TYPE_V256: 3531 /* Recall that ARM SVE allows vector sizes that are not a 3532 * power of 2, but always a multiple of 16. The intent is 3533 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3534 */ 3535 some = QEMU_ALIGN_DOWN(oprsz, 32); 3536 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3537 if (some == oprsz) { 3538 break; 3539 } 3540 dofs += some; 3541 aofs += some; 3542 bofs += some; 3543 oprsz -= some; 3544 maxsz -= some; 3545 /* fallthru */ 3546 case TCG_TYPE_V128: 3547 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3548 break; 3549 case TCG_TYPE_V64: 3550 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3551 break; 3552 3553 case 0: 3554 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3555 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3556 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3557 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3558 } else { 3559 gen_helper_gvec_3 * const *fn = fns[cond]; 3560 3561 if (fn == NULL) { 3562 uint32_t tmp; 3563 tmp = aofs, aofs = bofs, bofs = tmp; 3564 cond = tcg_swap_cond(cond); 3565 fn = fns[cond]; 3566 assert(fn != NULL); 3567 } 3568 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3569 oprsz = maxsz; 3570 } 3571 break; 3572 3573 default: 3574 g_assert_not_reached(); 3575 } 3576 tcg_swap_vecop_list(hold_list); 3577 3578 if (oprsz < maxsz) { 3579 expand_clr(dofs + oprsz, maxsz - oprsz); 3580 } 3581 } 3582 3583 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3584 { 3585 TCGv_i64 t = tcg_temp_new_i64(); 3586 3587 tcg_gen_and_i64(t, b, a); 3588 tcg_gen_andc_i64(d, c, a); 3589 tcg_gen_or_i64(d, d, t); 3590 tcg_temp_free_i64(t); 3591 } 3592 3593 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3594 uint32_t bofs, uint32_t cofs, 3595 uint32_t oprsz, uint32_t maxsz) 3596 { 3597 static const GVecGen4 g = { 3598 .fni8 = tcg_gen_bitsel_i64, 3599 .fniv = tcg_gen_bitsel_vec, 3600 .fno = gen_helper_gvec_bitsel, 3601 }; 3602 3603 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3604 } 3605
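/*
 * Illustrative combination of the expanders above, as a guest front end
 * might write it (offsets and sizes are hypothetical, not taken from any
 * real target): select the per-lane signed minimum via a compare mask
 * and a bitwise select, which is equivalent to tcg_gen_gvec_smin:
 *
 *   tcg_gen_gvec_cmp(TCG_COND_LT, MO_32, mofs, aofs, bofs, 16, 16);
 *   tcg_gen_gvec_bitsel(MO_32, dofs, mofs, aofs, bofs, 16, 16);
 */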