/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
#include "tcg/tcg-gvec-desc.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t max_align;

    switch (oprsz) {
    case 8:
    case 16:
    case 32:
        tcg_debug_assert(oprsz <= maxsz);
        break;
    default:
        tcg_debug_assert(oprsz == maxsz);
        break;
    }
    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));

    max_align = maxsz >= 16 ? 15 : 7;
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    check_size_align(oprsz, maxsz, 0);
    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;

    /*
     * We have just asserted in check_size_align that either
     * oprsz is {8,16,32} or matches maxsz.  Encode the final
     * case with '2', as that would otherwise map to 24.
     */
    if (oprsz == maxsz) {
        oprsz = 2;
    }

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

/* Generate a call to a gvec-style helper with two vector operands.
*/ 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 114 uint32_t oprsz, uint32_t maxsz, int32_t data, 115 gen_helper_gvec_2 *fn) 116 { 117 TCGv_ptr a0, a1; 118 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 119 120 a0 = tcg_temp_new_ptr(); 121 a1 = tcg_temp_new_ptr(); 122 123 tcg_gen_addi_ptr(a0, cpu_env, dofs); 124 tcg_gen_addi_ptr(a1, cpu_env, aofs); 125 126 fn(a0, a1, desc); 127 128 tcg_temp_free_ptr(a0); 129 tcg_temp_free_ptr(a1); 130 tcg_temp_free_i32(desc); 131 } 132 133 /* Generate a call to a gvec-style helper with two vector operands 134 and one scalar operand. */ 135 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 136 uint32_t oprsz, uint32_t maxsz, int32_t data, 137 gen_helper_gvec_2i *fn) 138 { 139 TCGv_ptr a0, a1; 140 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 141 142 a0 = tcg_temp_new_ptr(); 143 a1 = tcg_temp_new_ptr(); 144 145 tcg_gen_addi_ptr(a0, cpu_env, dofs); 146 tcg_gen_addi_ptr(a1, cpu_env, aofs); 147 148 fn(a0, a1, c, desc); 149 150 tcg_temp_free_ptr(a0); 151 tcg_temp_free_ptr(a1); 152 tcg_temp_free_i32(desc); 153 } 154 155 /* Generate a call to a gvec-style helper with three vector operands. */ 156 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 157 uint32_t oprsz, uint32_t maxsz, int32_t data, 158 gen_helper_gvec_3 *fn) 159 { 160 TCGv_ptr a0, a1, a2; 161 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 162 163 a0 = tcg_temp_new_ptr(); 164 a1 = tcg_temp_new_ptr(); 165 a2 = tcg_temp_new_ptr(); 166 167 tcg_gen_addi_ptr(a0, cpu_env, dofs); 168 tcg_gen_addi_ptr(a1, cpu_env, aofs); 169 tcg_gen_addi_ptr(a2, cpu_env, bofs); 170 171 fn(a0, a1, a2, desc); 172 173 tcg_temp_free_ptr(a0); 174 tcg_temp_free_ptr(a1); 175 tcg_temp_free_ptr(a2); 176 tcg_temp_free_i32(desc); 177 } 178 179 /* Generate a call to a gvec-style helper with four vector operands. */ 180 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 181 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 182 int32_t data, gen_helper_gvec_4 *fn) 183 { 184 TCGv_ptr a0, a1, a2, a3; 185 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 186 187 a0 = tcg_temp_new_ptr(); 188 a1 = tcg_temp_new_ptr(); 189 a2 = tcg_temp_new_ptr(); 190 a3 = tcg_temp_new_ptr(); 191 192 tcg_gen_addi_ptr(a0, cpu_env, dofs); 193 tcg_gen_addi_ptr(a1, cpu_env, aofs); 194 tcg_gen_addi_ptr(a2, cpu_env, bofs); 195 tcg_gen_addi_ptr(a3, cpu_env, cofs); 196 197 fn(a0, a1, a2, a3, desc); 198 199 tcg_temp_free_ptr(a0); 200 tcg_temp_free_ptr(a1); 201 tcg_temp_free_ptr(a2); 202 tcg_temp_free_ptr(a3); 203 tcg_temp_free_i32(desc); 204 } 205 206 /* Generate a call to a gvec-style helper with five vector operands. 
 */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
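
/*
 * As a rough illustration of what the "fn" callbacks above expand into:
 * an out-of-line gvec helper receives host pointers computed from the
 * env offsets plus the descriptor built by simd_desc(), and unpacks it
 * with simd_oprsz()/simd_maxsz()/simd_data() from "tcg/tcg-gvec-desc.h".
 * A three-operand helper could be sketched like this (gvec_foo8 and
 * do_foo8 are illustrative placeholders, not helpers defined here):
 *
 *     void HELPER(gvec_foo8)(void *d, void *a, void *b, uint32_t desc)
 *     {
 *         intptr_t i, oprsz = simd_oprsz(desc);
 *         intptr_t maxsz = simd_maxsz(desc);
 *
 *         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = do_foo8(*(uint64_t *)(a + i),
 *                                            *(uint64_t *)(b + i));
 *         }
 *         for (; i < maxsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = 0;
 *         }
 *     }
 *
 * The second loop zeroes the bytes between oprsz and maxsz, mirroring
 * what expand_clr() does for the inline expansions in this file.
 */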
/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.
Return 0 if no vector type is selected. 443 */ 444 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 445 uint32_t size, bool prefer_i64) 446 { 447 /* 448 * Recall that ARM SVE allows vector sizes that are not a 449 * power of 2, but always a multiple of 16. The intent is 450 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 451 * It is hard to imagine a case in which v256 is supported 452 * but v128 is not, but check anyway. 453 * In addition, expand_clr needs to handle a multiple of 8. 454 */ 455 if (TCG_TARGET_HAS_v256 && 456 check_size_impl(size, 32) && 457 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 458 (!(size & 16) || 459 (TCG_TARGET_HAS_v128 && 460 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 461 (!(size & 8) || 462 (TCG_TARGET_HAS_v64 && 463 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 464 return TCG_TYPE_V256; 465 } 466 if (TCG_TARGET_HAS_v128 && 467 check_size_impl(size, 16) && 468 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 469 (!(size & 8) || 470 (TCG_TARGET_HAS_v64 && 471 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 472 return TCG_TYPE_V128; 473 } 474 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 475 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 476 return TCG_TYPE_V64; 477 } 478 return 0; 479 } 480 481 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 482 uint32_t maxsz, TCGv_vec t_vec) 483 { 484 uint32_t i = 0; 485 486 tcg_debug_assert(oprsz >= 8); 487 488 /* 489 * This may be expand_clr for the tail of an operation, e.g. 490 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 491 * are misaligned wrt the maximum vector size, so do that first. 492 */ 493 if (dofs & 8) { 494 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 495 i += 8; 496 } 497 498 switch (type) { 499 case TCG_TYPE_V256: 500 /* 501 * Recall that ARM SVE allows vector sizes that are not a 502 * power of 2, but always a multiple of 16. The intent is 503 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 504 */ 505 for (; i + 32 <= oprsz; i += 32) { 506 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 507 } 508 /* fallthru */ 509 case TCG_TYPE_V128: 510 for (; i + 16 <= oprsz; i += 16) { 511 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 512 } 513 break; 514 case TCG_TYPE_V64: 515 for (; i < oprsz; i += 8) { 516 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 517 } 518 break; 519 default: 520 g_assert_not_reached(); 521 } 522 523 if (oprsz < maxsz) { 524 expand_clr(dofs + oprsz, maxsz - oprsz); 525 } 526 } 527 528 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 529 * Only one of IN_32 or IN_64 may be set; 530 * IN_C is used if IN_32 and IN_64 are unset. 531 */ 532 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 533 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 534 uint64_t in_c) 535 { 536 TCGType type; 537 TCGv_i64 t_64; 538 TCGv_i32 t_32, t_desc; 539 TCGv_ptr t_ptr; 540 uint32_t i; 541 542 assert(vece <= (in_32 ? MO_32 : MO_64)); 543 assert(in_32 == NULL || in_64 == NULL); 544 545 /* If we're storing 0, expand oprsz to maxsz. */ 546 if (in_32 == NULL && in_64 == NULL) { 547 in_c = dup_const(vece, in_c); 548 if (in_c == 0) { 549 oprsz = maxsz; 550 } 551 } 552 553 /* Implement inline with a vector type, if possible. 554 * Prefer integer when 64-bit host and no variable dup. 
555 */ 556 type = choose_vector_type(NULL, vece, oprsz, 557 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 558 && (in_64 == NULL || vece == MO_64))); 559 if (type != 0) { 560 TCGv_vec t_vec = tcg_temp_new_vec(type); 561 562 if (in_32) { 563 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 564 } else if (in_64) { 565 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 566 } else { 567 tcg_gen_dupi_vec(vece, t_vec, in_c); 568 } 569 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 570 tcg_temp_free_vec(t_vec); 571 return; 572 } 573 574 /* Otherwise, inline with an integer type, unless "large". */ 575 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 576 t_64 = NULL; 577 t_32 = NULL; 578 579 if (in_32) { 580 /* We are given a 32-bit variable input. For a 64-bit host, 581 use a 64-bit operation unless the 32-bit operation would 582 be simple enough. */ 583 if (TCG_TARGET_REG_BITS == 64 584 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 585 t_64 = tcg_temp_new_i64(); 586 tcg_gen_extu_i32_i64(t_64, in_32); 587 gen_dup_i64(vece, t_64, t_64); 588 } else { 589 t_32 = tcg_temp_new_i32(); 590 gen_dup_i32(vece, t_32, in_32); 591 } 592 } else if (in_64) { 593 /* We are given a 64-bit variable input. */ 594 t_64 = tcg_temp_new_i64(); 595 gen_dup_i64(vece, t_64, in_64); 596 } else { 597 /* We are given a constant input. */ 598 /* For 64-bit hosts, use 64-bit constants for "simple" constants 599 or when we'd need too many 32-bit stores, or when a 64-bit 600 constant is really required. */ 601 if (vece == MO_64 602 || (TCG_TARGET_REG_BITS == 64 603 && (in_c == 0 || in_c == -1 604 || !check_size_impl(oprsz, 4)))) { 605 t_64 = tcg_const_i64(in_c); 606 } else { 607 t_32 = tcg_const_i32(in_c); 608 } 609 } 610 611 /* Implement inline if we picked an implementation size above. */ 612 if (t_32) { 613 for (i = 0; i < oprsz; i += 4) { 614 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 615 } 616 tcg_temp_free_i32(t_32); 617 goto done; 618 } 619 if (t_64) { 620 for (i = 0; i < oprsz; i += 8) { 621 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 622 } 623 tcg_temp_free_i64(t_64); 624 goto done; 625 } 626 } 627 628 /* Otherwise implement out of line. */ 629 t_ptr = tcg_temp_new_ptr(); 630 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 631 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 632 633 if (vece == MO_64) { 634 if (in_64) { 635 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 636 } else { 637 t_64 = tcg_const_i64(in_c); 638 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 639 tcg_temp_free_i64(t_64); 640 } 641 } else { 642 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 643 static dup_fn * const fns[3] = { 644 gen_helper_gvec_dup8, 645 gen_helper_gvec_dup16, 646 gen_helper_gvec_dup32 647 }; 648 649 if (in_32) { 650 fns[vece](t_ptr, t_desc, in_32); 651 } else { 652 t_32 = tcg_temp_new_i32(); 653 if (in_64) { 654 tcg_gen_extrl_i64_i32(t_32, in_64); 655 } else if (vece == MO_8) { 656 tcg_gen_movi_i32(t_32, in_c & 0xff); 657 } else if (vece == MO_16) { 658 tcg_gen_movi_i32(t_32, in_c & 0xffff); 659 } else { 660 tcg_gen_movi_i32(t_32, in_c); 661 } 662 fns[vece](t_ptr, t_desc, t_32); 663 tcg_temp_free_i32(t_32); 664 } 665 } 666 667 tcg_temp_free_ptr(t_ptr); 668 tcg_temp_free_i32(t_desc); 669 return; 670 671 done: 672 if (oprsz < maxsz) { 673 expand_clr(dofs + oprsz, maxsz - oprsz); 674 } 675 } 676 677 /* Likewise, but with zero. */ 678 static void expand_clr(uint32_t dofs, uint32_t maxsz) 679 { 680 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 681 } 682 683 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. 
 */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.
*/ 792 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 793 uint32_t cofs, uint32_t oprsz, bool write_aofs, 794 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) 795 { 796 TCGv_i32 t0 = tcg_temp_new_i32(); 797 TCGv_i32 t1 = tcg_temp_new_i32(); 798 TCGv_i32 t2 = tcg_temp_new_i32(); 799 TCGv_i32 t3 = tcg_temp_new_i32(); 800 uint32_t i; 801 802 for (i = 0; i < oprsz; i += 4) { 803 tcg_gen_ld_i32(t1, cpu_env, aofs + i); 804 tcg_gen_ld_i32(t2, cpu_env, bofs + i); 805 tcg_gen_ld_i32(t3, cpu_env, cofs + i); 806 fni(t0, t1, t2, t3); 807 tcg_gen_st_i32(t0, cpu_env, dofs + i); 808 if (write_aofs) { 809 tcg_gen_st_i32(t1, cpu_env, aofs + i); 810 } 811 } 812 tcg_temp_free_i32(t3); 813 tcg_temp_free_i32(t2); 814 tcg_temp_free_i32(t1); 815 tcg_temp_free_i32(t0); 816 } 817 818 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ 819 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 820 bool load_dest, void (*fni)(TCGv_i64, TCGv_i64)) 821 { 822 TCGv_i64 t0 = tcg_temp_new_i64(); 823 TCGv_i64 t1 = tcg_temp_new_i64(); 824 uint32_t i; 825 826 for (i = 0; i < oprsz; i += 8) { 827 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 828 if (load_dest) { 829 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 830 } 831 fni(t1, t0); 832 tcg_gen_st_i64(t1, cpu_env, dofs + i); 833 } 834 tcg_temp_free_i64(t0); 835 tcg_temp_free_i64(t1); 836 } 837 838 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 839 int64_t c, bool load_dest, 840 void (*fni)(TCGv_i64, TCGv_i64, int64_t)) 841 { 842 TCGv_i64 t0 = tcg_temp_new_i64(); 843 TCGv_i64 t1 = tcg_temp_new_i64(); 844 uint32_t i; 845 846 for (i = 0; i < oprsz; i += 8) { 847 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 848 if (load_dest) { 849 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 850 } 851 fni(t1, t0, c); 852 tcg_gen_st_i64(t1, cpu_env, dofs + i); 853 } 854 tcg_temp_free_i64(t0); 855 tcg_temp_free_i64(t1); 856 } 857 858 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 859 TCGv_i64 c, bool scalar_first, 860 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 861 { 862 TCGv_i64 t0 = tcg_temp_new_i64(); 863 TCGv_i64 t1 = tcg_temp_new_i64(); 864 uint32_t i; 865 866 for (i = 0; i < oprsz; i += 8) { 867 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 868 if (scalar_first) { 869 fni(t1, c, t0); 870 } else { 871 fni(t1, t0, c); 872 } 873 tcg_gen_st_i64(t1, cpu_env, dofs + i); 874 } 875 tcg_temp_free_i64(t0); 876 tcg_temp_free_i64(t1); 877 } 878 879 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. 
 */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.
*/ 977 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 978 uint32_t oprsz, uint32_t tysz, TCGType type, 979 int64_t c, bool load_dest, 980 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 981 { 982 TCGv_vec t0 = tcg_temp_new_vec(type); 983 TCGv_vec t1 = tcg_temp_new_vec(type); 984 uint32_t i; 985 986 for (i = 0; i < oprsz; i += tysz) { 987 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 988 if (load_dest) { 989 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 990 } 991 fni(vece, t1, t0, c); 992 tcg_gen_st_vec(t1, cpu_env, dofs + i); 993 } 994 tcg_temp_free_vec(t0); 995 tcg_temp_free_vec(t1); 996 } 997 998 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 999 uint32_t oprsz, uint32_t tysz, TCGType type, 1000 TCGv_vec c, bool scalar_first, 1001 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1002 { 1003 TCGv_vec t0 = tcg_temp_new_vec(type); 1004 TCGv_vec t1 = tcg_temp_new_vec(type); 1005 uint32_t i; 1006 1007 for (i = 0; i < oprsz; i += tysz) { 1008 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1009 if (scalar_first) { 1010 fni(vece, t1, c, t0); 1011 } else { 1012 fni(vece, t1, t0, c); 1013 } 1014 tcg_gen_st_vec(t1, cpu_env, dofs + i); 1015 } 1016 tcg_temp_free_vec(t0); 1017 tcg_temp_free_vec(t1); 1018 } 1019 1020 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 1021 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1022 uint32_t bofs, uint32_t oprsz, 1023 uint32_t tysz, TCGType type, bool load_dest, 1024 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1025 { 1026 TCGv_vec t0 = tcg_temp_new_vec(type); 1027 TCGv_vec t1 = tcg_temp_new_vec(type); 1028 TCGv_vec t2 = tcg_temp_new_vec(type); 1029 uint32_t i; 1030 1031 for (i = 0; i < oprsz; i += tysz) { 1032 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1033 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1034 if (load_dest) { 1035 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1036 } 1037 fni(vece, t2, t0, t1); 1038 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1039 } 1040 tcg_temp_free_vec(t2); 1041 tcg_temp_free_vec(t1); 1042 tcg_temp_free_vec(t0); 1043 } 1044 1045 /* 1046 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1047 * using host vectors. 1048 */ 1049 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1050 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1051 TCGType type, int64_t c, bool load_dest, 1052 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1053 int64_t)) 1054 { 1055 TCGv_vec t0 = tcg_temp_new_vec(type); 1056 TCGv_vec t1 = tcg_temp_new_vec(type); 1057 TCGv_vec t2 = tcg_temp_new_vec(type); 1058 uint32_t i; 1059 1060 for (i = 0; i < oprsz; i += tysz) { 1061 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1062 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1063 if (load_dest) { 1064 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1065 } 1066 fni(vece, t2, t0, t1, c); 1067 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1068 } 1069 tcg_temp_free_vec(t0); 1070 tcg_temp_free_vec(t1); 1071 tcg_temp_free_vec(t2); 1072 } 1073 1074 /* Expand OPSZ bytes worth of four-operand operations using host vectors. 
*/ 1075 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1076 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1077 uint32_t tysz, TCGType type, bool write_aofs, 1078 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1079 TCGv_vec, TCGv_vec)) 1080 { 1081 TCGv_vec t0 = tcg_temp_new_vec(type); 1082 TCGv_vec t1 = tcg_temp_new_vec(type); 1083 TCGv_vec t2 = tcg_temp_new_vec(type); 1084 TCGv_vec t3 = tcg_temp_new_vec(type); 1085 uint32_t i; 1086 1087 for (i = 0; i < oprsz; i += tysz) { 1088 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1089 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1090 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1091 fni(vece, t0, t1, t2, t3); 1092 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1093 if (write_aofs) { 1094 tcg_gen_st_vec(t1, cpu_env, aofs + i); 1095 } 1096 } 1097 tcg_temp_free_vec(t3); 1098 tcg_temp_free_vec(t2); 1099 tcg_temp_free_vec(t1); 1100 tcg_temp_free_vec(t0); 1101 } 1102 1103 /* Expand a vector two-operand operation. */ 1104 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1105 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1106 { 1107 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1108 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1109 TCGType type; 1110 uint32_t some; 1111 1112 check_size_align(oprsz, maxsz, dofs | aofs); 1113 check_overlap_2(dofs, aofs, maxsz); 1114 1115 type = 0; 1116 if (g->fniv) { 1117 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1118 } 1119 switch (type) { 1120 case TCG_TYPE_V256: 1121 /* Recall that ARM SVE allows vector sizes that are not a 1122 * power of 2, but always a multiple of 16. The intent is 1123 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1124 */ 1125 some = QEMU_ALIGN_DOWN(oprsz, 32); 1126 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1127 g->load_dest, g->fniv); 1128 if (some == oprsz) { 1129 break; 1130 } 1131 dofs += some; 1132 aofs += some; 1133 oprsz -= some; 1134 maxsz -= some; 1135 /* fallthru */ 1136 case TCG_TYPE_V128: 1137 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1138 g->load_dest, g->fniv); 1139 break; 1140 case TCG_TYPE_V64: 1141 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1142 g->load_dest, g->fniv); 1143 break; 1144 1145 case 0: 1146 if (g->fni8 && check_size_impl(oprsz, 8)) { 1147 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8); 1148 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1149 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4); 1150 } else { 1151 assert(g->fno != NULL); 1152 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1153 oprsz = maxsz; 1154 } 1155 break; 1156 1157 default: 1158 g_assert_not_reached(); 1159 } 1160 tcg_swap_vecop_list(hold_list); 1161 1162 if (oprsz < maxsz) { 1163 expand_clr(dofs + oprsz, maxsz - oprsz); 1164 } 1165 } 1166 1167 /* Expand a vector operation with two vectors and an immediate. */ 1168 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1169 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1170 { 1171 const TCGOpcode *this_list = g->opt_opc ? 
: vecop_list_empty; 1172 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1173 TCGType type; 1174 uint32_t some; 1175 1176 check_size_align(oprsz, maxsz, dofs | aofs); 1177 check_overlap_2(dofs, aofs, maxsz); 1178 1179 type = 0; 1180 if (g->fniv) { 1181 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1182 } 1183 switch (type) { 1184 case TCG_TYPE_V256: 1185 /* Recall that ARM SVE allows vector sizes that are not a 1186 * power of 2, but always a multiple of 16. The intent is 1187 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1188 */ 1189 some = QEMU_ALIGN_DOWN(oprsz, 32); 1190 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1191 c, g->load_dest, g->fniv); 1192 if (some == oprsz) { 1193 break; 1194 } 1195 dofs += some; 1196 aofs += some; 1197 oprsz -= some; 1198 maxsz -= some; 1199 /* fallthru */ 1200 case TCG_TYPE_V128: 1201 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1202 c, g->load_dest, g->fniv); 1203 break; 1204 case TCG_TYPE_V64: 1205 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1206 c, g->load_dest, g->fniv); 1207 break; 1208 1209 case 0: 1210 if (g->fni8 && check_size_impl(oprsz, 8)) { 1211 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1212 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1213 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1214 } else { 1215 if (g->fno) { 1216 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1217 } else { 1218 TCGv_i64 tcg_c = tcg_const_i64(c); 1219 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1220 maxsz, c, g->fnoi); 1221 tcg_temp_free_i64(tcg_c); 1222 } 1223 oprsz = maxsz; 1224 } 1225 break; 1226 1227 default: 1228 g_assert_not_reached(); 1229 } 1230 tcg_swap_vecop_list(hold_list); 1231 1232 if (oprsz < maxsz) { 1233 expand_clr(dofs + oprsz, maxsz - oprsz); 1234 } 1235 } 1236 1237 /* Expand a vector operation with two vectors and a scalar. */ 1238 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1239 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1240 { 1241 TCGType type; 1242 1243 check_size_align(oprsz, maxsz, dofs | aofs); 1244 check_overlap_2(dofs, aofs, maxsz); 1245 1246 type = 0; 1247 if (g->fniv) { 1248 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1249 } 1250 if (type != 0) { 1251 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1252 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1253 TCGv_vec t_vec = tcg_temp_new_vec(type); 1254 uint32_t some; 1255 1256 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1257 1258 switch (type) { 1259 case TCG_TYPE_V256: 1260 /* Recall that ARM SVE allows vector sizes that are not a 1261 * power of 2, but always a multiple of 16. The intent is 1262 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1263 */ 1264 some = QEMU_ALIGN_DOWN(oprsz, 32); 1265 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1266 t_vec, g->scalar_first, g->fniv); 1267 if (some == oprsz) { 1268 break; 1269 } 1270 dofs += some; 1271 aofs += some; 1272 oprsz -= some; 1273 maxsz -= some; 1274 /* fallthru */ 1275 1276 case TCG_TYPE_V128: 1277 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1278 t_vec, g->scalar_first, g->fniv); 1279 break; 1280 1281 case TCG_TYPE_V64: 1282 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1283 t_vec, g->scalar_first, g->fniv); 1284 break; 1285 1286 default: 1287 g_assert_not_reached(); 1288 } 1289 tcg_temp_free_vec(t_vec); 1290 tcg_swap_vecop_list(hold_list); 1291 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1292 TCGv_i64 t64 = tcg_temp_new_i64(); 1293 1294 gen_dup_i64(g->vece, t64, c); 1295 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1296 tcg_temp_free_i64(t64); 1297 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1298 TCGv_i32 t32 = tcg_temp_new_i32(); 1299 1300 tcg_gen_extrl_i64_i32(t32, c); 1301 gen_dup_i32(g->vece, t32, t32); 1302 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1303 tcg_temp_free_i32(t32); 1304 } else { 1305 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1306 return; 1307 } 1308 1309 if (oprsz < maxsz) { 1310 expand_clr(dofs + oprsz, maxsz - oprsz); 1311 } 1312 } 1313 1314 /* Expand a vector three-operand operation. */ 1315 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1316 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1317 { 1318 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1319 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1320 TCGType type; 1321 uint32_t some; 1322 1323 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1324 check_overlap_3(dofs, aofs, bofs, maxsz); 1325 1326 type = 0; 1327 if (g->fniv) { 1328 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1329 } 1330 switch (type) { 1331 case TCG_TYPE_V256: 1332 /* Recall that ARM SVE allows vector sizes that are not a 1333 * power of 2, but always a multiple of 16. The intent is 1334 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1335 */ 1336 some = QEMU_ALIGN_DOWN(oprsz, 32); 1337 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1338 g->load_dest, g->fniv); 1339 if (some == oprsz) { 1340 break; 1341 } 1342 dofs += some; 1343 aofs += some; 1344 bofs += some; 1345 oprsz -= some; 1346 maxsz -= some; 1347 /* fallthru */ 1348 case TCG_TYPE_V128: 1349 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1350 g->load_dest, g->fniv); 1351 break; 1352 case TCG_TYPE_V64: 1353 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1354 g->load_dest, g->fniv); 1355 break; 1356 1357 case 0: 1358 if (g->fni8 && check_size_impl(oprsz, 8)) { 1359 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1360 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1361 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1362 } else { 1363 assert(g->fno != NULL); 1364 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1365 maxsz, g->data, g->fno); 1366 oprsz = maxsz; 1367 } 1368 break; 1369 1370 default: 1371 g_assert_not_reached(); 1372 } 1373 tcg_swap_vecop_list(hold_list); 1374 1375 if (oprsz < maxsz) { 1376 expand_clr(dofs + oprsz, maxsz - oprsz); 1377 } 1378 } 1379 1380 /* Expand a vector operation with three vectors and an immediate. 
*/ 1381 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1382 uint32_t oprsz, uint32_t maxsz, int64_t c, 1383 const GVecGen3i *g) 1384 { 1385 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1386 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1387 TCGType type; 1388 uint32_t some; 1389 1390 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1391 check_overlap_3(dofs, aofs, bofs, maxsz); 1392 1393 type = 0; 1394 if (g->fniv) { 1395 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1396 } 1397 switch (type) { 1398 case TCG_TYPE_V256: 1399 /* 1400 * Recall that ARM SVE allows vector sizes that are not a 1401 * power of 2, but always a multiple of 16. The intent is 1402 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1403 */ 1404 some = QEMU_ALIGN_DOWN(oprsz, 32); 1405 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1406 c, g->load_dest, g->fniv); 1407 if (some == oprsz) { 1408 break; 1409 } 1410 dofs += some; 1411 aofs += some; 1412 bofs += some; 1413 oprsz -= some; 1414 maxsz -= some; 1415 /* fallthru */ 1416 case TCG_TYPE_V128: 1417 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1418 c, g->load_dest, g->fniv); 1419 break; 1420 case TCG_TYPE_V64: 1421 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1422 c, g->load_dest, g->fniv); 1423 break; 1424 1425 case 0: 1426 if (g->fni8 && check_size_impl(oprsz, 8)) { 1427 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1428 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1429 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1430 } else { 1431 assert(g->fno != NULL); 1432 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1433 oprsz = maxsz; 1434 } 1435 break; 1436 1437 default: 1438 g_assert_not_reached(); 1439 } 1440 tcg_swap_vecop_list(hold_list); 1441 1442 if (oprsz < maxsz) { 1443 expand_clr(dofs + oprsz, maxsz - oprsz); 1444 } 1445 } 1446 1447 /* Expand a vector four-operand operation. */ 1448 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1449 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1450 { 1451 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1452 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1453 TCGType type; 1454 uint32_t some; 1455 1456 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1457 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1458 1459 type = 0; 1460 if (g->fniv) { 1461 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1462 } 1463 switch (type) { 1464 case TCG_TYPE_V256: 1465 /* Recall that ARM SVE allows vector sizes that are not a 1466 * power of 2, but always a multiple of 16. The intent is 1467 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1468 */ 1469 some = QEMU_ALIGN_DOWN(oprsz, 32); 1470 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1471 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1472 if (some == oprsz) { 1473 break; 1474 } 1475 dofs += some; 1476 aofs += some; 1477 bofs += some; 1478 cofs += some; 1479 oprsz -= some; 1480 maxsz -= some; 1481 /* fallthru */ 1482 case TCG_TYPE_V128: 1483 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1484 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1485 break; 1486 case TCG_TYPE_V64: 1487 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1488 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1489 break; 1490 1491 case 0: 1492 if (g->fni8 && check_size_impl(oprsz, 8)) { 1493 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1494 g->write_aofs, g->fni8); 1495 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1496 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1497 g->write_aofs, g->fni4); 1498 } else { 1499 assert(g->fno != NULL); 1500 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1501 oprsz, maxsz, g->data, g->fno); 1502 oprsz = maxsz; 1503 } 1504 break; 1505 1506 default: 1507 g_assert_not_reached(); 1508 } 1509 tcg_swap_vecop_list(hold_list); 1510 1511 if (oprsz < maxsz) { 1512 expand_clr(dofs + oprsz, maxsz - oprsz); 1513 } 1514 } 1515 1516 /* 1517 * Expand specific vector operations. 1518 */ 1519 1520 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1521 { 1522 tcg_gen_mov_vec(a, b); 1523 } 1524 1525 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1526 uint32_t oprsz, uint32_t maxsz) 1527 { 1528 static const GVecGen2 g = { 1529 .fni8 = tcg_gen_mov_i64, 1530 .fniv = vec_mov2, 1531 .fno = gen_helper_gvec_mov, 1532 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1533 }; 1534 if (dofs != aofs) { 1535 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1536 } else { 1537 check_size_align(oprsz, maxsz, dofs); 1538 if (oprsz < maxsz) { 1539 expand_clr(dofs + oprsz, maxsz - oprsz); 1540 } 1541 } 1542 } 1543 1544 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1545 uint32_t maxsz, TCGv_i32 in) 1546 { 1547 check_size_align(oprsz, maxsz, dofs); 1548 tcg_debug_assert(vece <= MO_32); 1549 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1550 } 1551 1552 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1553 uint32_t maxsz, TCGv_i64 in) 1554 { 1555 check_size_align(oprsz, maxsz, dofs); 1556 tcg_debug_assert(vece <= MO_64); 1557 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1558 } 1559 1560 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1561 uint32_t oprsz, uint32_t maxsz) 1562 { 1563 check_size_align(oprsz, maxsz, dofs); 1564 if (vece <= MO_64) { 1565 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1566 if (type != 0) { 1567 TCGv_vec t_vec = tcg_temp_new_vec(type); 1568 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1569 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1570 tcg_temp_free_vec(t_vec); 1571 } else if (vece <= MO_32) { 1572 TCGv_i32 in = tcg_temp_new_i32(); 1573 switch (vece) { 1574 case MO_8: 1575 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1576 break; 1577 case MO_16: 1578 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1579 break; 1580 default: 1581 tcg_gen_ld_i32(in, cpu_env, aofs); 1582 break; 1583 } 1584 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1585 tcg_temp_free_i32(in); 1586 } else { 1587 TCGv_i64 in = tcg_temp_new_i64(); 1588 tcg_gen_ld_i64(in, cpu_env, aofs); 1589 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1590 tcg_temp_free_i64(in); 1591 } 1592 } else if (vece == 4) { 1593 /* 128-bit 
duplicate. */ 1594 int i; 1595 1596 tcg_debug_assert(oprsz >= 16); 1597 if (TCG_TARGET_HAS_v128) { 1598 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1599 1600 tcg_gen_ld_vec(in, cpu_env, aofs); 1601 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1602 tcg_gen_st_vec(in, cpu_env, dofs + i); 1603 } 1604 tcg_temp_free_vec(in); 1605 } else { 1606 TCGv_i64 in0 = tcg_temp_new_i64(); 1607 TCGv_i64 in1 = tcg_temp_new_i64(); 1608 1609 tcg_gen_ld_i64(in0, cpu_env, aofs); 1610 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1611 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1612 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1613 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1614 } 1615 tcg_temp_free_i64(in0); 1616 tcg_temp_free_i64(in1); 1617 } 1618 if (oprsz < maxsz) { 1619 expand_clr(dofs + oprsz, maxsz - oprsz); 1620 } 1621 } else if (vece == 5) { 1622 /* 256-bit duplicate. */ 1623 int i; 1624 1625 tcg_debug_assert(oprsz >= 32); 1626 tcg_debug_assert(oprsz % 32 == 0); 1627 if (TCG_TARGET_HAS_v256) { 1628 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256); 1629 1630 tcg_gen_ld_vec(in, cpu_env, aofs); 1631 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1632 tcg_gen_st_vec(in, cpu_env, dofs + i); 1633 } 1634 tcg_temp_free_vec(in); 1635 } else if (TCG_TARGET_HAS_v128) { 1636 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128); 1637 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128); 1638 1639 tcg_gen_ld_vec(in0, cpu_env, aofs); 1640 tcg_gen_ld_vec(in1, cpu_env, aofs + 16); 1641 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1642 tcg_gen_st_vec(in0, cpu_env, dofs + i); 1643 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16); 1644 } 1645 tcg_temp_free_vec(in0); 1646 tcg_temp_free_vec(in1); 1647 } else { 1648 TCGv_i64 in[4]; 1649 int j; 1650 1651 for (j = 0; j < 4; ++j) { 1652 in[j] = tcg_temp_new_i64(); 1653 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8); 1654 } 1655 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1656 for (j = 0; j < 4; ++j) { 1657 tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8); 1658 } 1659 } 1660 for (j = 0; j < 4; ++j) { 1661 tcg_temp_free_i64(in[j]); 1662 } 1663 } 1664 if (oprsz < maxsz) { 1665 expand_clr(dofs + oprsz, maxsz - oprsz); 1666 } 1667 } else { 1668 g_assert_not_reached(); 1669 } 1670 } 1671 1672 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1673 uint32_t maxsz, uint64_t x) 1674 { 1675 check_size_align(oprsz, maxsz, dofs); 1676 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x); 1677 } 1678 1679 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1680 uint32_t oprsz, uint32_t maxsz) 1681 { 1682 static const GVecGen2 g = { 1683 .fni8 = tcg_gen_not_i64, 1684 .fniv = tcg_gen_not_vec, 1685 .fno = gen_helper_gvec_not, 1686 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1687 }; 1688 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1689 } 1690 1691 /* Perform a vector addition using normal addition and a mask. The mask 1692 should be the sign bit of each lane. This 6-operation form is more 1693 efficient than separate additions when there are 4 or more lanes in 1694 the 64-bit operation. 
*/ 1695 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1696 { 1697 TCGv_i64 t1 = tcg_temp_new_i64(); 1698 TCGv_i64 t2 = tcg_temp_new_i64(); 1699 TCGv_i64 t3 = tcg_temp_new_i64(); 1700 1701 tcg_gen_andc_i64(t1, a, m); 1702 tcg_gen_andc_i64(t2, b, m); 1703 tcg_gen_xor_i64(t3, a, b); 1704 tcg_gen_add_i64(d, t1, t2); 1705 tcg_gen_and_i64(t3, t3, m); 1706 tcg_gen_xor_i64(d, d, t3); 1707 1708 tcg_temp_free_i64(t1); 1709 tcg_temp_free_i64(t2); 1710 tcg_temp_free_i64(t3); 1711 } 1712 1713 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1714 { 1715 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1716 gen_addv_mask(d, a, b, m); 1717 tcg_temp_free_i64(m); 1718 } 1719 1720 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1721 { 1722 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1723 gen_addv_mask(d, a, b, m); 1724 tcg_temp_free_i64(m); 1725 } 1726 1727 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1728 { 1729 TCGv_i64 t1 = tcg_temp_new_i64(); 1730 TCGv_i64 t2 = tcg_temp_new_i64(); 1731 1732 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1733 tcg_gen_add_i64(t2, a, b); 1734 tcg_gen_add_i64(t1, t1, b); 1735 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1736 1737 tcg_temp_free_i64(t1); 1738 tcg_temp_free_i64(t2); 1739 } 1740 1741 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1742 1743 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1744 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1745 { 1746 static const GVecGen3 g[4] = { 1747 { .fni8 = tcg_gen_vec_add8_i64, 1748 .fniv = tcg_gen_add_vec, 1749 .fno = gen_helper_gvec_add8, 1750 .opt_opc = vecop_list_add, 1751 .vece = MO_8 }, 1752 { .fni8 = tcg_gen_vec_add16_i64, 1753 .fniv = tcg_gen_add_vec, 1754 .fno = gen_helper_gvec_add16, 1755 .opt_opc = vecop_list_add, 1756 .vece = MO_16 }, 1757 { .fni4 = tcg_gen_add_i32, 1758 .fniv = tcg_gen_add_vec, 1759 .fno = gen_helper_gvec_add32, 1760 .opt_opc = vecop_list_add, 1761 .vece = MO_32 }, 1762 { .fni8 = tcg_gen_add_i64, 1763 .fniv = tcg_gen_add_vec, 1764 .fno = gen_helper_gvec_add64, 1765 .opt_opc = vecop_list_add, 1766 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1767 .vece = MO_64 }, 1768 }; 1769 1770 tcg_debug_assert(vece <= MO_64); 1771 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1772 } 1773 1774 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1775 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1776 { 1777 static const GVecGen2s g[4] = { 1778 { .fni8 = tcg_gen_vec_add8_i64, 1779 .fniv = tcg_gen_add_vec, 1780 .fno = gen_helper_gvec_adds8, 1781 .opt_opc = vecop_list_add, 1782 .vece = MO_8 }, 1783 { .fni8 = tcg_gen_vec_add16_i64, 1784 .fniv = tcg_gen_add_vec, 1785 .fno = gen_helper_gvec_adds16, 1786 .opt_opc = vecop_list_add, 1787 .vece = MO_16 }, 1788 { .fni4 = tcg_gen_add_i32, 1789 .fniv = tcg_gen_add_vec, 1790 .fno = gen_helper_gvec_adds32, 1791 .opt_opc = vecop_list_add, 1792 .vece = MO_32 }, 1793 { .fni8 = tcg_gen_add_i64, 1794 .fniv = tcg_gen_add_vec, 1795 .fno = gen_helper_gvec_adds64, 1796 .opt_opc = vecop_list_add, 1797 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1798 .vece = MO_64 }, 1799 }; 1800 1801 tcg_debug_assert(vece <= MO_64); 1802 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1803 } 1804 1805 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1806 int64_t c, uint32_t oprsz, uint32_t maxsz) 1807 { 1808 TCGv_i64 tmp = tcg_const_i64(c); 1809 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1810 tcg_temp_free_i64(tmp); 1811 } 1812 1813 
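
/*
 * Worked example of the masked-add trick in gen_addv_mask() above, for
 * MO_8 lanes (one hypothetical input pattern; the reasoning is per lane):
 * with m = dup_const(MO_8, 0x80), a = 0xff in every byte and b = 0x01 in
 * every byte, a plain 64-bit addition would carry across lane boundaries.
 * The masked form instead computes:
 *
 *     t1 = a & ~m        -> 0x7f per byte
 *     t2 = b & ~m        -> 0x01 per byte
 *     d  = t1 + t2       -> 0x80 per byte  (no cross-lane carry possible)
 *     t3 = (a ^ b) & m   -> 0x80 per byte
 *     d  = d ^ t3        -> 0x00 per byte  (the correct wrapped byte sums)
 *
 * Clearing the sign bit of each lane before the single wide addition keeps
 * carries from crossing lane boundaries; the final xor restores each lane's
 * true top bit.  gen_subv_mask() below applies the same idea to subtraction.
 */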
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 1814 1815 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1816 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1817 { 1818 static const GVecGen2s g[4] = { 1819 { .fni8 = tcg_gen_vec_sub8_i64, 1820 .fniv = tcg_gen_sub_vec, 1821 .fno = gen_helper_gvec_subs8, 1822 .opt_opc = vecop_list_sub, 1823 .vece = MO_8 }, 1824 { .fni8 = tcg_gen_vec_sub16_i64, 1825 .fniv = tcg_gen_sub_vec, 1826 .fno = gen_helper_gvec_subs16, 1827 .opt_opc = vecop_list_sub, 1828 .vece = MO_16 }, 1829 { .fni4 = tcg_gen_sub_i32, 1830 .fniv = tcg_gen_sub_vec, 1831 .fno = gen_helper_gvec_subs32, 1832 .opt_opc = vecop_list_sub, 1833 .vece = MO_32 }, 1834 { .fni8 = tcg_gen_sub_i64, 1835 .fniv = tcg_gen_sub_vec, 1836 .fno = gen_helper_gvec_subs64, 1837 .opt_opc = vecop_list_sub, 1838 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1839 .vece = MO_64 }, 1840 }; 1841 1842 tcg_debug_assert(vece <= MO_64); 1843 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1844 } 1845 1846 /* Perform a vector subtraction using normal subtraction and a mask. 1847 Compare gen_addv_mask above. */ 1848 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1849 { 1850 TCGv_i64 t1 = tcg_temp_new_i64(); 1851 TCGv_i64 t2 = tcg_temp_new_i64(); 1852 TCGv_i64 t3 = tcg_temp_new_i64(); 1853 1854 tcg_gen_or_i64(t1, a, m); 1855 tcg_gen_andc_i64(t2, b, m); 1856 tcg_gen_eqv_i64(t3, a, b); 1857 tcg_gen_sub_i64(d, t1, t2); 1858 tcg_gen_and_i64(t3, t3, m); 1859 tcg_gen_xor_i64(d, d, t3); 1860 1861 tcg_temp_free_i64(t1); 1862 tcg_temp_free_i64(t2); 1863 tcg_temp_free_i64(t3); 1864 } 1865 1866 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1867 { 1868 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1869 gen_subv_mask(d, a, b, m); 1870 tcg_temp_free_i64(m); 1871 } 1872 1873 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1874 { 1875 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1876 gen_subv_mask(d, a, b, m); 1877 tcg_temp_free_i64(m); 1878 } 1879 1880 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1881 { 1882 TCGv_i64 t1 = tcg_temp_new_i64(); 1883 TCGv_i64 t2 = tcg_temp_new_i64(); 1884 1885 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1886 tcg_gen_sub_i64(t2, a, b); 1887 tcg_gen_sub_i64(t1, a, t1); 1888 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1889 1890 tcg_temp_free_i64(t1); 1891 tcg_temp_free_i64(t2); 1892 } 1893 1894 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1895 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1896 { 1897 static const GVecGen3 g[4] = { 1898 { .fni8 = tcg_gen_vec_sub8_i64, 1899 .fniv = tcg_gen_sub_vec, 1900 .fno = gen_helper_gvec_sub8, 1901 .opt_opc = vecop_list_sub, 1902 .vece = MO_8 }, 1903 { .fni8 = tcg_gen_vec_sub16_i64, 1904 .fniv = tcg_gen_sub_vec, 1905 .fno = gen_helper_gvec_sub16, 1906 .opt_opc = vecop_list_sub, 1907 .vece = MO_16 }, 1908 { .fni4 = tcg_gen_sub_i32, 1909 .fniv = tcg_gen_sub_vec, 1910 .fno = gen_helper_gvec_sub32, 1911 .opt_opc = vecop_list_sub, 1912 .vece = MO_32 }, 1913 { .fni8 = tcg_gen_sub_i64, 1914 .fniv = tcg_gen_sub_vec, 1915 .fno = gen_helper_gvec_sub64, 1916 .opt_opc = vecop_list_sub, 1917 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1918 .vece = MO_64 }, 1919 }; 1920 1921 tcg_debug_assert(vece <= MO_64); 1922 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1923 } 1924 1925 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 1926 1927 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1928 uint32_t bofs, 
uint32_t oprsz, uint32_t maxsz) 1929 { 1930 static const GVecGen3 g[4] = { 1931 { .fniv = tcg_gen_mul_vec, 1932 .fno = gen_helper_gvec_mul8, 1933 .opt_opc = vecop_list_mul, 1934 .vece = MO_8 }, 1935 { .fniv = tcg_gen_mul_vec, 1936 .fno = gen_helper_gvec_mul16, 1937 .opt_opc = vecop_list_mul, 1938 .vece = MO_16 }, 1939 { .fni4 = tcg_gen_mul_i32, 1940 .fniv = tcg_gen_mul_vec, 1941 .fno = gen_helper_gvec_mul32, 1942 .opt_opc = vecop_list_mul, 1943 .vece = MO_32 }, 1944 { .fni8 = tcg_gen_mul_i64, 1945 .fniv = tcg_gen_mul_vec, 1946 .fno = gen_helper_gvec_mul64, 1947 .opt_opc = vecop_list_mul, 1948 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1949 .vece = MO_64 }, 1950 }; 1951 1952 tcg_debug_assert(vece <= MO_64); 1953 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1954 } 1955 1956 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1957 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1958 { 1959 static const GVecGen2s g[4] = { 1960 { .fniv = tcg_gen_mul_vec, 1961 .fno = gen_helper_gvec_muls8, 1962 .opt_opc = vecop_list_mul, 1963 .vece = MO_8 }, 1964 { .fniv = tcg_gen_mul_vec, 1965 .fno = gen_helper_gvec_muls16, 1966 .opt_opc = vecop_list_mul, 1967 .vece = MO_16 }, 1968 { .fni4 = tcg_gen_mul_i32, 1969 .fniv = tcg_gen_mul_vec, 1970 .fno = gen_helper_gvec_muls32, 1971 .opt_opc = vecop_list_mul, 1972 .vece = MO_32 }, 1973 { .fni8 = tcg_gen_mul_i64, 1974 .fniv = tcg_gen_mul_vec, 1975 .fno = gen_helper_gvec_muls64, 1976 .opt_opc = vecop_list_mul, 1977 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1978 .vece = MO_64 }, 1979 }; 1980 1981 tcg_debug_assert(vece <= MO_64); 1982 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1983 } 1984 1985 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1986 int64_t c, uint32_t oprsz, uint32_t maxsz) 1987 { 1988 TCGv_i64 tmp = tcg_const_i64(c); 1989 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1990 tcg_temp_free_i64(tmp); 1991 } 1992 1993 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1994 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1995 { 1996 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 1997 static const GVecGen3 g[4] = { 1998 { .fniv = tcg_gen_ssadd_vec, 1999 .fno = gen_helper_gvec_ssadd8, 2000 .opt_opc = vecop_list, 2001 .vece = MO_8 }, 2002 { .fniv = tcg_gen_ssadd_vec, 2003 .fno = gen_helper_gvec_ssadd16, 2004 .opt_opc = vecop_list, 2005 .vece = MO_16 }, 2006 { .fniv = tcg_gen_ssadd_vec, 2007 .fno = gen_helper_gvec_ssadd32, 2008 .opt_opc = vecop_list, 2009 .vece = MO_32 }, 2010 { .fniv = tcg_gen_ssadd_vec, 2011 .fno = gen_helper_gvec_ssadd64, 2012 .opt_opc = vecop_list, 2013 .vece = MO_64 }, 2014 }; 2015 tcg_debug_assert(vece <= MO_64); 2016 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2017 } 2018 2019 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2020 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2021 { 2022 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2023 static const GVecGen3 g[4] = { 2024 { .fniv = tcg_gen_sssub_vec, 2025 .fno = gen_helper_gvec_sssub8, 2026 .opt_opc = vecop_list, 2027 .vece = MO_8 }, 2028 { .fniv = tcg_gen_sssub_vec, 2029 .fno = gen_helper_gvec_sssub16, 2030 .opt_opc = vecop_list, 2031 .vece = MO_16 }, 2032 { .fniv = tcg_gen_sssub_vec, 2033 .fno = gen_helper_gvec_sssub32, 2034 .opt_opc = vecop_list, 2035 .vece = MO_32 }, 2036 { .fniv = tcg_gen_sssub_vec, 2037 .fno = gen_helper_gvec_sssub64, 2038 .opt_opc = vecop_list, 2039 .vece = MO_64 }, 2040 }; 2041 tcg_debug_assert(vece <= MO_64); 2042 
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2043 } 2044 2045 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2046 { 2047 TCGv_i32 max = tcg_const_i32(-1); 2048 tcg_gen_add_i32(d, a, b); 2049 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2050 tcg_temp_free_i32(max); 2051 } 2052 2053 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2054 { 2055 TCGv_i64 max = tcg_const_i64(-1); 2056 tcg_gen_add_i64(d, a, b); 2057 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2058 tcg_temp_free_i64(max); 2059 } 2060 2061 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2062 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2063 { 2064 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2065 static const GVecGen3 g[4] = { 2066 { .fniv = tcg_gen_usadd_vec, 2067 .fno = gen_helper_gvec_usadd8, 2068 .opt_opc = vecop_list, 2069 .vece = MO_8 }, 2070 { .fniv = tcg_gen_usadd_vec, 2071 .fno = gen_helper_gvec_usadd16, 2072 .opt_opc = vecop_list, 2073 .vece = MO_16 }, 2074 { .fni4 = tcg_gen_usadd_i32, 2075 .fniv = tcg_gen_usadd_vec, 2076 .fno = gen_helper_gvec_usadd32, 2077 .opt_opc = vecop_list, 2078 .vece = MO_32 }, 2079 { .fni8 = tcg_gen_usadd_i64, 2080 .fniv = tcg_gen_usadd_vec, 2081 .fno = gen_helper_gvec_usadd64, 2082 .opt_opc = vecop_list, 2083 .vece = MO_64 } 2084 }; 2085 tcg_debug_assert(vece <= MO_64); 2086 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2087 } 2088 2089 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2090 { 2091 TCGv_i32 min = tcg_const_i32(0); 2092 tcg_gen_sub_i32(d, a, b); 2093 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2094 tcg_temp_free_i32(min); 2095 } 2096 2097 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2098 { 2099 TCGv_i64 min = tcg_const_i64(0); 2100 tcg_gen_sub_i64(d, a, b); 2101 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2102 tcg_temp_free_i64(min); 2103 } 2104 2105 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2106 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2107 { 2108 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2109 static const GVecGen3 g[4] = { 2110 { .fniv = tcg_gen_ussub_vec, 2111 .fno = gen_helper_gvec_ussub8, 2112 .opt_opc = vecop_list, 2113 .vece = MO_8 }, 2114 { .fniv = tcg_gen_ussub_vec, 2115 .fno = gen_helper_gvec_ussub16, 2116 .opt_opc = vecop_list, 2117 .vece = MO_16 }, 2118 { .fni4 = tcg_gen_ussub_i32, 2119 .fniv = tcg_gen_ussub_vec, 2120 .fno = gen_helper_gvec_ussub32, 2121 .opt_opc = vecop_list, 2122 .vece = MO_32 }, 2123 { .fni8 = tcg_gen_ussub_i64, 2124 .fniv = tcg_gen_ussub_vec, 2125 .fno = gen_helper_gvec_ussub64, 2126 .opt_opc = vecop_list, 2127 .vece = MO_64 } 2128 }; 2129 tcg_debug_assert(vece <= MO_64); 2130 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2131 } 2132 2133 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2134 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2135 { 2136 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2137 static const GVecGen3 g[4] = { 2138 { .fniv = tcg_gen_smin_vec, 2139 .fno = gen_helper_gvec_smin8, 2140 .opt_opc = vecop_list, 2141 .vece = MO_8 }, 2142 { .fniv = tcg_gen_smin_vec, 2143 .fno = gen_helper_gvec_smin16, 2144 .opt_opc = vecop_list, 2145 .vece = MO_16 }, 2146 { .fni4 = tcg_gen_smin_i32, 2147 .fniv = tcg_gen_smin_vec, 2148 .fno = gen_helper_gvec_smin32, 2149 .opt_opc = vecop_list, 2150 .vece = MO_32 }, 2151 { .fni8 = tcg_gen_smin_i64, 2152 .fniv = tcg_gen_smin_vec, 2153 
.fno = gen_helper_gvec_smin64, 2154 .opt_opc = vecop_list, 2155 .vece = MO_64 } 2156 }; 2157 tcg_debug_assert(vece <= MO_64); 2158 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2159 } 2160 2161 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2162 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2163 { 2164 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2165 static const GVecGen3 g[4] = { 2166 { .fniv = tcg_gen_umin_vec, 2167 .fno = gen_helper_gvec_umin8, 2168 .opt_opc = vecop_list, 2169 .vece = MO_8 }, 2170 { .fniv = tcg_gen_umin_vec, 2171 .fno = gen_helper_gvec_umin16, 2172 .opt_opc = vecop_list, 2173 .vece = MO_16 }, 2174 { .fni4 = tcg_gen_umin_i32, 2175 .fniv = tcg_gen_umin_vec, 2176 .fno = gen_helper_gvec_umin32, 2177 .opt_opc = vecop_list, 2178 .vece = MO_32 }, 2179 { .fni8 = tcg_gen_umin_i64, 2180 .fniv = tcg_gen_umin_vec, 2181 .fno = gen_helper_gvec_umin64, 2182 .opt_opc = vecop_list, 2183 .vece = MO_64 } 2184 }; 2185 tcg_debug_assert(vece <= MO_64); 2186 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2187 } 2188 2189 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2190 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2191 { 2192 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2193 static const GVecGen3 g[4] = { 2194 { .fniv = tcg_gen_smax_vec, 2195 .fno = gen_helper_gvec_smax8, 2196 .opt_opc = vecop_list, 2197 .vece = MO_8 }, 2198 { .fniv = tcg_gen_smax_vec, 2199 .fno = gen_helper_gvec_smax16, 2200 .opt_opc = vecop_list, 2201 .vece = MO_16 }, 2202 { .fni4 = tcg_gen_smax_i32, 2203 .fniv = tcg_gen_smax_vec, 2204 .fno = gen_helper_gvec_smax32, 2205 .opt_opc = vecop_list, 2206 .vece = MO_32 }, 2207 { .fni8 = tcg_gen_smax_i64, 2208 .fniv = tcg_gen_smax_vec, 2209 .fno = gen_helper_gvec_smax64, 2210 .opt_opc = vecop_list, 2211 .vece = MO_64 } 2212 }; 2213 tcg_debug_assert(vece <= MO_64); 2214 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2215 } 2216 2217 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2218 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2219 { 2220 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2221 static const GVecGen3 g[4] = { 2222 { .fniv = tcg_gen_umax_vec, 2223 .fno = gen_helper_gvec_umax8, 2224 .opt_opc = vecop_list, 2225 .vece = MO_8 }, 2226 { .fniv = tcg_gen_umax_vec, 2227 .fno = gen_helper_gvec_umax16, 2228 .opt_opc = vecop_list, 2229 .vece = MO_16 }, 2230 { .fni4 = tcg_gen_umax_i32, 2231 .fniv = tcg_gen_umax_vec, 2232 .fno = gen_helper_gvec_umax32, 2233 .opt_opc = vecop_list, 2234 .vece = MO_32 }, 2235 { .fni8 = tcg_gen_umax_i64, 2236 .fniv = tcg_gen_umax_vec, 2237 .fno = gen_helper_gvec_umax64, 2238 .opt_opc = vecop_list, 2239 .vece = MO_64 } 2240 }; 2241 tcg_debug_assert(vece <= MO_64); 2242 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2243 } 2244 2245 /* Perform a vector negation using normal negation and a mask. 2246 Compare gen_subv_mask above. 
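   Illustrative check (added for clarity): with MO_8 lanes, m = 0x80 and a
   lane value b = 0x02, t2 = b & ~m = 0x02, t3 = m & ~b = 0x80, and
   d = (m - t2) ^ t3 = 0x7e ^ 0x80 = 0xfe, i.e. -2.  Since every lane of t2
   is below 0x80, the per-lane subtraction m - t2 never borrows from the
   lane above.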
*/ 2247 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2248 { 2249 TCGv_i64 t2 = tcg_temp_new_i64(); 2250 TCGv_i64 t3 = tcg_temp_new_i64(); 2251 2252 tcg_gen_andc_i64(t3, m, b); 2253 tcg_gen_andc_i64(t2, b, m); 2254 tcg_gen_sub_i64(d, m, t2); 2255 tcg_gen_xor_i64(d, d, t3); 2256 2257 tcg_temp_free_i64(t2); 2258 tcg_temp_free_i64(t3); 2259 } 2260 2261 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2262 { 2263 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2264 gen_negv_mask(d, b, m); 2265 tcg_temp_free_i64(m); 2266 } 2267 2268 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2269 { 2270 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2271 gen_negv_mask(d, b, m); 2272 tcg_temp_free_i64(m); 2273 } 2274 2275 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2276 { 2277 TCGv_i64 t1 = tcg_temp_new_i64(); 2278 TCGv_i64 t2 = tcg_temp_new_i64(); 2279 2280 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2281 tcg_gen_neg_i64(t2, b); 2282 tcg_gen_neg_i64(t1, t1); 2283 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2284 2285 tcg_temp_free_i64(t1); 2286 tcg_temp_free_i64(t2); 2287 } 2288 2289 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2290 uint32_t oprsz, uint32_t maxsz) 2291 { 2292 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2293 static const GVecGen2 g[4] = { 2294 { .fni8 = tcg_gen_vec_neg8_i64, 2295 .fniv = tcg_gen_neg_vec, 2296 .fno = gen_helper_gvec_neg8, 2297 .opt_opc = vecop_list, 2298 .vece = MO_8 }, 2299 { .fni8 = tcg_gen_vec_neg16_i64, 2300 .fniv = tcg_gen_neg_vec, 2301 .fno = gen_helper_gvec_neg16, 2302 .opt_opc = vecop_list, 2303 .vece = MO_16 }, 2304 { .fni4 = tcg_gen_neg_i32, 2305 .fniv = tcg_gen_neg_vec, 2306 .fno = gen_helper_gvec_neg32, 2307 .opt_opc = vecop_list, 2308 .vece = MO_32 }, 2309 { .fni8 = tcg_gen_neg_i64, 2310 .fniv = tcg_gen_neg_vec, 2311 .fno = gen_helper_gvec_neg64, 2312 .opt_opc = vecop_list, 2313 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2314 .vece = MO_64 }, 2315 }; 2316 2317 tcg_debug_assert(vece <= MO_64); 2318 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2319 } 2320 2321 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2322 { 2323 TCGv_i64 t = tcg_temp_new_i64(); 2324 int nbit = 8 << vece; 2325 2326 /* Create -1 for each negative element. */ 2327 tcg_gen_shri_i64(t, b, nbit - 1); 2328 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2329 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2330 2331 /* 2332 * Invert (via xor -1) and add one. 2333 * Because of the ordering the msb is cleared, 2334 * so we never have carry into the next element. 
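 * Illustrative check (added for clarity): for MO_8 with a lane value
 * b = 0xfe (-2), the multiply above produces t = 0xff in that lane;
 * b ^ t = 0x01, and adding back the low bit of t gives 0x02 = abs(-2).
 * Non-negative lanes have t = 0x00 and pass through unchanged.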
2335 */ 2336 tcg_gen_xor_i64(d, b, t); 2337 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2338 tcg_gen_add_i64(d, d, t); 2339 2340 tcg_temp_free_i64(t); 2341 } 2342 2343 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2344 { 2345 gen_absv_mask(d, b, MO_8); 2346 } 2347 2348 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2349 { 2350 gen_absv_mask(d, b, MO_16); 2351 } 2352 2353 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2354 uint32_t oprsz, uint32_t maxsz) 2355 { 2356 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2357 static const GVecGen2 g[4] = { 2358 { .fni8 = tcg_gen_vec_abs8_i64, 2359 .fniv = tcg_gen_abs_vec, 2360 .fno = gen_helper_gvec_abs8, 2361 .opt_opc = vecop_list, 2362 .vece = MO_8 }, 2363 { .fni8 = tcg_gen_vec_abs16_i64, 2364 .fniv = tcg_gen_abs_vec, 2365 .fno = gen_helper_gvec_abs16, 2366 .opt_opc = vecop_list, 2367 .vece = MO_16 }, 2368 { .fni4 = tcg_gen_abs_i32, 2369 .fniv = tcg_gen_abs_vec, 2370 .fno = gen_helper_gvec_abs32, 2371 .opt_opc = vecop_list, 2372 .vece = MO_32 }, 2373 { .fni8 = tcg_gen_abs_i64, 2374 .fniv = tcg_gen_abs_vec, 2375 .fno = gen_helper_gvec_abs64, 2376 .opt_opc = vecop_list, 2377 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2378 .vece = MO_64 }, 2379 }; 2380 2381 tcg_debug_assert(vece <= MO_64); 2382 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2383 } 2384 2385 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2386 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2387 { 2388 static const GVecGen3 g = { 2389 .fni8 = tcg_gen_and_i64, 2390 .fniv = tcg_gen_and_vec, 2391 .fno = gen_helper_gvec_and, 2392 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2393 }; 2394 2395 if (aofs == bofs) { 2396 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2397 } else { 2398 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2399 } 2400 } 2401 2402 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2403 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2404 { 2405 static const GVecGen3 g = { 2406 .fni8 = tcg_gen_or_i64, 2407 .fniv = tcg_gen_or_vec, 2408 .fno = gen_helper_gvec_or, 2409 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2410 }; 2411 2412 if (aofs == bofs) { 2413 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2414 } else { 2415 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2416 } 2417 } 2418 2419 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2420 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2421 { 2422 static const GVecGen3 g = { 2423 .fni8 = tcg_gen_xor_i64, 2424 .fniv = tcg_gen_xor_vec, 2425 .fno = gen_helper_gvec_xor, 2426 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2427 }; 2428 2429 if (aofs == bofs) { 2430 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2431 } else { 2432 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2433 } 2434 } 2435 2436 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2437 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2438 { 2439 static const GVecGen3 g = { 2440 .fni8 = tcg_gen_andc_i64, 2441 .fniv = tcg_gen_andc_vec, 2442 .fno = gen_helper_gvec_andc, 2443 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2444 }; 2445 2446 if (aofs == bofs) { 2447 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2448 } else { 2449 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2450 } 2451 } 2452 2453 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2454 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2455 { 2456 static const GVecGen3 g = { 2457 .fni8 = tcg_gen_orc_i64, 2458 .fniv = tcg_gen_orc_vec, 2459 
.fno = gen_helper_gvec_orc, 2460 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2461 }; 2462 2463 if (aofs == bofs) { 2464 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2465 } else { 2466 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2467 } 2468 } 2469 2470 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2471 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2472 { 2473 static const GVecGen3 g = { 2474 .fni8 = tcg_gen_nand_i64, 2475 .fniv = tcg_gen_nand_vec, 2476 .fno = gen_helper_gvec_nand, 2477 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2478 }; 2479 2480 if (aofs == bofs) { 2481 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2482 } else { 2483 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2484 } 2485 } 2486 2487 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2488 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2489 { 2490 static const GVecGen3 g = { 2491 .fni8 = tcg_gen_nor_i64, 2492 .fniv = tcg_gen_nor_vec, 2493 .fno = gen_helper_gvec_nor, 2494 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2495 }; 2496 2497 if (aofs == bofs) { 2498 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2499 } else { 2500 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2501 } 2502 } 2503 2504 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2505 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2506 { 2507 static const GVecGen3 g = { 2508 .fni8 = tcg_gen_eqv_i64, 2509 .fniv = tcg_gen_eqv_vec, 2510 .fno = gen_helper_gvec_eqv, 2511 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2512 }; 2513 2514 if (aofs == bofs) { 2515 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2516 } else { 2517 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2518 } 2519 } 2520 2521 static const GVecGen2s gop_ands = { 2522 .fni8 = tcg_gen_and_i64, 2523 .fniv = tcg_gen_and_vec, 2524 .fno = gen_helper_gvec_ands, 2525 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2526 .vece = MO_64 2527 }; 2528 2529 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2530 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2531 { 2532 TCGv_i64 tmp = tcg_temp_new_i64(); 2533 gen_dup_i64(vece, tmp, c); 2534 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2535 tcg_temp_free_i64(tmp); 2536 } 2537 2538 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2539 int64_t c, uint32_t oprsz, uint32_t maxsz) 2540 { 2541 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2542 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2543 tcg_temp_free_i64(tmp); 2544 } 2545 2546 static const GVecGen2s gop_xors = { 2547 .fni8 = tcg_gen_xor_i64, 2548 .fniv = tcg_gen_xor_vec, 2549 .fno = gen_helper_gvec_xors, 2550 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2551 .vece = MO_64 2552 }; 2553 2554 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2555 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2556 { 2557 TCGv_i64 tmp = tcg_temp_new_i64(); 2558 gen_dup_i64(vece, tmp, c); 2559 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2560 tcg_temp_free_i64(tmp); 2561 } 2562 2563 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2564 int64_t c, uint32_t oprsz, uint32_t maxsz) 2565 { 2566 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2567 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2568 tcg_temp_free_i64(tmp); 2569 } 2570 2571 static const GVecGen2s gop_ors = { 2572 .fni8 = tcg_gen_or_i64, 2573 .fniv = tcg_gen_or_vec, 2574 .fno = gen_helper_gvec_ors, 2575 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2576 .vece = MO_64 2577 }; 2578 2579 
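/*
 * Usage sketch (added for clarity; the env offset below is hypothetical):
 * the andi/xori/ori immediate forms replicate their constant across every
 * element via dup_const before reusing the MO_64 expanders, so callers pass
 * a per-element value.  A front end clearing the top nibble of each byte in
 * a 16-byte register at env offset vreg_ofs might emit:
 *
 *   tcg_gen_gvec_andi(MO_8, vreg_ofs, vreg_ofs, 0x0f, 16, 16);
 *
 * which is equivalent to and-ing the whole operand with the 64-bit
 * constant 0x0f0f0f0f0f0f0f0f.
 */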
void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2580 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2581 { 2582 TCGv_i64 tmp = tcg_temp_new_i64(); 2583 gen_dup_i64(vece, tmp, c); 2584 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2585 tcg_temp_free_i64(tmp); 2586 } 2587 2588 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2589 int64_t c, uint32_t oprsz, uint32_t maxsz) 2590 { 2591 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2592 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2593 tcg_temp_free_i64(tmp); 2594 } 2595 2596 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2597 { 2598 uint64_t mask = dup_const(MO_8, 0xff << c); 2599 tcg_gen_shli_i64(d, a, c); 2600 tcg_gen_andi_i64(d, d, mask); 2601 } 2602 2603 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2604 { 2605 uint64_t mask = dup_const(MO_16, 0xffff << c); 2606 tcg_gen_shli_i64(d, a, c); 2607 tcg_gen_andi_i64(d, d, mask); 2608 } 2609 2610 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2611 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2612 { 2613 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2614 static const GVecGen2i g[4] = { 2615 { .fni8 = tcg_gen_vec_shl8i_i64, 2616 .fniv = tcg_gen_shli_vec, 2617 .fno = gen_helper_gvec_shl8i, 2618 .opt_opc = vecop_list, 2619 .vece = MO_8 }, 2620 { .fni8 = tcg_gen_vec_shl16i_i64, 2621 .fniv = tcg_gen_shli_vec, 2622 .fno = gen_helper_gvec_shl16i, 2623 .opt_opc = vecop_list, 2624 .vece = MO_16 }, 2625 { .fni4 = tcg_gen_shli_i32, 2626 .fniv = tcg_gen_shli_vec, 2627 .fno = gen_helper_gvec_shl32i, 2628 .opt_opc = vecop_list, 2629 .vece = MO_32 }, 2630 { .fni8 = tcg_gen_shli_i64, 2631 .fniv = tcg_gen_shli_vec, 2632 .fno = gen_helper_gvec_shl64i, 2633 .opt_opc = vecop_list, 2634 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2635 .vece = MO_64 }, 2636 }; 2637 2638 tcg_debug_assert(vece <= MO_64); 2639 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2640 if (shift == 0) { 2641 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2642 } else { 2643 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2644 } 2645 } 2646 2647 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2648 { 2649 uint64_t mask = dup_const(MO_8, 0xff >> c); 2650 tcg_gen_shri_i64(d, a, c); 2651 tcg_gen_andi_i64(d, d, mask); 2652 } 2653 2654 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2655 { 2656 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2657 tcg_gen_shri_i64(d, a, c); 2658 tcg_gen_andi_i64(d, d, mask); 2659 } 2660 2661 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2662 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2663 { 2664 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2665 static const GVecGen2i g[4] = { 2666 { .fni8 = tcg_gen_vec_shr8i_i64, 2667 .fniv = tcg_gen_shri_vec, 2668 .fno = gen_helper_gvec_shr8i, 2669 .opt_opc = vecop_list, 2670 .vece = MO_8 }, 2671 { .fni8 = tcg_gen_vec_shr16i_i64, 2672 .fniv = tcg_gen_shri_vec, 2673 .fno = gen_helper_gvec_shr16i, 2674 .opt_opc = vecop_list, 2675 .vece = MO_16 }, 2676 { .fni4 = tcg_gen_shri_i32, 2677 .fniv = tcg_gen_shri_vec, 2678 .fno = gen_helper_gvec_shr32i, 2679 .opt_opc = vecop_list, 2680 .vece = MO_32 }, 2681 { .fni8 = tcg_gen_shri_i64, 2682 .fniv = tcg_gen_shri_vec, 2683 .fno = gen_helper_gvec_shr64i, 2684 .opt_opc = vecop_list, 2685 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2686 .vece = MO_64 }, 2687 }; 2688 2689 tcg_debug_assert(vece <= MO_64); 2690 tcg_debug_assert(shift >= 
0 && shift < (8 << vece)); 2691 if (shift == 0) { 2692 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2693 } else { 2694 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2695 } 2696 } 2697 2698 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2699 { 2700 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2701 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2702 TCGv_i64 s = tcg_temp_new_i64(); 2703 2704 tcg_gen_shri_i64(d, a, c); 2705 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2706 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2707 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2708 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2709 tcg_temp_free_i64(s); 2710 } 2711 2712 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2713 { 2714 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2715 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2716 TCGv_i64 s = tcg_temp_new_i64(); 2717 2718 tcg_gen_shri_i64(d, a, c); 2719 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2720 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2721 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2722 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2723 tcg_temp_free_i64(s); 2724 } 2725 2726 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2727 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2728 { 2729 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2730 static const GVecGen2i g[4] = { 2731 { .fni8 = tcg_gen_vec_sar8i_i64, 2732 .fniv = tcg_gen_sari_vec, 2733 .fno = gen_helper_gvec_sar8i, 2734 .opt_opc = vecop_list, 2735 .vece = MO_8 }, 2736 { .fni8 = tcg_gen_vec_sar16i_i64, 2737 .fniv = tcg_gen_sari_vec, 2738 .fno = gen_helper_gvec_sar16i, 2739 .opt_opc = vecop_list, 2740 .vece = MO_16 }, 2741 { .fni4 = tcg_gen_sari_i32, 2742 .fniv = tcg_gen_sari_vec, 2743 .fno = gen_helper_gvec_sar32i, 2744 .opt_opc = vecop_list, 2745 .vece = MO_32 }, 2746 { .fni8 = tcg_gen_sari_i64, 2747 .fniv = tcg_gen_sari_vec, 2748 .fno = gen_helper_gvec_sar64i, 2749 .opt_opc = vecop_list, 2750 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2751 .vece = MO_64 }, 2752 }; 2753 2754 tcg_debug_assert(vece <= MO_64); 2755 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2756 if (shift == 0) { 2757 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2758 } else { 2759 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2760 } 2761 } 2762 2763 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2764 { 2765 uint64_t mask = dup_const(MO_8, 0xff << c); 2766 2767 tcg_gen_shli_i64(d, a, c); 2768 tcg_gen_shri_i64(a, a, 8 - c); 2769 tcg_gen_andi_i64(d, d, mask); 2770 tcg_gen_andi_i64(a, a, ~mask); 2771 tcg_gen_or_i64(d, d, a); 2772 } 2773 2774 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2775 { 2776 uint64_t mask = dup_const(MO_16, 0xffff << c); 2777 2778 tcg_gen_shli_i64(d, a, c); 2779 tcg_gen_shri_i64(a, a, 16 - c); 2780 tcg_gen_andi_i64(d, d, mask); 2781 tcg_gen_andi_i64(a, a, ~mask); 2782 tcg_gen_or_i64(d, d, a); 2783 } 2784 2785 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 2786 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2787 { 2788 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 2789 static const GVecGen2i g[4] = { 2790 { .fni8 = tcg_gen_vec_rotl8i_i64, 2791 .fniv = tcg_gen_rotli_vec, 2792 .fno = gen_helper_gvec_rotl8i, 2793 .opt_opc = vecop_list, 2794 .vece = MO_8 }, 2795 { .fni8 = 
tcg_gen_vec_rotl16i_i64, 2796 .fniv = tcg_gen_rotli_vec, 2797 .fno = gen_helper_gvec_rotl16i, 2798 .opt_opc = vecop_list, 2799 .vece = MO_16 }, 2800 { .fni4 = tcg_gen_rotli_i32, 2801 .fniv = tcg_gen_rotli_vec, 2802 .fno = gen_helper_gvec_rotl32i, 2803 .opt_opc = vecop_list, 2804 .vece = MO_32 }, 2805 { .fni8 = tcg_gen_rotli_i64, 2806 .fniv = tcg_gen_rotli_vec, 2807 .fno = gen_helper_gvec_rotl64i, 2808 .opt_opc = vecop_list, 2809 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2810 .vece = MO_64 }, 2811 }; 2812 2813 tcg_debug_assert(vece <= MO_64); 2814 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2815 if (shift == 0) { 2816 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2817 } else { 2818 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2819 } 2820 } 2821 2822 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 2823 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2824 { 2825 tcg_debug_assert(vece <= MO_64); 2826 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2827 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 2828 oprsz, maxsz); 2829 } 2830 2831 /* 2832 * Specialized generation vector shifts by a non-constant scalar. 2833 */ 2834 2835 typedef struct { 2836 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2837 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2838 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2839 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2840 gen_helper_gvec_2 *fno[4]; 2841 TCGOpcode s_list[2]; 2842 TCGOpcode v_list[2]; 2843 } GVecGen2sh; 2844 2845 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2846 uint32_t oprsz, uint32_t tysz, TCGType type, 2847 TCGv_i32 shift, 2848 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2849 { 2850 TCGv_vec t0 = tcg_temp_new_vec(type); 2851 uint32_t i; 2852 2853 for (i = 0; i < oprsz; i += tysz) { 2854 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2855 fni(vece, t0, t0, shift); 2856 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2857 } 2858 tcg_temp_free_vec(t0); 2859 } 2860 2861 static void 2862 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2863 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2864 { 2865 TCGType type; 2866 uint32_t some; 2867 2868 check_size_align(oprsz, maxsz, dofs | aofs); 2869 check_overlap_2(dofs, aofs, maxsz); 2870 2871 /* If the backend has a scalar expansion, great. */ 2872 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2873 if (type) { 2874 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2875 switch (type) { 2876 case TCG_TYPE_V256: 2877 some = QEMU_ALIGN_DOWN(oprsz, 32); 2878 expand_2sh_vec(vece, dofs, aofs, some, 32, 2879 TCG_TYPE_V256, shift, g->fniv_s); 2880 if (some == oprsz) { 2881 break; 2882 } 2883 dofs += some; 2884 aofs += some; 2885 oprsz -= some; 2886 maxsz -= some; 2887 /* fallthru */ 2888 case TCG_TYPE_V128: 2889 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2890 TCG_TYPE_V128, shift, g->fniv_s); 2891 break; 2892 case TCG_TYPE_V64: 2893 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2894 TCG_TYPE_V64, shift, g->fniv_s); 2895 break; 2896 default: 2897 g_assert_not_reached(); 2898 } 2899 tcg_swap_vecop_list(hold_list); 2900 goto clear_tail; 2901 } 2902 2903 /* If the backend supports variable vector shifts, also cool. 
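       The scalar shift count is broadcast into a vector temporary and the
       element-wise shift opcode (fniv_v) is applied to each host vector
       chunk.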
*/ 2904 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2905 if (type) { 2906 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2907 TCGv_vec v_shift = tcg_temp_new_vec(type); 2908 2909 if (vece == MO_64) { 2910 TCGv_i64 sh64 = tcg_temp_new_i64(); 2911 tcg_gen_extu_i32_i64(sh64, shift); 2912 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2913 tcg_temp_free_i64(sh64); 2914 } else { 2915 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2916 } 2917 2918 switch (type) { 2919 case TCG_TYPE_V256: 2920 some = QEMU_ALIGN_DOWN(oprsz, 32); 2921 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2922 v_shift, false, g->fniv_v); 2923 if (some == oprsz) { 2924 break; 2925 } 2926 dofs += some; 2927 aofs += some; 2928 oprsz -= some; 2929 maxsz -= some; 2930 /* fallthru */ 2931 case TCG_TYPE_V128: 2932 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2933 v_shift, false, g->fniv_v); 2934 break; 2935 case TCG_TYPE_V64: 2936 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2937 v_shift, false, g->fniv_v); 2938 break; 2939 default: 2940 g_assert_not_reached(); 2941 } 2942 tcg_temp_free_vec(v_shift); 2943 tcg_swap_vecop_list(hold_list); 2944 goto clear_tail; 2945 } 2946 2947 /* Otherwise fall back to integral... */ 2948 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2949 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2950 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2951 TCGv_i64 sh64 = tcg_temp_new_i64(); 2952 tcg_gen_extu_i32_i64(sh64, shift); 2953 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2954 tcg_temp_free_i64(sh64); 2955 } else { 2956 TCGv_ptr a0 = tcg_temp_new_ptr(); 2957 TCGv_ptr a1 = tcg_temp_new_ptr(); 2958 TCGv_i32 desc = tcg_temp_new_i32(); 2959 2960 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2961 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2962 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2963 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2964 2965 g->fno[vece](a0, a1, desc); 2966 2967 tcg_temp_free_ptr(a0); 2968 tcg_temp_free_ptr(a1); 2969 tcg_temp_free_i32(desc); 2970 return; 2971 } 2972 2973 clear_tail: 2974 if (oprsz < maxsz) { 2975 expand_clr(dofs + oprsz, maxsz - oprsz); 2976 } 2977 } 2978 2979 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 2980 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2981 { 2982 static const GVecGen2sh g = { 2983 .fni4 = tcg_gen_shl_i32, 2984 .fni8 = tcg_gen_shl_i64, 2985 .fniv_s = tcg_gen_shls_vec, 2986 .fniv_v = tcg_gen_shlv_vec, 2987 .fno = { 2988 gen_helper_gvec_shl8i, 2989 gen_helper_gvec_shl16i, 2990 gen_helper_gvec_shl32i, 2991 gen_helper_gvec_shl64i, 2992 }, 2993 .s_list = { INDEX_op_shls_vec, 0 }, 2994 .v_list = { INDEX_op_shlv_vec, 0 }, 2995 }; 2996 2997 tcg_debug_assert(vece <= MO_64); 2998 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2999 } 3000 3001 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3002 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3003 { 3004 static const GVecGen2sh g = { 3005 .fni4 = tcg_gen_shr_i32, 3006 .fni8 = tcg_gen_shr_i64, 3007 .fniv_s = tcg_gen_shrs_vec, 3008 .fniv_v = tcg_gen_shrv_vec, 3009 .fno = { 3010 gen_helper_gvec_shr8i, 3011 gen_helper_gvec_shr16i, 3012 gen_helper_gvec_shr32i, 3013 gen_helper_gvec_shr64i, 3014 }, 3015 .s_list = { INDEX_op_shrs_vec, 0 }, 3016 .v_list = { INDEX_op_shrv_vec, 0 }, 3017 }; 3018 3019 tcg_debug_assert(vece <= MO_64); 3020 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3021 } 3022 3023 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_rotl_i32,
        .fni8 = tcg_gen_rotl_i64,
        .fniv_s = tcg_gen_rotls_vec,
        .fniv_v = tcg_gen_rotlv_vec,
        .fno = {
            gen_helper_gvec_rotl8i,
            gen_helper_gvec_rotl16i,
            gen_helper_gvec_rotl32i,
            gen_helper_gvec_rotl64i,
        },
        .s_list = { INDEX_op_rotls_vec, 0 },
        .v_list = { INDEX_op_rotlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, vector shifts
 * must apply the modulo here.  If the target naturally includes
 * the modulo as part of the operation, great!  If the target has
 * some other behaviour for out-of-range shifts, then it could not
 * use this function anyway, and would need to do its own expansion
 * with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3138 */ 3139 3140 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3141 TCGv_vec a, TCGv_vec b) 3142 { 3143 TCGv_vec t = tcg_temp_new_vec_matching(d); 3144 3145 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3146 tcg_gen_and_vec(vece, t, t, b); 3147 tcg_gen_shrv_vec(vece, d, a, t); 3148 tcg_temp_free_vec(t); 3149 } 3150 3151 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3152 { 3153 TCGv_i32 t = tcg_temp_new_i32(); 3154 3155 tcg_gen_andi_i32(t, b, 31); 3156 tcg_gen_shr_i32(d, a, t); 3157 tcg_temp_free_i32(t); 3158 } 3159 3160 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3161 { 3162 TCGv_i64 t = tcg_temp_new_i64(); 3163 3164 tcg_gen_andi_i64(t, b, 63); 3165 tcg_gen_shr_i64(d, a, t); 3166 tcg_temp_free_i64(t); 3167 } 3168 3169 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3170 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3171 { 3172 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3173 static const GVecGen3 g[4] = { 3174 { .fniv = tcg_gen_shrv_mod_vec, 3175 .fno = gen_helper_gvec_shr8v, 3176 .opt_opc = vecop_list, 3177 .vece = MO_8 }, 3178 { .fniv = tcg_gen_shrv_mod_vec, 3179 .fno = gen_helper_gvec_shr16v, 3180 .opt_opc = vecop_list, 3181 .vece = MO_16 }, 3182 { .fni4 = tcg_gen_shr_mod_i32, 3183 .fniv = tcg_gen_shrv_mod_vec, 3184 .fno = gen_helper_gvec_shr32v, 3185 .opt_opc = vecop_list, 3186 .vece = MO_32 }, 3187 { .fni8 = tcg_gen_shr_mod_i64, 3188 .fniv = tcg_gen_shrv_mod_vec, 3189 .fno = gen_helper_gvec_shr64v, 3190 .opt_opc = vecop_list, 3191 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3192 .vece = MO_64 }, 3193 }; 3194 3195 tcg_debug_assert(vece <= MO_64); 3196 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3197 } 3198 3199 /* 3200 * Similarly for arithmetic right shifts. 
3201 */ 3202 3203 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3204 TCGv_vec a, TCGv_vec b) 3205 { 3206 TCGv_vec t = tcg_temp_new_vec_matching(d); 3207 3208 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3209 tcg_gen_and_vec(vece, t, t, b); 3210 tcg_gen_sarv_vec(vece, d, a, t); 3211 tcg_temp_free_vec(t); 3212 } 3213 3214 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3215 { 3216 TCGv_i32 t = tcg_temp_new_i32(); 3217 3218 tcg_gen_andi_i32(t, b, 31); 3219 tcg_gen_sar_i32(d, a, t); 3220 tcg_temp_free_i32(t); 3221 } 3222 3223 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3224 { 3225 TCGv_i64 t = tcg_temp_new_i64(); 3226 3227 tcg_gen_andi_i64(t, b, 63); 3228 tcg_gen_sar_i64(d, a, t); 3229 tcg_temp_free_i64(t); 3230 } 3231 3232 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3233 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3234 { 3235 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3236 static const GVecGen3 g[4] = { 3237 { .fniv = tcg_gen_sarv_mod_vec, 3238 .fno = gen_helper_gvec_sar8v, 3239 .opt_opc = vecop_list, 3240 .vece = MO_8 }, 3241 { .fniv = tcg_gen_sarv_mod_vec, 3242 .fno = gen_helper_gvec_sar16v, 3243 .opt_opc = vecop_list, 3244 .vece = MO_16 }, 3245 { .fni4 = tcg_gen_sar_mod_i32, 3246 .fniv = tcg_gen_sarv_mod_vec, 3247 .fno = gen_helper_gvec_sar32v, 3248 .opt_opc = vecop_list, 3249 .vece = MO_32 }, 3250 { .fni8 = tcg_gen_sar_mod_i64, 3251 .fniv = tcg_gen_sarv_mod_vec, 3252 .fno = gen_helper_gvec_sar64v, 3253 .opt_opc = vecop_list, 3254 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3255 .vece = MO_64 }, 3256 }; 3257 3258 tcg_debug_assert(vece <= MO_64); 3259 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3260 } 3261 3262 /* 3263 * Similarly for rotates. 
3264 */ 3265 3266 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3267 TCGv_vec a, TCGv_vec b) 3268 { 3269 TCGv_vec t = tcg_temp_new_vec_matching(d); 3270 3271 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3272 tcg_gen_and_vec(vece, t, t, b); 3273 tcg_gen_rotlv_vec(vece, d, a, t); 3274 tcg_temp_free_vec(t); 3275 } 3276 3277 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3278 { 3279 TCGv_i32 t = tcg_temp_new_i32(); 3280 3281 tcg_gen_andi_i32(t, b, 31); 3282 tcg_gen_rotl_i32(d, a, t); 3283 tcg_temp_free_i32(t); 3284 } 3285 3286 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3287 { 3288 TCGv_i64 t = tcg_temp_new_i64(); 3289 3290 tcg_gen_andi_i64(t, b, 63); 3291 tcg_gen_rotl_i64(d, a, t); 3292 tcg_temp_free_i64(t); 3293 } 3294 3295 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3296 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3297 { 3298 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3299 static const GVecGen3 g[4] = { 3300 { .fniv = tcg_gen_rotlv_mod_vec, 3301 .fno = gen_helper_gvec_rotl8v, 3302 .opt_opc = vecop_list, 3303 .vece = MO_8 }, 3304 { .fniv = tcg_gen_rotlv_mod_vec, 3305 .fno = gen_helper_gvec_rotl16v, 3306 .opt_opc = vecop_list, 3307 .vece = MO_16 }, 3308 { .fni4 = tcg_gen_rotl_mod_i32, 3309 .fniv = tcg_gen_rotlv_mod_vec, 3310 .fno = gen_helper_gvec_rotl32v, 3311 .opt_opc = vecop_list, 3312 .vece = MO_32 }, 3313 { .fni8 = tcg_gen_rotl_mod_i64, 3314 .fniv = tcg_gen_rotlv_mod_vec, 3315 .fno = gen_helper_gvec_rotl64v, 3316 .opt_opc = vecop_list, 3317 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3318 .vece = MO_64 }, 3319 }; 3320 3321 tcg_debug_assert(vece <= MO_64); 3322 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3323 } 3324 3325 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3326 TCGv_vec a, TCGv_vec b) 3327 { 3328 TCGv_vec t = tcg_temp_new_vec_matching(d); 3329 3330 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3331 tcg_gen_and_vec(vece, t, t, b); 3332 tcg_gen_rotrv_vec(vece, d, a, t); 3333 tcg_temp_free_vec(t); 3334 } 3335 3336 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3337 { 3338 TCGv_i32 t = tcg_temp_new_i32(); 3339 3340 tcg_gen_andi_i32(t, b, 31); 3341 tcg_gen_rotr_i32(d, a, t); 3342 tcg_temp_free_i32(t); 3343 } 3344 3345 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3346 { 3347 TCGv_i64 t = tcg_temp_new_i64(); 3348 3349 tcg_gen_andi_i64(t, b, 63); 3350 tcg_gen_rotr_i64(d, a, t); 3351 tcg_temp_free_i64(t); 3352 } 3353 3354 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3355 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3356 { 3357 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3358 static const GVecGen3 g[4] = { 3359 { .fniv = tcg_gen_rotrv_mod_vec, 3360 .fno = gen_helper_gvec_rotr8v, 3361 .opt_opc = vecop_list, 3362 .vece = MO_8 }, 3363 { .fniv = tcg_gen_rotrv_mod_vec, 3364 .fno = gen_helper_gvec_rotr16v, 3365 .opt_opc = vecop_list, 3366 .vece = MO_16 }, 3367 { .fni4 = tcg_gen_rotr_mod_i32, 3368 .fniv = tcg_gen_rotrv_mod_vec, 3369 .fno = gen_helper_gvec_rotr32v, 3370 .opt_opc = vecop_list, 3371 .vece = MO_32 }, 3372 { .fni8 = tcg_gen_rotr_mod_i64, 3373 .fniv = tcg_gen_rotrv_mod_vec, 3374 .fno = gen_helper_gvec_rotr64v, 3375 .opt_opc = vecop_list, 3376 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3377 .vece = MO_64 }, 3378 }; 3379 3380 tcg_debug_assert(vece <= MO_64); 3381 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3382 } 3383 3384 /* Expand OPSZ bytes worth 
of three-operand operations using i32 elements. */ 3385 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3386 uint32_t oprsz, TCGCond cond) 3387 { 3388 TCGv_i32 t0 = tcg_temp_new_i32(); 3389 TCGv_i32 t1 = tcg_temp_new_i32(); 3390 uint32_t i; 3391 3392 for (i = 0; i < oprsz; i += 4) { 3393 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3394 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3395 tcg_gen_setcond_i32(cond, t0, t0, t1); 3396 tcg_gen_neg_i32(t0, t0); 3397 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3398 } 3399 tcg_temp_free_i32(t1); 3400 tcg_temp_free_i32(t0); 3401 } 3402 3403 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3404 uint32_t oprsz, TCGCond cond) 3405 { 3406 TCGv_i64 t0 = tcg_temp_new_i64(); 3407 TCGv_i64 t1 = tcg_temp_new_i64(); 3408 uint32_t i; 3409 3410 for (i = 0; i < oprsz; i += 8) { 3411 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3412 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3413 tcg_gen_setcond_i64(cond, t0, t0, t1); 3414 tcg_gen_neg_i64(t0, t0); 3415 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3416 } 3417 tcg_temp_free_i64(t1); 3418 tcg_temp_free_i64(t0); 3419 } 3420 3421 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3422 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3423 TCGType type, TCGCond cond) 3424 { 3425 TCGv_vec t0 = tcg_temp_new_vec(type); 3426 TCGv_vec t1 = tcg_temp_new_vec(type); 3427 uint32_t i; 3428 3429 for (i = 0; i < oprsz; i += tysz) { 3430 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3431 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3432 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3433 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3434 } 3435 tcg_temp_free_vec(t1); 3436 tcg_temp_free_vec(t0); 3437 } 3438 3439 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3440 uint32_t aofs, uint32_t bofs, 3441 uint32_t oprsz, uint32_t maxsz) 3442 { 3443 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3444 static gen_helper_gvec_3 * const eq_fn[4] = { 3445 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3446 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3447 }; 3448 static gen_helper_gvec_3 * const ne_fn[4] = { 3449 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3450 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3451 }; 3452 static gen_helper_gvec_3 * const lt_fn[4] = { 3453 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3454 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3455 }; 3456 static gen_helper_gvec_3 * const le_fn[4] = { 3457 gen_helper_gvec_le8, gen_helper_gvec_le16, 3458 gen_helper_gvec_le32, gen_helper_gvec_le64 3459 }; 3460 static gen_helper_gvec_3 * const ltu_fn[4] = { 3461 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3462 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3463 }; 3464 static gen_helper_gvec_3 * const leu_fn[4] = { 3465 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3466 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3467 }; 3468 static gen_helper_gvec_3 * const * const fns[16] = { 3469 [TCG_COND_EQ] = eq_fn, 3470 [TCG_COND_NE] = ne_fn, 3471 [TCG_COND_LT] = lt_fn, 3472 [TCG_COND_LE] = le_fn, 3473 [TCG_COND_LTU] = ltu_fn, 3474 [TCG_COND_LEU] = leu_fn, 3475 }; 3476 3477 const TCGOpcode *hold_list; 3478 TCGType type; 3479 uint32_t some; 3480 3481 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3482 check_overlap_3(dofs, aofs, bofs, maxsz); 3483 3484 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3485 do_dup(MO_8, dofs, oprsz, maxsz, 3486 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3487 return; 3488 } 3489 3490 /* 3491 * Implement inline with a vector type, if possible. 
3492 * Prefer integer when 64-bit host and 64-bit comparison. 3493 */ 3494 hold_list = tcg_swap_vecop_list(cmp_list); 3495 type = choose_vector_type(cmp_list, vece, oprsz, 3496 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3497 switch (type) { 3498 case TCG_TYPE_V256: 3499 /* Recall that ARM SVE allows vector sizes that are not a 3500 * power of 2, but always a multiple of 16. The intent is 3501 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3502 */ 3503 some = QEMU_ALIGN_DOWN(oprsz, 32); 3504 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3505 if (some == oprsz) { 3506 break; 3507 } 3508 dofs += some; 3509 aofs += some; 3510 bofs += some; 3511 oprsz -= some; 3512 maxsz -= some; 3513 /* fallthru */ 3514 case TCG_TYPE_V128: 3515 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3516 break; 3517 case TCG_TYPE_V64: 3518 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3519 break; 3520 3521 case 0: 3522 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3523 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3524 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3525 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3526 } else { 3527 gen_helper_gvec_3 * const *fn = fns[cond]; 3528 3529 if (fn == NULL) { 3530 uint32_t tmp; 3531 tmp = aofs, aofs = bofs, bofs = tmp; 3532 cond = tcg_swap_cond(cond); 3533 fn = fns[cond]; 3534 assert(fn != NULL); 3535 } 3536 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3537 oprsz = maxsz; 3538 } 3539 break; 3540 3541 default: 3542 g_assert_not_reached(); 3543 } 3544 tcg_swap_vecop_list(hold_list); 3545 3546 if (oprsz < maxsz) { 3547 expand_clr(dofs + oprsz, maxsz - oprsz); 3548 } 3549 } 3550 3551 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3552 { 3553 TCGv_i64 t = tcg_temp_new_i64(); 3554 3555 tcg_gen_and_i64(t, b, a); 3556 tcg_gen_andc_i64(d, c, a); 3557 tcg_gen_or_i64(d, d, t); 3558 tcg_temp_free_i64(t); 3559 } 3560 3561 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3562 uint32_t bofs, uint32_t cofs, 3563 uint32_t oprsz, uint32_t maxsz) 3564 { 3565 static const GVecGen4 g = { 3566 .fni8 = tcg_gen_bitsel_i64, 3567 .fniv = tcg_gen_bitsel_vec, 3568 .fno = gen_helper_gvec_bitsel, 3569 }; 3570 3571 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3572 } 3573
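/*
 * Usage sketch (added for clarity; the env offsets below are hypothetical):
 * a front end implementing a SIMD compare-equal that sets matching lanes to
 * all-ones and the rest to zero can call tcg_gen_gvec_cmp directly.  For
 * 16-byte operands at env offsets dofs_n, aofs_n and bofs_n:
 *
 *   tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32, dofs_n, aofs_n, bofs_n, 16, 16);
 *
 * Each 32-bit destination lane becomes -1 where the source lanes are equal
 * and 0 otherwise, matching the setcond+neg and cmp_vec expansions above.
 */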