1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg/tcg.h" 22 #include "tcg/tcg-temp-internal.h" 23 #include "tcg/tcg-op-common.h" 24 #include "tcg/tcg-op-gvec-common.h" 25 #include "tcg/tcg-gvec-desc.h" 26 27 #define MAX_UNROLL 4 28 29 #ifdef CONFIG_DEBUG_TCG 30 static const TCGOpcode vecop_list_empty[1] = { 0 }; 31 #else 32 #define vecop_list_empty NULL 33 #endif 34 35 36 /* Verify vector size and alignment rules. OFS should be the OR of all 37 of the operand offsets so that we can check them all at once. */ 38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 39 { 40 uint32_t max_align; 41 42 switch (oprsz) { 43 case 8: 44 case 16: 45 case 32: 46 tcg_debug_assert(oprsz <= maxsz); 47 break; 48 default: 49 tcg_debug_assert(oprsz == maxsz); 50 break; 51 } 52 tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS)); 53 54 max_align = maxsz >= 16 ? 15 : 7; 55 tcg_debug_assert((maxsz & max_align) == 0); 56 tcg_debug_assert((ofs & max_align) == 0); 57 } 58 59 /* Verify vector overlap rules for two operands. */ 60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 61 { 62 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 63 } 64 65 /* Verify vector overlap rules for three operands. */ 66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 67 { 68 check_overlap_2(d, a, s); 69 check_overlap_2(d, b, s); 70 check_overlap_2(a, b, s); 71 } 72 73 /* Verify vector overlap rules for four operands. */ 74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 75 uint32_t c, uint32_t s) 76 { 77 check_overlap_2(d, a, s); 78 check_overlap_2(d, b, s); 79 check_overlap_2(d, c, s); 80 check_overlap_2(a, b, s); 81 check_overlap_2(a, c, s); 82 check_overlap_2(b, c, s); 83 } 84 85 /* Create a descriptor from components. */ 86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 87 { 88 uint32_t desc = 0; 89 90 check_size_align(oprsz, maxsz, 0); 91 92 /* 93 * We want to check that 'data' will fit into SIMD_DATA_BITS. 94 * However, some callers want to treat the data as a signed 95 * value (which they can later get back with simd_data()) 96 * and some want to treat it as an unsigned value. 97 * So here we assert only that the data will fit into the 98 * field in at least one way. This means that some invalid 99 * values from the caller will not be detected, e.g. if the 100 * caller wants to handle the value as a signed integer but 101 * incorrectly passes us 1 << (SIMD_DATA_BITS - 1). 102 */ 103 tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) || 104 data == extract32(data, 0, SIMD_DATA_BITS)); 105 106 oprsz = (oprsz / 8) - 1; 107 maxsz = (maxsz / 8) - 1; 108 109 /* 110 * We have just asserted in check_size_align that either 111 * oprsz is {8,16,32} or matches maxsz. 
     * Encode the final case with '2', as that would otherwise map to 24.
     */
    if (oprsz == maxsz) {
        oprsz = 2;
    }

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

/* Generate a call to a gvec-style helper with two vector operands. */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand. */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands. */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands. */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}
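/*
 * Illustrative sketch: each wrapper above packs (oprsz, maxsz, data)
 * into one 32-bit descriptor via simd_desc(), and the called helpers
 * recover the pieces with the accessors from "tcg/tcg-gvec-desc.h".
 * The function below is hypothetical and exists only to show the
 * round-trip, including the signed data field.
 */
static inline void simd_desc_example(void)
{
    uint32_t desc = simd_desc(16, 16, -3);

    /* The byte sizes and the signed data value are recovered intact. */
    tcg_debug_assert(simd_oprsz(desc) == 16);
    tcg_debug_assert(simd_maxsz(desc) == 16);
    tcg_debug_assert(simd_data(desc) == -3);
}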
/* Generate a call to a gvec-style helper with five vector operands. */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
}
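/*
 * Illustrative sketch: the fn(a0, a1, ..., desc) calls above emit a TCG
 * call to an out-of-line function whose C signature takes raw pointers
 * into the CPU env plus the descriptor.  The real helpers are defined
 * elsewhere (e.g. accel/tcg/tcg-runtime-gvec.c); the one below is a
 * hypothetical example showing the usual shape: operate on
 * simd_oprsz() bytes, then clear the tail up to simd_maxsz().
 */
static inline void example_gvec_helper_add8(void *d, void *a, void *b,
                                            uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i++) {
        ((uint8_t *)d)[i] = ((uint8_t *)a)[i] + ((uint8_t *)b)[i];
    }
    /* Out-of-line helpers are responsible for zeroing the tail bytes. */
    if (maxsz > oprsz) {
        memset((uint8_t *)d + oprsz, 0, maxsz - oprsz);
    }
}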
/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_ebb_new_ptr();
    a1 = tcg_temp_ebb_new_ptr();
    a2 = tcg_temp_ebb_new_ptr();
    a3 = tcg_temp_ebb_new_ptr();
    a4 = tcg_temp_ebb_new_ptr();

    tcg_gen_addi_ptr(a0, tcg_env, dofs);
    tcg_gen_addi_ptr(a1, tcg_env, aofs);
    tcg_gen_addi_ptr(a2, tcg_env, bofs);
    tcg_gen_addi_ptr(a3, tcg_env, cofs);
    tcg_gen_addi_ptr(a4, tcg_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.
Return 0 if no vector type is selected. 447 */ 448 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 449 uint32_t size, bool prefer_i64) 450 { 451 /* 452 * Recall that ARM SVE allows vector sizes that are not a 453 * power of 2, but always a multiple of 16. The intent is 454 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 455 * It is hard to imagine a case in which v256 is supported 456 * but v128 is not, but check anyway. 457 * In addition, expand_clr needs to handle a multiple of 8. 458 */ 459 if (TCG_TARGET_HAS_v256 && 460 check_size_impl(size, 32) && 461 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 462 (!(size & 16) || 463 (TCG_TARGET_HAS_v128 && 464 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 465 (!(size & 8) || 466 (TCG_TARGET_HAS_v64 && 467 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 468 return TCG_TYPE_V256; 469 } 470 if (TCG_TARGET_HAS_v128 && 471 check_size_impl(size, 16) && 472 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 473 (!(size & 8) || 474 (TCG_TARGET_HAS_v64 && 475 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 476 return TCG_TYPE_V128; 477 } 478 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 479 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 480 return TCG_TYPE_V64; 481 } 482 return 0; 483 } 484 485 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 486 uint32_t maxsz, TCGv_vec t_vec) 487 { 488 uint32_t i = 0; 489 490 tcg_debug_assert(oprsz >= 8); 491 492 /* 493 * This may be expand_clr for the tail of an operation, e.g. 494 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 495 * are misaligned wrt the maximum vector size, so do that first. 496 */ 497 if (dofs & 8) { 498 tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64); 499 i += 8; 500 } 501 502 switch (type) { 503 case TCG_TYPE_V256: 504 /* 505 * Recall that ARM SVE allows vector sizes that are not a 506 * power of 2, but always a multiple of 16. The intent is 507 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 508 */ 509 for (; i + 32 <= oprsz; i += 32) { 510 tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V256); 511 } 512 /* fallthru */ 513 case TCG_TYPE_V128: 514 for (; i + 16 <= oprsz; i += 16) { 515 tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V128); 516 } 517 break; 518 case TCG_TYPE_V64: 519 for (; i < oprsz; i += 8) { 520 tcg_gen_stl_vec(t_vec, tcg_env, dofs + i, TCG_TYPE_V64); 521 } 522 break; 523 default: 524 g_assert_not_reached(); 525 } 526 527 if (oprsz < maxsz) { 528 expand_clr(dofs + oprsz, maxsz - oprsz); 529 } 530 } 531 532 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 533 * Only one of IN_32 or IN_64 may be set; 534 * IN_C is used if IN_32 and IN_64 are unset. 535 */ 536 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 537 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 538 uint64_t in_c) 539 { 540 TCGType type; 541 TCGv_i64 t_64; 542 TCGv_i32 t_32, t_desc; 543 TCGv_ptr t_ptr; 544 uint32_t i; 545 546 assert(vece <= (in_32 ? MO_32 : MO_64)); 547 assert(in_32 == NULL || in_64 == NULL); 548 549 /* If we're storing 0, expand oprsz to maxsz. */ 550 if (in_32 == NULL && in_64 == NULL) { 551 in_c = dup_const(vece, in_c); 552 if (in_c == 0) { 553 oprsz = maxsz; 554 vece = MO_8; 555 } else if (in_c == dup_const(MO_8, in_c)) { 556 vece = MO_8; 557 } 558 } 559 560 /* Implement inline with a vector type, if possible. 561 * Prefer integer when 64-bit host and no variable dup. 
562 */ 563 type = choose_vector_type(NULL, vece, oprsz, 564 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 565 && (in_64 == NULL || vece == MO_64))); 566 if (type != 0) { 567 TCGv_vec t_vec = tcg_temp_new_vec(type); 568 569 if (in_32) { 570 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 571 } else if (in_64) { 572 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 573 } else { 574 tcg_gen_dupi_vec(vece, t_vec, in_c); 575 } 576 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 577 return; 578 } 579 580 /* Otherwise, inline with an integer type, unless "large". */ 581 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 582 t_64 = NULL; 583 t_32 = NULL; 584 585 if (in_32) { 586 /* We are given a 32-bit variable input. For a 64-bit host, 587 use a 64-bit operation unless the 32-bit operation would 588 be simple enough. */ 589 if (TCG_TARGET_REG_BITS == 64 590 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 591 t_64 = tcg_temp_ebb_new_i64(); 592 tcg_gen_extu_i32_i64(t_64, in_32); 593 tcg_gen_dup_i64(vece, t_64, t_64); 594 } else { 595 t_32 = tcg_temp_ebb_new_i32(); 596 tcg_gen_dup_i32(vece, t_32, in_32); 597 } 598 } else if (in_64) { 599 /* We are given a 64-bit variable input. */ 600 t_64 = tcg_temp_ebb_new_i64(); 601 tcg_gen_dup_i64(vece, t_64, in_64); 602 } else { 603 /* We are given a constant input. */ 604 /* For 64-bit hosts, use 64-bit constants for "simple" constants 605 or when we'd need too many 32-bit stores, or when a 64-bit 606 constant is really required. */ 607 if (vece == MO_64 608 || (TCG_TARGET_REG_BITS == 64 609 && (in_c == 0 || in_c == -1 610 || !check_size_impl(oprsz, 4)))) { 611 t_64 = tcg_constant_i64(in_c); 612 } else { 613 t_32 = tcg_constant_i32(in_c); 614 } 615 } 616 617 /* Implement inline if we picked an implementation size above. */ 618 if (t_32) { 619 for (i = 0; i < oprsz; i += 4) { 620 tcg_gen_st_i32(t_32, tcg_env, dofs + i); 621 } 622 tcg_temp_free_i32(t_32); 623 goto done; 624 } 625 if (t_64) { 626 for (i = 0; i < oprsz; i += 8) { 627 tcg_gen_st_i64(t_64, tcg_env, dofs + i); 628 } 629 tcg_temp_free_i64(t_64); 630 goto done; 631 } 632 } 633 634 /* Otherwise implement out of line. */ 635 t_ptr = tcg_temp_ebb_new_ptr(); 636 tcg_gen_addi_ptr(t_ptr, tcg_env, dofs); 637 638 /* 639 * This may be expand_clr for the tail of an operation, e.g. 640 * oprsz == 8 && maxsz == 64. The size of the clear is misaligned 641 * wrt simd_desc and will assert. Simply pass all replicated byte 642 * stores through to memset. 
643 */ 644 if (oprsz == maxsz && vece == MO_8) { 645 TCGv_ptr t_size = tcg_constant_ptr(oprsz); 646 TCGv_i32 t_val; 647 648 if (in_32) { 649 t_val = in_32; 650 } else if (in_64) { 651 t_val = tcg_temp_ebb_new_i32(); 652 tcg_gen_extrl_i64_i32(t_val, in_64); 653 } else { 654 t_val = tcg_constant_i32(in_c); 655 } 656 gen_helper_memset(t_ptr, t_ptr, t_val, t_size); 657 658 if (in_64) { 659 tcg_temp_free_i32(t_val); 660 } 661 tcg_temp_free_ptr(t_ptr); 662 return; 663 } 664 665 t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0)); 666 667 if (vece == MO_64) { 668 if (in_64) { 669 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 670 } else { 671 t_64 = tcg_constant_i64(in_c); 672 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 673 } 674 } else { 675 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 676 static dup_fn * const fns[3] = { 677 gen_helper_gvec_dup8, 678 gen_helper_gvec_dup16, 679 gen_helper_gvec_dup32 680 }; 681 682 if (in_32) { 683 fns[vece](t_ptr, t_desc, in_32); 684 } else if (in_64) { 685 t_32 = tcg_temp_ebb_new_i32(); 686 tcg_gen_extrl_i64_i32(t_32, in_64); 687 fns[vece](t_ptr, t_desc, t_32); 688 tcg_temp_free_i32(t_32); 689 } else { 690 if (vece == MO_8) { 691 in_c &= 0xff; 692 } else if (vece == MO_16) { 693 in_c &= 0xffff; 694 } 695 t_32 = tcg_constant_i32(in_c); 696 fns[vece](t_ptr, t_desc, t_32); 697 } 698 } 699 700 tcg_temp_free_ptr(t_ptr); 701 return; 702 703 done: 704 if (oprsz < maxsz) { 705 expand_clr(dofs + oprsz, maxsz - oprsz); 706 } 707 } 708 709 /* Likewise, but with zero. */ 710 static void expand_clr(uint32_t dofs, uint32_t maxsz) 711 { 712 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 713 } 714 715 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ 716 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 717 bool load_dest, void (*fni)(TCGv_i32, TCGv_i32)) 718 { 719 TCGv_i32 t0 = tcg_temp_new_i32(); 720 TCGv_i32 t1 = tcg_temp_new_i32(); 721 uint32_t i; 722 723 for (i = 0; i < oprsz; i += 4) { 724 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 725 if (load_dest) { 726 tcg_gen_ld_i32(t1, tcg_env, dofs + i); 727 } 728 fni(t1, t0); 729 tcg_gen_st_i32(t1, tcg_env, dofs + i); 730 } 731 tcg_temp_free_i32(t0); 732 tcg_temp_free_i32(t1); 733 } 734 735 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 736 int32_t c, bool load_dest, 737 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 738 { 739 TCGv_i32 t0 = tcg_temp_new_i32(); 740 TCGv_i32 t1 = tcg_temp_new_i32(); 741 uint32_t i; 742 743 for (i = 0; i < oprsz; i += 4) { 744 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 745 if (load_dest) { 746 tcg_gen_ld_i32(t1, tcg_env, dofs + i); 747 } 748 fni(t1, t0, c); 749 tcg_gen_st_i32(t1, tcg_env, dofs + i); 750 } 751 tcg_temp_free_i32(t0); 752 tcg_temp_free_i32(t1); 753 } 754 755 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 756 TCGv_i32 c, bool scalar_first, 757 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 758 { 759 TCGv_i32 t0 = tcg_temp_new_i32(); 760 TCGv_i32 t1 = tcg_temp_new_i32(); 761 uint32_t i; 762 763 for (i = 0; i < oprsz; i += 4) { 764 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 765 if (scalar_first) { 766 fni(t1, c, t0); 767 } else { 768 fni(t1, t0, c); 769 } 770 tcg_gen_st_i32(t1, tcg_env, dofs + i); 771 } 772 tcg_temp_free_i32(t0); 773 tcg_temp_free_i32(t1); 774 } 775 776 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
 */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, tcg_env, aofs + i);
        tcg_gen_ld_i32(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t cofs, uint32_t oprsz, int32_t c,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32,
                                      int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, tcg_env, aofs + i);
        tcg_gen_ld_i32(t2, tcg_env, bofs + i);
        tcg_gen_ld_i32(t3, tcg_env, cofs + i);
        fni(t0, t1, t2, t3, c);
        tcg_gen_st_i32(t0, tcg_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}
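/*
 * Illustrative sketch: the expand_* routines above and below consume
 * per-element callbacks.  A hypothetical fni4 callback and a direct use
 * of expand_2_i32() (defined earlier) are shown here; in practice such
 * a callback is wired into a GVecGen2/GVecGen3 table together with an
 * fniv/fno fallback and handed to tcg_gen_gvec_2() or tcg_gen_gvec_3()
 * below, which pick the widest workable element type.
 */
static void gen_example_incr_i32(TCGv_i32 d, TCGv_i32 a)
{
    tcg_gen_addi_i32(d, a, 1);
}

static inline void example_expand_incr(uint32_t dofs, uint32_t aofs)
{
    /* Emits four ld/addi/st groups covering 16 bytes at aofs/dofs. */
    expand_2_i32(dofs, aofs, 16, false, gen_example_incr_i32);
}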
/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, tcg_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, tcg_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, tcg_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c,
                          bool load_dest, bool write_aofs,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, tcg_env, aofs + i);
        tcg_gen_ld_i64(t1, tcg_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, tcg_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, tcg_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t0, tcg_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.
*/ 991 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 992 uint32_t cofs, uint32_t oprsz, bool write_aofs, 993 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 994 { 995 TCGv_i64 t0 = tcg_temp_new_i64(); 996 TCGv_i64 t1 = tcg_temp_new_i64(); 997 TCGv_i64 t2 = tcg_temp_new_i64(); 998 TCGv_i64 t3 = tcg_temp_new_i64(); 999 uint32_t i; 1000 1001 for (i = 0; i < oprsz; i += 8) { 1002 tcg_gen_ld_i64(t1, tcg_env, aofs + i); 1003 tcg_gen_ld_i64(t2, tcg_env, bofs + i); 1004 tcg_gen_ld_i64(t3, tcg_env, cofs + i); 1005 fni(t0, t1, t2, t3); 1006 tcg_gen_st_i64(t0, tcg_env, dofs + i); 1007 if (write_aofs) { 1008 tcg_gen_st_i64(t1, tcg_env, aofs + i); 1009 } 1010 } 1011 tcg_temp_free_i64(t3); 1012 tcg_temp_free_i64(t2); 1013 tcg_temp_free_i64(t1); 1014 tcg_temp_free_i64(t0); 1015 } 1016 1017 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1018 uint32_t cofs, uint32_t oprsz, int64_t c, 1019 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, 1020 int64_t)) 1021 { 1022 TCGv_i64 t0 = tcg_temp_new_i64(); 1023 TCGv_i64 t1 = tcg_temp_new_i64(); 1024 TCGv_i64 t2 = tcg_temp_new_i64(); 1025 TCGv_i64 t3 = tcg_temp_new_i64(); 1026 uint32_t i; 1027 1028 for (i = 0; i < oprsz; i += 8) { 1029 tcg_gen_ld_i64(t1, tcg_env, aofs + i); 1030 tcg_gen_ld_i64(t2, tcg_env, bofs + i); 1031 tcg_gen_ld_i64(t3, tcg_env, cofs + i); 1032 fni(t0, t1, t2, t3, c); 1033 tcg_gen_st_i64(t0, tcg_env, dofs + i); 1034 } 1035 tcg_temp_free_i64(t3); 1036 tcg_temp_free_i64(t2); 1037 tcg_temp_free_i64(t1); 1038 tcg_temp_free_i64(t0); 1039 } 1040 1041 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */ 1042 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1043 uint32_t oprsz, uint32_t tysz, TCGType type, 1044 bool load_dest, 1045 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 1046 { 1047 for (uint32_t i = 0; i < oprsz; i += tysz) { 1048 TCGv_vec t0 = tcg_temp_new_vec(type); 1049 TCGv_vec t1 = tcg_temp_new_vec(type); 1050 1051 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1052 if (load_dest) { 1053 tcg_gen_ld_vec(t1, tcg_env, dofs + i); 1054 } 1055 fni(vece, t1, t0); 1056 tcg_gen_st_vec(t1, tcg_env, dofs + i); 1057 } 1058 } 1059 1060 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 1061 using host vectors. */ 1062 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1063 uint32_t oprsz, uint32_t tysz, TCGType type, 1064 int64_t c, bool load_dest, 1065 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 1066 { 1067 for (uint32_t i = 0; i < oprsz; i += tysz) { 1068 TCGv_vec t0 = tcg_temp_new_vec(type); 1069 TCGv_vec t1 = tcg_temp_new_vec(type); 1070 1071 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1072 if (load_dest) { 1073 tcg_gen_ld_vec(t1, tcg_env, dofs + i); 1074 } 1075 fni(vece, t1, t0, c); 1076 tcg_gen_st_vec(t1, tcg_env, dofs + i); 1077 } 1078 } 1079 1080 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1081 uint32_t oprsz, uint32_t tysz, TCGType type, 1082 TCGv_vec c, bool scalar_first, 1083 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1084 { 1085 for (uint32_t i = 0; i < oprsz; i += tysz) { 1086 TCGv_vec t0 = tcg_temp_new_vec(type); 1087 TCGv_vec t1 = tcg_temp_new_vec(type); 1088 1089 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1090 if (scalar_first) { 1091 fni(vece, t1, c, t0); 1092 } else { 1093 fni(vece, t1, t0, c); 1094 } 1095 tcg_gen_st_vec(t1, tcg_env, dofs + i); 1096 } 1097 } 1098 1099 /* Expand OPSZ bytes worth of three-operand operations using host vectors. 
*/ 1100 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1101 uint32_t bofs, uint32_t oprsz, 1102 uint32_t tysz, TCGType type, bool load_dest, 1103 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1104 { 1105 for (uint32_t i = 0; i < oprsz; i += tysz) { 1106 TCGv_vec t0 = tcg_temp_new_vec(type); 1107 TCGv_vec t1 = tcg_temp_new_vec(type); 1108 TCGv_vec t2 = tcg_temp_new_vec(type); 1109 1110 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1111 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 1112 if (load_dest) { 1113 tcg_gen_ld_vec(t2, tcg_env, dofs + i); 1114 } 1115 fni(vece, t2, t0, t1); 1116 tcg_gen_st_vec(t2, tcg_env, dofs + i); 1117 } 1118 } 1119 1120 /* 1121 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1122 * using host vectors. 1123 */ 1124 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1125 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1126 TCGType type, int64_t c, 1127 bool load_dest, bool write_aofs, 1128 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1129 int64_t)) 1130 { 1131 for (uint32_t i = 0; i < oprsz; i += tysz) { 1132 TCGv_vec t0 = tcg_temp_new_vec(type); 1133 TCGv_vec t1 = tcg_temp_new_vec(type); 1134 TCGv_vec t2 = tcg_temp_new_vec(type); 1135 1136 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1137 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 1138 if (load_dest) { 1139 tcg_gen_ld_vec(t2, tcg_env, dofs + i); 1140 } 1141 fni(vece, t2, t0, t1, c); 1142 tcg_gen_st_vec(t2, tcg_env, dofs + i); 1143 if (write_aofs) { 1144 tcg_gen_st_vec(t0, tcg_env, aofs + i); 1145 } 1146 } 1147 } 1148 1149 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */ 1150 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1151 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1152 uint32_t tysz, TCGType type, bool write_aofs, 1153 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1154 TCGv_vec, TCGv_vec)) 1155 { 1156 for (uint32_t i = 0; i < oprsz; i += tysz) { 1157 TCGv_vec t0 = tcg_temp_new_vec(type); 1158 TCGv_vec t1 = tcg_temp_new_vec(type); 1159 TCGv_vec t2 = tcg_temp_new_vec(type); 1160 TCGv_vec t3 = tcg_temp_new_vec(type); 1161 1162 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 1163 tcg_gen_ld_vec(t2, tcg_env, bofs + i); 1164 tcg_gen_ld_vec(t3, tcg_env, cofs + i); 1165 fni(vece, t0, t1, t2, t3); 1166 tcg_gen_st_vec(t0, tcg_env, dofs + i); 1167 if (write_aofs) { 1168 tcg_gen_st_vec(t1, tcg_env, aofs + i); 1169 } 1170 } 1171 } 1172 1173 /* 1174 * Expand OPSZ bytes worth of four-vector operands and an immediate operand 1175 * using host vectors. 1176 */ 1177 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1178 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1179 uint32_t tysz, TCGType type, int64_t c, 1180 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1181 TCGv_vec, TCGv_vec, int64_t)) 1182 { 1183 for (uint32_t i = 0; i < oprsz; i += tysz) { 1184 TCGv_vec t0 = tcg_temp_new_vec(type); 1185 TCGv_vec t1 = tcg_temp_new_vec(type); 1186 TCGv_vec t2 = tcg_temp_new_vec(type); 1187 TCGv_vec t3 = tcg_temp_new_vec(type); 1188 1189 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 1190 tcg_gen_ld_vec(t2, tcg_env, bofs + i); 1191 tcg_gen_ld_vec(t3, tcg_env, cofs + i); 1192 fni(vece, t0, t1, t2, t3, c); 1193 tcg_gen_st_vec(t0, tcg_env, dofs + i); 1194 } 1195 } 1196 1197 /* Expand a vector two-operand operation. */ 1198 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1199 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1200 { 1201 const TCGOpcode *this_list = g->opt_opc ? 
: vecop_list_empty; 1202 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1203 TCGType type; 1204 uint32_t some; 1205 1206 check_size_align(oprsz, maxsz, dofs | aofs); 1207 check_overlap_2(dofs, aofs, maxsz); 1208 1209 type = 0; 1210 if (g->fniv) { 1211 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1212 } 1213 switch (type) { 1214 case TCG_TYPE_V256: 1215 /* Recall that ARM SVE allows vector sizes that are not a 1216 * power of 2, but always a multiple of 16. The intent is 1217 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1218 */ 1219 some = QEMU_ALIGN_DOWN(oprsz, 32); 1220 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1221 g->load_dest, g->fniv); 1222 if (some == oprsz) { 1223 break; 1224 } 1225 dofs += some; 1226 aofs += some; 1227 oprsz -= some; 1228 maxsz -= some; 1229 /* fallthru */ 1230 case TCG_TYPE_V128: 1231 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1232 g->load_dest, g->fniv); 1233 break; 1234 case TCG_TYPE_V64: 1235 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1236 g->load_dest, g->fniv); 1237 break; 1238 1239 case 0: 1240 if (g->fni8 && check_size_impl(oprsz, 8)) { 1241 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8); 1242 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1243 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4); 1244 } else { 1245 assert(g->fno != NULL); 1246 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1247 oprsz = maxsz; 1248 } 1249 break; 1250 1251 default: 1252 g_assert_not_reached(); 1253 } 1254 tcg_swap_vecop_list(hold_list); 1255 1256 if (oprsz < maxsz) { 1257 expand_clr(dofs + oprsz, maxsz - oprsz); 1258 } 1259 } 1260 1261 /* Expand a vector operation with two vectors and an immediate. */ 1262 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1263 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1264 { 1265 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1266 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1267 TCGType type; 1268 uint32_t some; 1269 1270 check_size_align(oprsz, maxsz, dofs | aofs); 1271 check_overlap_2(dofs, aofs, maxsz); 1272 1273 type = 0; 1274 if (g->fniv) { 1275 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1276 } 1277 switch (type) { 1278 case TCG_TYPE_V256: 1279 /* Recall that ARM SVE allows vector sizes that are not a 1280 * power of 2, but always a multiple of 16. The intent is 1281 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1282 */ 1283 some = QEMU_ALIGN_DOWN(oprsz, 32); 1284 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1285 c, g->load_dest, g->fniv); 1286 if (some == oprsz) { 1287 break; 1288 } 1289 dofs += some; 1290 aofs += some; 1291 oprsz -= some; 1292 maxsz -= some; 1293 /* fallthru */ 1294 case TCG_TYPE_V128: 1295 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1296 c, g->load_dest, g->fniv); 1297 break; 1298 case TCG_TYPE_V64: 1299 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1300 c, g->load_dest, g->fniv); 1301 break; 1302 1303 case 0: 1304 if (g->fni8 && check_size_impl(oprsz, 8)) { 1305 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1306 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1307 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1308 } else { 1309 if (g->fno) { 1310 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1311 } else { 1312 TCGv_i64 tcg_c = tcg_constant_i64(c); 1313 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1314 maxsz, c, g->fnoi); 1315 } 1316 oprsz = maxsz; 1317 } 1318 break; 1319 1320 default: 1321 g_assert_not_reached(); 1322 } 1323 tcg_swap_vecop_list(hold_list); 1324 1325 if (oprsz < maxsz) { 1326 expand_clr(dofs + oprsz, maxsz - oprsz); 1327 } 1328 } 1329 1330 /* Expand a vector operation with two vectors and a scalar. */ 1331 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1332 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1333 { 1334 TCGType type; 1335 1336 check_size_align(oprsz, maxsz, dofs | aofs); 1337 check_overlap_2(dofs, aofs, maxsz); 1338 1339 type = 0; 1340 if (g->fniv) { 1341 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1342 } 1343 if (type != 0) { 1344 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1345 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1346 TCGv_vec t_vec = tcg_temp_new_vec(type); 1347 uint32_t some; 1348 1349 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1350 1351 switch (type) { 1352 case TCG_TYPE_V256: 1353 /* Recall that ARM SVE allows vector sizes that are not a 1354 * power of 2, but always a multiple of 16. The intent is 1355 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1356 */ 1357 some = QEMU_ALIGN_DOWN(oprsz, 32); 1358 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1359 t_vec, g->scalar_first, g->fniv); 1360 if (some == oprsz) { 1361 break; 1362 } 1363 dofs += some; 1364 aofs += some; 1365 oprsz -= some; 1366 maxsz -= some; 1367 /* fallthru */ 1368 1369 case TCG_TYPE_V128: 1370 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1371 t_vec, g->scalar_first, g->fniv); 1372 break; 1373 1374 case TCG_TYPE_V64: 1375 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1376 t_vec, g->scalar_first, g->fniv); 1377 break; 1378 1379 default: 1380 g_assert_not_reached(); 1381 } 1382 tcg_temp_free_vec(t_vec); 1383 tcg_swap_vecop_list(hold_list); 1384 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1385 TCGv_i64 t64 = tcg_temp_new_i64(); 1386 1387 tcg_gen_dup_i64(g->vece, t64, c); 1388 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1389 tcg_temp_free_i64(t64); 1390 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1391 TCGv_i32 t32 = tcg_temp_new_i32(); 1392 1393 tcg_gen_extrl_i64_i32(t32, c); 1394 tcg_gen_dup_i32(g->vece, t32, t32); 1395 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1396 tcg_temp_free_i32(t32); 1397 } else { 1398 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1399 return; 1400 } 1401 1402 if (oprsz < maxsz) { 1403 expand_clr(dofs + oprsz, maxsz - oprsz); 1404 } 1405 } 1406 1407 /* Expand a vector three-operand operation. */ 1408 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1409 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1410 { 1411 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1412 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1413 TCGType type; 1414 uint32_t some; 1415 1416 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1417 check_overlap_3(dofs, aofs, bofs, maxsz); 1418 1419 type = 0; 1420 if (g->fniv) { 1421 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1422 } 1423 switch (type) { 1424 case TCG_TYPE_V256: 1425 /* Recall that ARM SVE allows vector sizes that are not a 1426 * power of 2, but always a multiple of 16. The intent is 1427 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1428 */ 1429 some = QEMU_ALIGN_DOWN(oprsz, 32); 1430 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1431 g->load_dest, g->fniv); 1432 if (some == oprsz) { 1433 break; 1434 } 1435 dofs += some; 1436 aofs += some; 1437 bofs += some; 1438 oprsz -= some; 1439 maxsz -= some; 1440 /* fallthru */ 1441 case TCG_TYPE_V128: 1442 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1443 g->load_dest, g->fniv); 1444 break; 1445 case TCG_TYPE_V64: 1446 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1447 g->load_dest, g->fniv); 1448 break; 1449 1450 case 0: 1451 if (g->fni8 && check_size_impl(oprsz, 8)) { 1452 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1453 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1454 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1455 } else { 1456 assert(g->fno != NULL); 1457 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1458 maxsz, g->data, g->fno); 1459 oprsz = maxsz; 1460 } 1461 break; 1462 1463 default: 1464 g_assert_not_reached(); 1465 } 1466 tcg_swap_vecop_list(hold_list); 1467 1468 if (oprsz < maxsz) { 1469 expand_clr(dofs + oprsz, maxsz - oprsz); 1470 } 1471 } 1472 1473 /* Expand a vector operation with three vectors and an immediate. 
*/ 1474 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1475 uint32_t oprsz, uint32_t maxsz, int64_t c, 1476 const GVecGen3i *g) 1477 { 1478 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1479 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1480 TCGType type; 1481 uint32_t some; 1482 1483 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1484 check_overlap_3(dofs, aofs, bofs, maxsz); 1485 1486 type = 0; 1487 if (g->fniv) { 1488 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1489 } 1490 switch (type) { 1491 case TCG_TYPE_V256: 1492 /* 1493 * Recall that ARM SVE allows vector sizes that are not a 1494 * power of 2, but always a multiple of 16. The intent is 1495 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1496 */ 1497 some = QEMU_ALIGN_DOWN(oprsz, 32); 1498 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1499 c, g->load_dest, g->write_aofs, g->fniv); 1500 if (some == oprsz) { 1501 break; 1502 } 1503 dofs += some; 1504 aofs += some; 1505 bofs += some; 1506 oprsz -= some; 1507 maxsz -= some; 1508 /* fallthru */ 1509 case TCG_TYPE_V128: 1510 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1511 c, g->load_dest, g->write_aofs, g->fniv); 1512 break; 1513 case TCG_TYPE_V64: 1514 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1515 c, g->load_dest, g->write_aofs, g->fniv); 1516 break; 1517 1518 case 0: 1519 if (g->fni8 && check_size_impl(oprsz, 8)) { 1520 expand_3i_i64(dofs, aofs, bofs, oprsz, c, 1521 g->load_dest, g->write_aofs, g->fni8); 1522 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1523 expand_3i_i32(dofs, aofs, bofs, oprsz, c, 1524 g->load_dest, g->write_aofs, g->fni4); 1525 } else { 1526 assert(g->fno != NULL); 1527 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1528 oprsz = maxsz; 1529 } 1530 break; 1531 1532 default: 1533 g_assert_not_reached(); 1534 } 1535 tcg_swap_vecop_list(hold_list); 1536 1537 if (oprsz < maxsz) { 1538 expand_clr(dofs + oprsz, maxsz - oprsz); 1539 } 1540 } 1541 1542 /* Expand a vector four-operand operation. */ 1543 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1544 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1545 { 1546 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1547 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1548 TCGType type; 1549 uint32_t some; 1550 1551 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1552 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1553 1554 type = 0; 1555 if (g->fniv) { 1556 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1557 } 1558 switch (type) { 1559 case TCG_TYPE_V256: 1560 /* Recall that ARM SVE allows vector sizes that are not a 1561 * power of 2, but always a multiple of 16. The intent is 1562 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1563 */ 1564 some = QEMU_ALIGN_DOWN(oprsz, 32); 1565 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1566 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1567 if (some == oprsz) { 1568 break; 1569 } 1570 dofs += some; 1571 aofs += some; 1572 bofs += some; 1573 cofs += some; 1574 oprsz -= some; 1575 maxsz -= some; 1576 /* fallthru */ 1577 case TCG_TYPE_V128: 1578 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1579 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1580 break; 1581 case TCG_TYPE_V64: 1582 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1583 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1584 break; 1585 1586 case 0: 1587 if (g->fni8 && check_size_impl(oprsz, 8)) { 1588 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1589 g->write_aofs, g->fni8); 1590 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1591 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1592 g->write_aofs, g->fni4); 1593 } else { 1594 assert(g->fno != NULL); 1595 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1596 oprsz, maxsz, g->data, g->fno); 1597 oprsz = maxsz; 1598 } 1599 break; 1600 1601 default: 1602 g_assert_not_reached(); 1603 } 1604 tcg_swap_vecop_list(hold_list); 1605 1606 if (oprsz < maxsz) { 1607 expand_clr(dofs + oprsz, maxsz - oprsz); 1608 } 1609 } 1610 1611 /* Expand a vector four-operand operation. */ 1612 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1613 uint32_t oprsz, uint32_t maxsz, int64_t c, 1614 const GVecGen4i *g) 1615 { 1616 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1617 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1618 TCGType type; 1619 uint32_t some; 1620 1621 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1622 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1623 1624 type = 0; 1625 if (g->fniv) { 1626 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1627 } 1628 switch (type) { 1629 case TCG_TYPE_V256: 1630 /* 1631 * Recall that ARM SVE allows vector sizes that are not a 1632 * power of 2, but always a multiple of 16. The intent is 1633 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1634 */ 1635 some = QEMU_ALIGN_DOWN(oprsz, 32); 1636 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some, 1637 32, TCG_TYPE_V256, c, g->fniv); 1638 if (some == oprsz) { 1639 break; 1640 } 1641 dofs += some; 1642 aofs += some; 1643 bofs += some; 1644 cofs += some; 1645 oprsz -= some; 1646 maxsz -= some; 1647 /* fallthru */ 1648 case TCG_TYPE_V128: 1649 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1650 16, TCG_TYPE_V128, c, g->fniv); 1651 break; 1652 case TCG_TYPE_V64: 1653 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1654 8, TCG_TYPE_V64, c, g->fniv); 1655 break; 1656 1657 case 0: 1658 if (g->fni8 && check_size_impl(oprsz, 8)) { 1659 expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8); 1660 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1661 expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4); 1662 } else { 1663 assert(g->fno != NULL); 1664 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1665 oprsz, maxsz, c, g->fno); 1666 oprsz = maxsz; 1667 } 1668 break; 1669 1670 default: 1671 g_assert_not_reached(); 1672 } 1673 tcg_swap_vecop_list(hold_list); 1674 1675 if (oprsz < maxsz) { 1676 expand_clr(dofs + oprsz, maxsz - oprsz); 1677 } 1678 } 1679 1680 /* 1681 * Expand specific vector operations. 
1682 */ 1683 1684 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1685 { 1686 tcg_gen_mov_vec(a, b); 1687 } 1688 1689 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1690 uint32_t oprsz, uint32_t maxsz) 1691 { 1692 static const GVecGen2 g = { 1693 .fni8 = tcg_gen_mov_i64, 1694 .fniv = vec_mov2, 1695 .fno = gen_helper_gvec_mov, 1696 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1697 }; 1698 if (dofs != aofs) { 1699 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1700 } else { 1701 check_size_align(oprsz, maxsz, dofs); 1702 if (oprsz < maxsz) { 1703 expand_clr(dofs + oprsz, maxsz - oprsz); 1704 } 1705 } 1706 } 1707 1708 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1709 uint32_t maxsz, TCGv_i32 in) 1710 { 1711 check_size_align(oprsz, maxsz, dofs); 1712 tcg_debug_assert(vece <= MO_32); 1713 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1714 } 1715 1716 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1717 uint32_t maxsz, TCGv_i64 in) 1718 { 1719 check_size_align(oprsz, maxsz, dofs); 1720 tcg_debug_assert(vece <= MO_64); 1721 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1722 } 1723 1724 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1725 uint32_t oprsz, uint32_t maxsz) 1726 { 1727 check_size_align(oprsz, maxsz, dofs); 1728 if (vece <= MO_64) { 1729 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1730 if (type != 0) { 1731 TCGv_vec t_vec = tcg_temp_new_vec(type); 1732 tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs); 1733 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1734 } else if (vece <= MO_32) { 1735 TCGv_i32 in = tcg_temp_ebb_new_i32(); 1736 switch (vece) { 1737 case MO_8: 1738 tcg_gen_ld8u_i32(in, tcg_env, aofs); 1739 break; 1740 case MO_16: 1741 tcg_gen_ld16u_i32(in, tcg_env, aofs); 1742 break; 1743 default: 1744 tcg_gen_ld_i32(in, tcg_env, aofs); 1745 break; 1746 } 1747 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1748 tcg_temp_free_i32(in); 1749 } else { 1750 TCGv_i64 in = tcg_temp_ebb_new_i64(); 1751 tcg_gen_ld_i64(in, tcg_env, aofs); 1752 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1753 tcg_temp_free_i64(in); 1754 } 1755 } else if (vece == 4) { 1756 /* 128-bit duplicate. */ 1757 int i; 1758 1759 tcg_debug_assert(oprsz >= 16); 1760 if (TCG_TARGET_HAS_v128) { 1761 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1762 1763 tcg_gen_ld_vec(in, tcg_env, aofs); 1764 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1765 tcg_gen_st_vec(in, tcg_env, dofs + i); 1766 } 1767 } else { 1768 TCGv_i64 in0 = tcg_temp_ebb_new_i64(); 1769 TCGv_i64 in1 = tcg_temp_ebb_new_i64(); 1770 1771 tcg_gen_ld_i64(in0, tcg_env, aofs); 1772 tcg_gen_ld_i64(in1, tcg_env, aofs + 8); 1773 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1774 tcg_gen_st_i64(in0, tcg_env, dofs + i); 1775 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8); 1776 } 1777 tcg_temp_free_i64(in0); 1778 tcg_temp_free_i64(in1); 1779 } 1780 if (oprsz < maxsz) { 1781 expand_clr(dofs + oprsz, maxsz - oprsz); 1782 } 1783 } else if (vece == 5) { 1784 /* 256-bit duplicate. 
*/ 1785 int i; 1786 1787 tcg_debug_assert(oprsz >= 32); 1788 tcg_debug_assert(oprsz % 32 == 0); 1789 if (TCG_TARGET_HAS_v256) { 1790 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256); 1791 1792 tcg_gen_ld_vec(in, tcg_env, aofs); 1793 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1794 tcg_gen_st_vec(in, tcg_env, dofs + i); 1795 } 1796 } else if (TCG_TARGET_HAS_v128) { 1797 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128); 1798 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128); 1799 1800 tcg_gen_ld_vec(in0, tcg_env, aofs); 1801 tcg_gen_ld_vec(in1, tcg_env, aofs + 16); 1802 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1803 tcg_gen_st_vec(in0, tcg_env, dofs + i); 1804 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16); 1805 } 1806 } else { 1807 TCGv_i64 in[4]; 1808 int j; 1809 1810 for (j = 0; j < 4; ++j) { 1811 in[j] = tcg_temp_ebb_new_i64(); 1812 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8); 1813 } 1814 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1815 for (j = 0; j < 4; ++j) { 1816 tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8); 1817 } 1818 } 1819 for (j = 0; j < 4; ++j) { 1820 tcg_temp_free_i64(in[j]); 1821 } 1822 } 1823 if (oprsz < maxsz) { 1824 expand_clr(dofs + oprsz, maxsz - oprsz); 1825 } 1826 } else { 1827 g_assert_not_reached(); 1828 } 1829 } 1830 1831 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1832 uint32_t maxsz, uint64_t x) 1833 { 1834 check_size_align(oprsz, maxsz, dofs); 1835 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x); 1836 } 1837 1838 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1839 uint32_t oprsz, uint32_t maxsz) 1840 { 1841 static const GVecGen2 g = { 1842 .fni8 = tcg_gen_not_i64, 1843 .fniv = tcg_gen_not_vec, 1844 .fno = gen_helper_gvec_not, 1845 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1846 }; 1847 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1848 } 1849 1850 /* Perform a vector addition using normal addition and a mask. The mask 1851 should be the sign bit of each lane. This 6-operation form is more 1852 efficient than separate additions when there are 4 or more lanes in 1853 the 64-bit operation. 
*/ 1854 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1855 { 1856 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1857 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1858 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 1859 1860 tcg_gen_andc_i64(t1, a, m); 1861 tcg_gen_andc_i64(t2, b, m); 1862 tcg_gen_xor_i64(t3, a, b); 1863 tcg_gen_add_i64(d, t1, t2); 1864 tcg_gen_and_i64(t3, t3, m); 1865 tcg_gen_xor_i64(d, d, t3); 1866 1867 tcg_temp_free_i64(t1); 1868 tcg_temp_free_i64(t2); 1869 tcg_temp_free_i64(t3); 1870 } 1871 1872 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1873 { 1874 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 1875 gen_addv_mask(d, a, b, m); 1876 } 1877 1878 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1879 { 1880 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 1881 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1882 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1883 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 1884 1885 tcg_gen_andc_i32(t1, a, m); 1886 tcg_gen_andc_i32(t2, b, m); 1887 tcg_gen_xor_i32(t3, a, b); 1888 tcg_gen_add_i32(d, t1, t2); 1889 tcg_gen_and_i32(t3, t3, m); 1890 tcg_gen_xor_i32(d, d, t3); 1891 1892 tcg_temp_free_i32(t1); 1893 tcg_temp_free_i32(t2); 1894 tcg_temp_free_i32(t3); 1895 } 1896 1897 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1898 { 1899 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 1900 gen_addv_mask(d, a, b, m); 1901 } 1902 1903 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1904 { 1905 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1906 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1907 1908 tcg_gen_andi_i32(t1, a, ~0xffff); 1909 tcg_gen_add_i32(t2, a, b); 1910 tcg_gen_add_i32(t1, t1, b); 1911 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 1912 1913 tcg_temp_free_i32(t1); 1914 tcg_temp_free_i32(t2); 1915 } 1916 1917 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1918 { 1919 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1920 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1921 1922 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1923 tcg_gen_add_i64(t2, a, b); 1924 tcg_gen_add_i64(t1, t1, b); 1925 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1926 1927 tcg_temp_free_i64(t1); 1928 tcg_temp_free_i64(t2); 1929 } 1930 1931 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1932 1933 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1934 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1935 { 1936 static const GVecGen3 g[4] = { 1937 { .fni8 = tcg_gen_vec_add8_i64, 1938 .fniv = tcg_gen_add_vec, 1939 .fno = gen_helper_gvec_add8, 1940 .opt_opc = vecop_list_add, 1941 .vece = MO_8 }, 1942 { .fni8 = tcg_gen_vec_add16_i64, 1943 .fniv = tcg_gen_add_vec, 1944 .fno = gen_helper_gvec_add16, 1945 .opt_opc = vecop_list_add, 1946 .vece = MO_16 }, 1947 { .fni4 = tcg_gen_add_i32, 1948 .fniv = tcg_gen_add_vec, 1949 .fno = gen_helper_gvec_add32, 1950 .opt_opc = vecop_list_add, 1951 .vece = MO_32 }, 1952 { .fni8 = tcg_gen_add_i64, 1953 .fniv = tcg_gen_add_vec, 1954 .fno = gen_helper_gvec_add64, 1955 .opt_opc = vecop_list_add, 1956 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1957 .vece = MO_64 }, 1958 }; 1959 1960 tcg_debug_assert(vece <= MO_64); 1961 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1962 } 1963 1964 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1965 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1966 { 1967 static const GVecGen2s g[4] = { 1968 { .fni8 = tcg_gen_vec_add8_i64, 1969 .fniv = tcg_gen_add_vec, 1970 .fno = gen_helper_gvec_adds8, 1971 .opt_opc = 
vecop_list_add, 1972 .vece = MO_8 }, 1973 { .fni8 = tcg_gen_vec_add16_i64, 1974 .fniv = tcg_gen_add_vec, 1975 .fno = gen_helper_gvec_adds16, 1976 .opt_opc = vecop_list_add, 1977 .vece = MO_16 }, 1978 { .fni4 = tcg_gen_add_i32, 1979 .fniv = tcg_gen_add_vec, 1980 .fno = gen_helper_gvec_adds32, 1981 .opt_opc = vecop_list_add, 1982 .vece = MO_32 }, 1983 { .fni8 = tcg_gen_add_i64, 1984 .fniv = tcg_gen_add_vec, 1985 .fno = gen_helper_gvec_adds64, 1986 .opt_opc = vecop_list_add, 1987 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1988 .vece = MO_64 }, 1989 }; 1990 1991 tcg_debug_assert(vece <= MO_64); 1992 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1993 } 1994 1995 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1996 int64_t c, uint32_t oprsz, uint32_t maxsz) 1997 { 1998 TCGv_i64 tmp = tcg_constant_i64(c); 1999 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 2000 } 2001 2002 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 2003 2004 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 2005 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2006 { 2007 static const GVecGen2s g[4] = { 2008 { .fni8 = tcg_gen_vec_sub8_i64, 2009 .fniv = tcg_gen_sub_vec, 2010 .fno = gen_helper_gvec_subs8, 2011 .opt_opc = vecop_list_sub, 2012 .vece = MO_8 }, 2013 { .fni8 = tcg_gen_vec_sub16_i64, 2014 .fniv = tcg_gen_sub_vec, 2015 .fno = gen_helper_gvec_subs16, 2016 .opt_opc = vecop_list_sub, 2017 .vece = MO_16 }, 2018 { .fni4 = tcg_gen_sub_i32, 2019 .fniv = tcg_gen_sub_vec, 2020 .fno = gen_helper_gvec_subs32, 2021 .opt_opc = vecop_list_sub, 2022 .vece = MO_32 }, 2023 { .fni8 = tcg_gen_sub_i64, 2024 .fniv = tcg_gen_sub_vec, 2025 .fno = gen_helper_gvec_subs64, 2026 .opt_opc = vecop_list_sub, 2027 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2028 .vece = MO_64 }, 2029 }; 2030 2031 tcg_debug_assert(vece <= MO_64); 2032 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2033 } 2034 2035 /* Perform a vector subtraction using normal subtraction and a mask. 2036 Compare gen_addv_mask above. 
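   Setting the sign bit of each A lane via (a | m) and clearing it in
   each B lane via (b & ~m) keeps every per-lane difference
   non-negative, so no borrow propagates into the neighbouring lane;
   the final xor with (eqv(a, b) & m) then restores the true sign bit
   of each lane.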
*/ 2037 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 2038 { 2039 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2040 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2041 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2042 2043 tcg_gen_or_i64(t1, a, m); 2044 tcg_gen_andc_i64(t2, b, m); 2045 tcg_gen_eqv_i64(t3, a, b); 2046 tcg_gen_sub_i64(d, t1, t2); 2047 tcg_gen_and_i64(t3, t3, m); 2048 tcg_gen_xor_i64(d, d, t3); 2049 2050 tcg_temp_free_i64(t1); 2051 tcg_temp_free_i64(t2); 2052 tcg_temp_free_i64(t3); 2053 } 2054 2055 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2056 { 2057 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2058 gen_subv_mask(d, a, b, m); 2059 } 2060 2061 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2062 { 2063 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 2064 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2065 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2066 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 2067 2068 tcg_gen_or_i32(t1, a, m); 2069 tcg_gen_andc_i32(t2, b, m); 2070 tcg_gen_eqv_i32(t3, a, b); 2071 tcg_gen_sub_i32(d, t1, t2); 2072 tcg_gen_and_i32(t3, t3, m); 2073 tcg_gen_xor_i32(d, d, t3); 2074 2075 tcg_temp_free_i32(t1); 2076 tcg_temp_free_i32(t2); 2077 tcg_temp_free_i32(t3); 2078 } 2079 2080 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2081 { 2082 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2083 gen_subv_mask(d, a, b, m); 2084 } 2085 2086 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2087 { 2088 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2089 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2090 2091 tcg_gen_andi_i32(t1, b, ~0xffff); 2092 tcg_gen_sub_i32(t2, a, b); 2093 tcg_gen_sub_i32(t1, a, t1); 2094 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 2095 2096 tcg_temp_free_i32(t1); 2097 tcg_temp_free_i32(t2); 2098 } 2099 2100 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2101 { 2102 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2103 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2104 2105 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2106 tcg_gen_sub_i64(t2, a, b); 2107 tcg_gen_sub_i64(t1, a, t1); 2108 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2109 2110 tcg_temp_free_i64(t1); 2111 tcg_temp_free_i64(t2); 2112 } 2113 2114 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 2115 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2116 { 2117 static const GVecGen3 g[4] = { 2118 { .fni8 = tcg_gen_vec_sub8_i64, 2119 .fniv = tcg_gen_sub_vec, 2120 .fno = gen_helper_gvec_sub8, 2121 .opt_opc = vecop_list_sub, 2122 .vece = MO_8 }, 2123 { .fni8 = tcg_gen_vec_sub16_i64, 2124 .fniv = tcg_gen_sub_vec, 2125 .fno = gen_helper_gvec_sub16, 2126 .opt_opc = vecop_list_sub, 2127 .vece = MO_16 }, 2128 { .fni4 = tcg_gen_sub_i32, 2129 .fniv = tcg_gen_sub_vec, 2130 .fno = gen_helper_gvec_sub32, 2131 .opt_opc = vecop_list_sub, 2132 .vece = MO_32 }, 2133 { .fni8 = tcg_gen_sub_i64, 2134 .fniv = tcg_gen_sub_vec, 2135 .fno = gen_helper_gvec_sub64, 2136 .opt_opc = vecop_list_sub, 2137 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2138 .vece = MO_64 }, 2139 }; 2140 2141 tcg_debug_assert(vece <= MO_64); 2142 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2143 } 2144 2145 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 2146 2147 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 2148 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2149 { 2150 static const GVecGen3 g[4] = { 2151 { .fniv = tcg_gen_mul_vec, 2152 .fno = gen_helper_gvec_mul8, 2153 .opt_opc = vecop_list_mul, 2154 .vece = MO_8 }, 2155 { 
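          /* As with MO_8 above, no .fni8/.fni4 integer fallback is
             provided for the 8- and 16-bit multiplies; those sizes
             expand via the vector opcode or the out-of-line helper.  */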
.fniv = tcg_gen_mul_vec, 2156 .fno = gen_helper_gvec_mul16, 2157 .opt_opc = vecop_list_mul, 2158 .vece = MO_16 }, 2159 { .fni4 = tcg_gen_mul_i32, 2160 .fniv = tcg_gen_mul_vec, 2161 .fno = gen_helper_gvec_mul32, 2162 .opt_opc = vecop_list_mul, 2163 .vece = MO_32 }, 2164 { .fni8 = tcg_gen_mul_i64, 2165 .fniv = tcg_gen_mul_vec, 2166 .fno = gen_helper_gvec_mul64, 2167 .opt_opc = vecop_list_mul, 2168 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2169 .vece = MO_64 }, 2170 }; 2171 2172 tcg_debug_assert(vece <= MO_64); 2173 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2174 } 2175 2176 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 2177 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2178 { 2179 static const GVecGen2s g[4] = { 2180 { .fniv = tcg_gen_mul_vec, 2181 .fno = gen_helper_gvec_muls8, 2182 .opt_opc = vecop_list_mul, 2183 .vece = MO_8 }, 2184 { .fniv = tcg_gen_mul_vec, 2185 .fno = gen_helper_gvec_muls16, 2186 .opt_opc = vecop_list_mul, 2187 .vece = MO_16 }, 2188 { .fni4 = tcg_gen_mul_i32, 2189 .fniv = tcg_gen_mul_vec, 2190 .fno = gen_helper_gvec_muls32, 2191 .opt_opc = vecop_list_mul, 2192 .vece = MO_32 }, 2193 { .fni8 = tcg_gen_mul_i64, 2194 .fniv = tcg_gen_mul_vec, 2195 .fno = gen_helper_gvec_muls64, 2196 .opt_opc = vecop_list_mul, 2197 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2198 .vece = MO_64 }, 2199 }; 2200 2201 tcg_debug_assert(vece <= MO_64); 2202 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2203 } 2204 2205 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 2206 int64_t c, uint32_t oprsz, uint32_t maxsz) 2207 { 2208 TCGv_i64 tmp = tcg_constant_i64(c); 2209 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 2210 } 2211 2212 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2213 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2214 { 2215 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 2216 static const GVecGen3 g[4] = { 2217 { .fniv = tcg_gen_ssadd_vec, 2218 .fno = gen_helper_gvec_ssadd8, 2219 .opt_opc = vecop_list, 2220 .vece = MO_8 }, 2221 { .fniv = tcg_gen_ssadd_vec, 2222 .fno = gen_helper_gvec_ssadd16, 2223 .opt_opc = vecop_list, 2224 .vece = MO_16 }, 2225 { .fniv = tcg_gen_ssadd_vec, 2226 .fno = gen_helper_gvec_ssadd32, 2227 .opt_opc = vecop_list, 2228 .vece = MO_32 }, 2229 { .fniv = tcg_gen_ssadd_vec, 2230 .fno = gen_helper_gvec_ssadd64, 2231 .opt_opc = vecop_list, 2232 .vece = MO_64 }, 2233 }; 2234 tcg_debug_assert(vece <= MO_64); 2235 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2236 } 2237 2238 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2239 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2240 { 2241 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2242 static const GVecGen3 g[4] = { 2243 { .fniv = tcg_gen_sssub_vec, 2244 .fno = gen_helper_gvec_sssub8, 2245 .opt_opc = vecop_list, 2246 .vece = MO_8 }, 2247 { .fniv = tcg_gen_sssub_vec, 2248 .fno = gen_helper_gvec_sssub16, 2249 .opt_opc = vecop_list, 2250 .vece = MO_16 }, 2251 { .fniv = tcg_gen_sssub_vec, 2252 .fno = gen_helper_gvec_sssub32, 2253 .opt_opc = vecop_list, 2254 .vece = MO_32 }, 2255 { .fniv = tcg_gen_sssub_vec, 2256 .fno = gen_helper_gvec_sssub64, 2257 .opt_opc = vecop_list, 2258 .vece = MO_64 }, 2259 }; 2260 tcg_debug_assert(vece <= MO_64); 2261 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2262 } 2263 2264 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2265 { 2266 TCGv_i32 max = tcg_constant_i32(-1); 2267 tcg_gen_add_i32(d, a, b); 2268 
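    /* The addition wrapped iff the result is (unsigned) less than A;
       in that case saturate to the all-ones value held in max.  */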
tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2269 } 2270 2271 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2272 { 2273 TCGv_i64 max = tcg_constant_i64(-1); 2274 tcg_gen_add_i64(d, a, b); 2275 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2276 } 2277 2278 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2279 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2280 { 2281 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2282 static const GVecGen3 g[4] = { 2283 { .fniv = tcg_gen_usadd_vec, 2284 .fno = gen_helper_gvec_usadd8, 2285 .opt_opc = vecop_list, 2286 .vece = MO_8 }, 2287 { .fniv = tcg_gen_usadd_vec, 2288 .fno = gen_helper_gvec_usadd16, 2289 .opt_opc = vecop_list, 2290 .vece = MO_16 }, 2291 { .fni4 = tcg_gen_usadd_i32, 2292 .fniv = tcg_gen_usadd_vec, 2293 .fno = gen_helper_gvec_usadd32, 2294 .opt_opc = vecop_list, 2295 .vece = MO_32 }, 2296 { .fni8 = tcg_gen_usadd_i64, 2297 .fniv = tcg_gen_usadd_vec, 2298 .fno = gen_helper_gvec_usadd64, 2299 .opt_opc = vecop_list, 2300 .vece = MO_64 } 2301 }; 2302 tcg_debug_assert(vece <= MO_64); 2303 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2304 } 2305 2306 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2307 { 2308 TCGv_i32 min = tcg_constant_i32(0); 2309 tcg_gen_sub_i32(d, a, b); 2310 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2311 } 2312 2313 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2314 { 2315 TCGv_i64 min = tcg_constant_i64(0); 2316 tcg_gen_sub_i64(d, a, b); 2317 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2318 } 2319 2320 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2321 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2322 { 2323 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2324 static const GVecGen3 g[4] = { 2325 { .fniv = tcg_gen_ussub_vec, 2326 .fno = gen_helper_gvec_ussub8, 2327 .opt_opc = vecop_list, 2328 .vece = MO_8 }, 2329 { .fniv = tcg_gen_ussub_vec, 2330 .fno = gen_helper_gvec_ussub16, 2331 .opt_opc = vecop_list, 2332 .vece = MO_16 }, 2333 { .fni4 = tcg_gen_ussub_i32, 2334 .fniv = tcg_gen_ussub_vec, 2335 .fno = gen_helper_gvec_ussub32, 2336 .opt_opc = vecop_list, 2337 .vece = MO_32 }, 2338 { .fni8 = tcg_gen_ussub_i64, 2339 .fniv = tcg_gen_ussub_vec, 2340 .fno = gen_helper_gvec_ussub64, 2341 .opt_opc = vecop_list, 2342 .vece = MO_64 } 2343 }; 2344 tcg_debug_assert(vece <= MO_64); 2345 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2346 } 2347 2348 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2349 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2350 { 2351 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2352 static const GVecGen3 g[4] = { 2353 { .fniv = tcg_gen_smin_vec, 2354 .fno = gen_helper_gvec_smin8, 2355 .opt_opc = vecop_list, 2356 .vece = MO_8 }, 2357 { .fniv = tcg_gen_smin_vec, 2358 .fno = gen_helper_gvec_smin16, 2359 .opt_opc = vecop_list, 2360 .vece = MO_16 }, 2361 { .fni4 = tcg_gen_smin_i32, 2362 .fniv = tcg_gen_smin_vec, 2363 .fno = gen_helper_gvec_smin32, 2364 .opt_opc = vecop_list, 2365 .vece = MO_32 }, 2366 { .fni8 = tcg_gen_smin_i64, 2367 .fniv = tcg_gen_smin_vec, 2368 .fno = gen_helper_gvec_smin64, 2369 .opt_opc = vecop_list, 2370 .vece = MO_64 } 2371 }; 2372 tcg_debug_assert(vece <= MO_64); 2373 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2374 } 2375 2376 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2377 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2378 
{ 2379 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2380 static const GVecGen3 g[4] = { 2381 { .fniv = tcg_gen_umin_vec, 2382 .fno = gen_helper_gvec_umin8, 2383 .opt_opc = vecop_list, 2384 .vece = MO_8 }, 2385 { .fniv = tcg_gen_umin_vec, 2386 .fno = gen_helper_gvec_umin16, 2387 .opt_opc = vecop_list, 2388 .vece = MO_16 }, 2389 { .fni4 = tcg_gen_umin_i32, 2390 .fniv = tcg_gen_umin_vec, 2391 .fno = gen_helper_gvec_umin32, 2392 .opt_opc = vecop_list, 2393 .vece = MO_32 }, 2394 { .fni8 = tcg_gen_umin_i64, 2395 .fniv = tcg_gen_umin_vec, 2396 .fno = gen_helper_gvec_umin64, 2397 .opt_opc = vecop_list, 2398 .vece = MO_64 } 2399 }; 2400 tcg_debug_assert(vece <= MO_64); 2401 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2402 } 2403 2404 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2405 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2406 { 2407 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2408 static const GVecGen3 g[4] = { 2409 { .fniv = tcg_gen_smax_vec, 2410 .fno = gen_helper_gvec_smax8, 2411 .opt_opc = vecop_list, 2412 .vece = MO_8 }, 2413 { .fniv = tcg_gen_smax_vec, 2414 .fno = gen_helper_gvec_smax16, 2415 .opt_opc = vecop_list, 2416 .vece = MO_16 }, 2417 { .fni4 = tcg_gen_smax_i32, 2418 .fniv = tcg_gen_smax_vec, 2419 .fno = gen_helper_gvec_smax32, 2420 .opt_opc = vecop_list, 2421 .vece = MO_32 }, 2422 { .fni8 = tcg_gen_smax_i64, 2423 .fniv = tcg_gen_smax_vec, 2424 .fno = gen_helper_gvec_smax64, 2425 .opt_opc = vecop_list, 2426 .vece = MO_64 } 2427 }; 2428 tcg_debug_assert(vece <= MO_64); 2429 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2430 } 2431 2432 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2433 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2434 { 2435 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2436 static const GVecGen3 g[4] = { 2437 { .fniv = tcg_gen_umax_vec, 2438 .fno = gen_helper_gvec_umax8, 2439 .opt_opc = vecop_list, 2440 .vece = MO_8 }, 2441 { .fniv = tcg_gen_umax_vec, 2442 .fno = gen_helper_gvec_umax16, 2443 .opt_opc = vecop_list, 2444 .vece = MO_16 }, 2445 { .fni4 = tcg_gen_umax_i32, 2446 .fniv = tcg_gen_umax_vec, 2447 .fno = gen_helper_gvec_umax32, 2448 .opt_opc = vecop_list, 2449 .vece = MO_32 }, 2450 { .fni8 = tcg_gen_umax_i64, 2451 .fniv = tcg_gen_umax_vec, 2452 .fno = gen_helper_gvec_umax64, 2453 .opt_opc = vecop_list, 2454 .vece = MO_64 } 2455 }; 2456 tcg_debug_assert(vece <= MO_64); 2457 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2458 } 2459 2460 /* Perform a vector negation using normal negation and a mask. 2461 Compare gen_subv_mask above. 
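   This is the A = 0 special case of that subtraction: (a | m) reduces
   to m, eqv(a, b) reduces to ~b, and one temporary disappears, leaving
   d = (m - (b & ~m)) ^ (m & ~b).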
*/ 2462 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2463 { 2464 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2465 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2466 2467 tcg_gen_andc_i64(t3, m, b); 2468 tcg_gen_andc_i64(t2, b, m); 2469 tcg_gen_sub_i64(d, m, t2); 2470 tcg_gen_xor_i64(d, d, t3); 2471 2472 tcg_temp_free_i64(t2); 2473 tcg_temp_free_i64(t3); 2474 } 2475 2476 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2477 { 2478 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2479 gen_negv_mask(d, b, m); 2480 } 2481 2482 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2483 { 2484 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2485 gen_negv_mask(d, b, m); 2486 } 2487 2488 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2489 { 2490 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2491 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2492 2493 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2494 tcg_gen_neg_i64(t2, b); 2495 tcg_gen_neg_i64(t1, t1); 2496 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2497 2498 tcg_temp_free_i64(t1); 2499 tcg_temp_free_i64(t2); 2500 } 2501 2502 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2503 uint32_t oprsz, uint32_t maxsz) 2504 { 2505 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2506 static const GVecGen2 g[4] = { 2507 { .fni8 = tcg_gen_vec_neg8_i64, 2508 .fniv = tcg_gen_neg_vec, 2509 .fno = gen_helper_gvec_neg8, 2510 .opt_opc = vecop_list, 2511 .vece = MO_8 }, 2512 { .fni8 = tcg_gen_vec_neg16_i64, 2513 .fniv = tcg_gen_neg_vec, 2514 .fno = gen_helper_gvec_neg16, 2515 .opt_opc = vecop_list, 2516 .vece = MO_16 }, 2517 { .fni4 = tcg_gen_neg_i32, 2518 .fniv = tcg_gen_neg_vec, 2519 .fno = gen_helper_gvec_neg32, 2520 .opt_opc = vecop_list, 2521 .vece = MO_32 }, 2522 { .fni8 = tcg_gen_neg_i64, 2523 .fniv = tcg_gen_neg_vec, 2524 .fno = gen_helper_gvec_neg64, 2525 .opt_opc = vecop_list, 2526 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2527 .vece = MO_64 }, 2528 }; 2529 2530 tcg_debug_assert(vece <= MO_64); 2531 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2532 } 2533 2534 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2535 { 2536 TCGv_i64 t = tcg_temp_ebb_new_i64(); 2537 int nbit = 8 << vece; 2538 2539 /* Create -1 for each negative element. */ 2540 tcg_gen_shri_i64(t, b, nbit - 1); 2541 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2542 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2543 2544 /* 2545 * Invert (via xor -1) and add one. 2546 * Because of the ordering the msb is cleared, 2547 * so we never have carry into the next element. 
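 * For example, an MO_8 lane holding 0xfb (-5) has t = 0xff, so the
 * xor gives 0x04 and adding the low bit of t yields 0x05; the most
 * negative value 0x80 maps back to itself (0x7f + 1).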
2548 */ 2549 tcg_gen_xor_i64(d, b, t); 2550 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2551 tcg_gen_add_i64(d, d, t); 2552 2553 tcg_temp_free_i64(t); 2554 } 2555 2556 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2557 { 2558 gen_absv_mask(d, b, MO_8); 2559 } 2560 2561 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2562 { 2563 gen_absv_mask(d, b, MO_16); 2564 } 2565 2566 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2567 uint32_t oprsz, uint32_t maxsz) 2568 { 2569 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2570 static const GVecGen2 g[4] = { 2571 { .fni8 = tcg_gen_vec_abs8_i64, 2572 .fniv = tcg_gen_abs_vec, 2573 .fno = gen_helper_gvec_abs8, 2574 .opt_opc = vecop_list, 2575 .vece = MO_8 }, 2576 { .fni8 = tcg_gen_vec_abs16_i64, 2577 .fniv = tcg_gen_abs_vec, 2578 .fno = gen_helper_gvec_abs16, 2579 .opt_opc = vecop_list, 2580 .vece = MO_16 }, 2581 { .fni4 = tcg_gen_abs_i32, 2582 .fniv = tcg_gen_abs_vec, 2583 .fno = gen_helper_gvec_abs32, 2584 .opt_opc = vecop_list, 2585 .vece = MO_32 }, 2586 { .fni8 = tcg_gen_abs_i64, 2587 .fniv = tcg_gen_abs_vec, 2588 .fno = gen_helper_gvec_abs64, 2589 .opt_opc = vecop_list, 2590 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2591 .vece = MO_64 }, 2592 }; 2593 2594 tcg_debug_assert(vece <= MO_64); 2595 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2596 } 2597 2598 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2599 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2600 { 2601 static const GVecGen3 g = { 2602 .fni8 = tcg_gen_and_i64, 2603 .fniv = tcg_gen_and_vec, 2604 .fno = gen_helper_gvec_and, 2605 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2606 }; 2607 2608 if (aofs == bofs) { 2609 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2610 } else { 2611 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2612 } 2613 } 2614 2615 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2616 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2617 { 2618 static const GVecGen3 g = { 2619 .fni8 = tcg_gen_or_i64, 2620 .fniv = tcg_gen_or_vec, 2621 .fno = gen_helper_gvec_or, 2622 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2623 }; 2624 2625 if (aofs == bofs) { 2626 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2627 } else { 2628 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2629 } 2630 } 2631 2632 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2633 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2634 { 2635 static const GVecGen3 g = { 2636 .fni8 = tcg_gen_xor_i64, 2637 .fniv = tcg_gen_xor_vec, 2638 .fno = gen_helper_gvec_xor, 2639 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2640 }; 2641 2642 if (aofs == bofs) { 2643 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2644 } else { 2645 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2646 } 2647 } 2648 2649 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2650 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2651 { 2652 static const GVecGen3 g = { 2653 .fni8 = tcg_gen_andc_i64, 2654 .fniv = tcg_gen_andc_vec, 2655 .fno = gen_helper_gvec_andc, 2656 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2657 }; 2658 2659 if (aofs == bofs) { 2660 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2661 } else { 2662 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2663 } 2664 } 2665 2666 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2667 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2668 { 2669 static const GVecGen3 g = { 2670 .fni8 = tcg_gen_orc_i64, 2671 .fniv = tcg_gen_orc_vec, 2672 
.fno = gen_helper_gvec_orc, 2673 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2674 }; 2675 2676 if (aofs == bofs) { 2677 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2678 } else { 2679 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2680 } 2681 } 2682 2683 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2684 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2685 { 2686 static const GVecGen3 g = { 2687 .fni8 = tcg_gen_nand_i64, 2688 .fniv = tcg_gen_nand_vec, 2689 .fno = gen_helper_gvec_nand, 2690 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2691 }; 2692 2693 if (aofs == bofs) { 2694 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2695 } else { 2696 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2697 } 2698 } 2699 2700 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2701 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2702 { 2703 static const GVecGen3 g = { 2704 .fni8 = tcg_gen_nor_i64, 2705 .fniv = tcg_gen_nor_vec, 2706 .fno = gen_helper_gvec_nor, 2707 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2708 }; 2709 2710 if (aofs == bofs) { 2711 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2712 } else { 2713 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2714 } 2715 } 2716 2717 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2718 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2719 { 2720 static const GVecGen3 g = { 2721 .fni8 = tcg_gen_eqv_i64, 2722 .fniv = tcg_gen_eqv_vec, 2723 .fno = gen_helper_gvec_eqv, 2724 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2725 }; 2726 2727 if (aofs == bofs) { 2728 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2729 } else { 2730 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2731 } 2732 } 2733 2734 static const GVecGen2s gop_ands = { 2735 .fni8 = tcg_gen_and_i64, 2736 .fniv = tcg_gen_and_vec, 2737 .fno = gen_helper_gvec_ands, 2738 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2739 .vece = MO_64 2740 }; 2741 2742 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2743 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2744 { 2745 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2746 tcg_gen_dup_i64(vece, tmp, c); 2747 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2748 tcg_temp_free_i64(tmp); 2749 } 2750 2751 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2752 int64_t c, uint32_t oprsz, uint32_t maxsz) 2753 { 2754 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2755 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2756 } 2757 2758 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs, 2759 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2760 { 2761 static GVecGen2s g = { 2762 .fni8 = tcg_gen_andc_i64, 2763 .fniv = tcg_gen_andc_vec, 2764 .fno = gen_helper_gvec_andcs, 2765 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2766 .vece = MO_64 2767 }; 2768 2769 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2770 tcg_gen_dup_i64(vece, tmp, c); 2771 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g); 2772 tcg_temp_free_i64(tmp); 2773 } 2774 2775 static const GVecGen2s gop_xors = { 2776 .fni8 = tcg_gen_xor_i64, 2777 .fniv = tcg_gen_xor_vec, 2778 .fno = gen_helper_gvec_xors, 2779 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2780 .vece = MO_64 2781 }; 2782 2783 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2784 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2785 { 2786 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2787 tcg_gen_dup_i64(vece, tmp, c); 2788 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2789 tcg_temp_free_i64(tmp); 2790 } 2791 
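/*
 * Illustrative sketch, compiled out: the *s/*i forms in this group all
 * rely on dup_const()/tcg_gen_dup_i64() replicating the scalar operand
 * across every lane of a 64-bit word.  The helper below (hypothetical
 * name, plain host C rather than TCG) shows the replication pattern
 * those expansions assume.
 */
#if 0
static uint64_t dup_const_demo(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return (c & 0xff) * 0x0101010101010101ull;
    case MO_16:
        return (c & 0xffff) * 0x0001000100010001ull;
    case MO_32:
        return (uint32_t)c * 0x0000000100000001ull;
    default:
        return c;   /* MO_64: already full width */
    }
}
/* e.g. dup_const_demo(MO_16, 0x8000) == 0x8000800080008000ull,
   matching the mask used by tcg_gen_vec_add16_i64 above.  */
#endif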
2792 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2793 int64_t c, uint32_t oprsz, uint32_t maxsz) 2794 { 2795 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2796 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2797 } 2798 2799 static const GVecGen2s gop_ors = { 2800 .fni8 = tcg_gen_or_i64, 2801 .fniv = tcg_gen_or_vec, 2802 .fno = gen_helper_gvec_ors, 2803 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2804 .vece = MO_64 2805 }; 2806 2807 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2808 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2809 { 2810 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2811 tcg_gen_dup_i64(vece, tmp, c); 2812 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2813 tcg_temp_free_i64(tmp); 2814 } 2815 2816 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2817 int64_t c, uint32_t oprsz, uint32_t maxsz) 2818 { 2819 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2820 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2821 } 2822 2823 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2824 { 2825 uint64_t mask = dup_const(MO_8, 0xff << c); 2826 tcg_gen_shli_i64(d, a, c); 2827 tcg_gen_andi_i64(d, d, mask); 2828 } 2829 2830 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2831 { 2832 uint64_t mask = dup_const(MO_16, 0xffff << c); 2833 tcg_gen_shli_i64(d, a, c); 2834 tcg_gen_andi_i64(d, d, mask); 2835 } 2836 2837 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2838 { 2839 uint32_t mask = dup_const(MO_8, 0xff << c); 2840 tcg_gen_shli_i32(d, a, c); 2841 tcg_gen_andi_i32(d, d, mask); 2842 } 2843 2844 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2845 { 2846 uint32_t mask = dup_const(MO_16, 0xffff << c); 2847 tcg_gen_shli_i32(d, a, c); 2848 tcg_gen_andi_i32(d, d, mask); 2849 } 2850 2851 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2852 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2853 { 2854 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2855 static const GVecGen2i g[4] = { 2856 { .fni8 = tcg_gen_vec_shl8i_i64, 2857 .fniv = tcg_gen_shli_vec, 2858 .fno = gen_helper_gvec_shl8i, 2859 .opt_opc = vecop_list, 2860 .vece = MO_8 }, 2861 { .fni8 = tcg_gen_vec_shl16i_i64, 2862 .fniv = tcg_gen_shli_vec, 2863 .fno = gen_helper_gvec_shl16i, 2864 .opt_opc = vecop_list, 2865 .vece = MO_16 }, 2866 { .fni4 = tcg_gen_shli_i32, 2867 .fniv = tcg_gen_shli_vec, 2868 .fno = gen_helper_gvec_shl32i, 2869 .opt_opc = vecop_list, 2870 .vece = MO_32 }, 2871 { .fni8 = tcg_gen_shli_i64, 2872 .fniv = tcg_gen_shli_vec, 2873 .fno = gen_helper_gvec_shl64i, 2874 .opt_opc = vecop_list, 2875 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2876 .vece = MO_64 }, 2877 }; 2878 2879 tcg_debug_assert(vece <= MO_64); 2880 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2881 if (shift == 0) { 2882 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2883 } else { 2884 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2885 } 2886 } 2887 2888 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2889 { 2890 uint64_t mask = dup_const(MO_8, 0xff >> c); 2891 tcg_gen_shri_i64(d, a, c); 2892 tcg_gen_andi_i64(d, d, mask); 2893 } 2894 2895 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2896 { 2897 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2898 tcg_gen_shri_i64(d, a, c); 2899 tcg_gen_andi_i64(d, d, mask); 2900 } 2901 2902 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2903 { 2904 uint32_t mask = 
dup_const(MO_8, 0xff >> c); 2905 tcg_gen_shri_i32(d, a, c); 2906 tcg_gen_andi_i32(d, d, mask); 2907 } 2908 2909 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2910 { 2911 uint32_t mask = dup_const(MO_16, 0xffff >> c); 2912 tcg_gen_shri_i32(d, a, c); 2913 tcg_gen_andi_i32(d, d, mask); 2914 } 2915 2916 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2917 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2918 { 2919 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2920 static const GVecGen2i g[4] = { 2921 { .fni8 = tcg_gen_vec_shr8i_i64, 2922 .fniv = tcg_gen_shri_vec, 2923 .fno = gen_helper_gvec_shr8i, 2924 .opt_opc = vecop_list, 2925 .vece = MO_8 }, 2926 { .fni8 = tcg_gen_vec_shr16i_i64, 2927 .fniv = tcg_gen_shri_vec, 2928 .fno = gen_helper_gvec_shr16i, 2929 .opt_opc = vecop_list, 2930 .vece = MO_16 }, 2931 { .fni4 = tcg_gen_shri_i32, 2932 .fniv = tcg_gen_shri_vec, 2933 .fno = gen_helper_gvec_shr32i, 2934 .opt_opc = vecop_list, 2935 .vece = MO_32 }, 2936 { .fni8 = tcg_gen_shri_i64, 2937 .fniv = tcg_gen_shri_vec, 2938 .fno = gen_helper_gvec_shr64i, 2939 .opt_opc = vecop_list, 2940 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2941 .vece = MO_64 }, 2942 }; 2943 2944 tcg_debug_assert(vece <= MO_64); 2945 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2946 if (shift == 0) { 2947 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2948 } else { 2949 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2950 } 2951 } 2952 2953 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2954 { 2955 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2956 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2957 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2958 2959 tcg_gen_shri_i64(d, a, c); 2960 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2961 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2962 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2963 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2964 tcg_temp_free_i64(s); 2965 } 2966 2967 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2968 { 2969 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2970 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2971 TCGv_i64 s = tcg_temp_ebb_new_i64(); 2972 2973 tcg_gen_shri_i64(d, a, c); 2974 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2975 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2976 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2977 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2978 tcg_temp_free_i64(s); 2979 } 2980 2981 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2982 { 2983 uint32_t s_mask = dup_const(MO_8, 0x80 >> c); 2984 uint32_t c_mask = dup_const(MO_8, 0xff >> c); 2985 TCGv_i32 s = tcg_temp_ebb_new_i32(); 2986 2987 tcg_gen_shri_i32(d, a, c); 2988 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 2989 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 2990 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */ 2991 tcg_gen_or_i32(d, d, s); /* include sign extension */ 2992 tcg_temp_free_i32(s); 2993 } 2994 2995 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2996 { 2997 uint32_t s_mask = dup_const(MO_16, 0x8000 >> c); 2998 uint32_t c_mask = dup_const(MO_16, 0xffff >> c); 2999 TCGv_i32 s = tcg_temp_ebb_new_i32(); 3000 3001 tcg_gen_shri_i32(d, a, c); 3002 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 3003 tcg_gen_andi_i32(d, 
d, c_mask); /* clear out bits above sign */ 3004 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 3005 tcg_gen_or_i32(d, d, s); /* include sign extension */ 3006 tcg_temp_free_i32(s); 3007 } 3008 3009 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 3010 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3011 { 3012 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 3013 static const GVecGen2i g[4] = { 3014 { .fni8 = tcg_gen_vec_sar8i_i64, 3015 .fniv = tcg_gen_sari_vec, 3016 .fno = gen_helper_gvec_sar8i, 3017 .opt_opc = vecop_list, 3018 .vece = MO_8 }, 3019 { .fni8 = tcg_gen_vec_sar16i_i64, 3020 .fniv = tcg_gen_sari_vec, 3021 .fno = gen_helper_gvec_sar16i, 3022 .opt_opc = vecop_list, 3023 .vece = MO_16 }, 3024 { .fni4 = tcg_gen_sari_i32, 3025 .fniv = tcg_gen_sari_vec, 3026 .fno = gen_helper_gvec_sar32i, 3027 .opt_opc = vecop_list, 3028 .vece = MO_32 }, 3029 { .fni8 = tcg_gen_sari_i64, 3030 .fniv = tcg_gen_sari_vec, 3031 .fno = gen_helper_gvec_sar64i, 3032 .opt_opc = vecop_list, 3033 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3034 .vece = MO_64 }, 3035 }; 3036 3037 tcg_debug_assert(vece <= MO_64); 3038 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3039 if (shift == 0) { 3040 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3041 } else { 3042 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3043 } 3044 } 3045 3046 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3047 { 3048 uint64_t mask = dup_const(MO_8, 0xff << c); 3049 3050 tcg_gen_shli_i64(d, a, c); 3051 tcg_gen_shri_i64(a, a, 8 - c); 3052 tcg_gen_andi_i64(d, d, mask); 3053 tcg_gen_andi_i64(a, a, ~mask); 3054 tcg_gen_or_i64(d, d, a); 3055 } 3056 3057 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3058 { 3059 uint64_t mask = dup_const(MO_16, 0xffff << c); 3060 3061 tcg_gen_shli_i64(d, a, c); 3062 tcg_gen_shri_i64(a, a, 16 - c); 3063 tcg_gen_andi_i64(d, d, mask); 3064 tcg_gen_andi_i64(a, a, ~mask); 3065 tcg_gen_or_i64(d, d, a); 3066 } 3067 3068 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 3069 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3070 { 3071 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 3072 static const GVecGen2i g[4] = { 3073 { .fni8 = tcg_gen_vec_rotl8i_i64, 3074 .fniv = tcg_gen_rotli_vec, 3075 .fno = gen_helper_gvec_rotl8i, 3076 .opt_opc = vecop_list, 3077 .vece = MO_8 }, 3078 { .fni8 = tcg_gen_vec_rotl16i_i64, 3079 .fniv = tcg_gen_rotli_vec, 3080 .fno = gen_helper_gvec_rotl16i, 3081 .opt_opc = vecop_list, 3082 .vece = MO_16 }, 3083 { .fni4 = tcg_gen_rotli_i32, 3084 .fniv = tcg_gen_rotli_vec, 3085 .fno = gen_helper_gvec_rotl32i, 3086 .opt_opc = vecop_list, 3087 .vece = MO_32 }, 3088 { .fni8 = tcg_gen_rotli_i64, 3089 .fniv = tcg_gen_rotli_vec, 3090 .fno = gen_helper_gvec_rotl64i, 3091 .opt_opc = vecop_list, 3092 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3093 .vece = MO_64 }, 3094 }; 3095 3096 tcg_debug_assert(vece <= MO_64); 3097 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3098 if (shift == 0) { 3099 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3100 } else { 3101 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3102 } 3103 } 3104 3105 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 3106 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3107 { 3108 tcg_debug_assert(vece <= MO_64); 3109 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3110 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 3111 oprsz, maxsz); 3112 } 3113 3114 /* 3115 
* Specialized generation vector shifts by a non-constant scalar. 3116 */ 3117 3118 typedef struct { 3119 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 3120 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 3121 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 3122 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 3123 gen_helper_gvec_2 *fno[4]; 3124 TCGOpcode s_list[2]; 3125 TCGOpcode v_list[2]; 3126 } GVecGen2sh; 3127 3128 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3129 uint32_t oprsz, uint32_t tysz, TCGType type, 3130 TCGv_i32 shift, 3131 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 3132 { 3133 for (uint32_t i = 0; i < oprsz; i += tysz) { 3134 TCGv_vec t0 = tcg_temp_new_vec(type); 3135 TCGv_vec t1 = tcg_temp_new_vec(type); 3136 3137 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3138 fni(vece, t1, t0, shift); 3139 tcg_gen_st_vec(t1, tcg_env, dofs + i); 3140 } 3141 } 3142 3143 static void 3144 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 3145 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 3146 { 3147 TCGType type; 3148 uint32_t some; 3149 3150 check_size_align(oprsz, maxsz, dofs | aofs); 3151 check_overlap_2(dofs, aofs, maxsz); 3152 3153 /* If the backend has a scalar expansion, great. */ 3154 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 3155 if (type) { 3156 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3157 switch (type) { 3158 case TCG_TYPE_V256: 3159 some = QEMU_ALIGN_DOWN(oprsz, 32); 3160 expand_2sh_vec(vece, dofs, aofs, some, 32, 3161 TCG_TYPE_V256, shift, g->fniv_s); 3162 if (some == oprsz) { 3163 break; 3164 } 3165 dofs += some; 3166 aofs += some; 3167 oprsz -= some; 3168 maxsz -= some; 3169 /* fallthru */ 3170 case TCG_TYPE_V128: 3171 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 3172 TCG_TYPE_V128, shift, g->fniv_s); 3173 break; 3174 case TCG_TYPE_V64: 3175 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 3176 TCG_TYPE_V64, shift, g->fniv_s); 3177 break; 3178 default: 3179 g_assert_not_reached(); 3180 } 3181 tcg_swap_vecop_list(hold_list); 3182 goto clear_tail; 3183 } 3184 3185 /* If the backend supports variable vector shifts, also cool. */ 3186 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 3187 if (type) { 3188 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3189 TCGv_vec v_shift = tcg_temp_new_vec(type); 3190 3191 if (vece == MO_64) { 3192 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3193 tcg_gen_extu_i32_i64(sh64, shift); 3194 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 3195 tcg_temp_free_i64(sh64); 3196 } else { 3197 tcg_gen_dup_i32_vec(vece, v_shift, shift); 3198 } 3199 3200 switch (type) { 3201 case TCG_TYPE_V256: 3202 some = QEMU_ALIGN_DOWN(oprsz, 32); 3203 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 3204 v_shift, false, g->fniv_v); 3205 if (some == oprsz) { 3206 break; 3207 } 3208 dofs += some; 3209 aofs += some; 3210 oprsz -= some; 3211 maxsz -= some; 3212 /* fallthru */ 3213 case TCG_TYPE_V128: 3214 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 3215 v_shift, false, g->fniv_v); 3216 break; 3217 case TCG_TYPE_V64: 3218 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 3219 v_shift, false, g->fniv_v); 3220 break; 3221 default: 3222 g_assert_not_reached(); 3223 } 3224 tcg_temp_free_vec(v_shift); 3225 tcg_swap_vecop_list(hold_list); 3226 goto clear_tail; 3227 } 3228 3229 /* Otherwise fall back to integral... 
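       use the expand_2s_i32/expand_2s_i64 expanders when the element
       size allows, otherwise call the out-of-line helper with the
       runtime shift count folded into the SIMD_DATA field of the
       descriptor.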
*/ 3230 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3231 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 3232 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3233 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3234 tcg_gen_extu_i32_i64(sh64, shift); 3235 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 3236 tcg_temp_free_i64(sh64); 3237 } else { 3238 TCGv_ptr a0 = tcg_temp_ebb_new_ptr(); 3239 TCGv_ptr a1 = tcg_temp_ebb_new_ptr(); 3240 TCGv_i32 desc = tcg_temp_ebb_new_i32(); 3241 3242 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 3243 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 3244 tcg_gen_addi_ptr(a0, tcg_env, dofs); 3245 tcg_gen_addi_ptr(a1, tcg_env, aofs); 3246 3247 g->fno[vece](a0, a1, desc); 3248 3249 tcg_temp_free_ptr(a0); 3250 tcg_temp_free_ptr(a1); 3251 tcg_temp_free_i32(desc); 3252 return; 3253 } 3254 3255 clear_tail: 3256 if (oprsz < maxsz) { 3257 expand_clr(dofs + oprsz, maxsz - oprsz); 3258 } 3259 } 3260 3261 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 3262 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3263 { 3264 static const GVecGen2sh g = { 3265 .fni4 = tcg_gen_shl_i32, 3266 .fni8 = tcg_gen_shl_i64, 3267 .fniv_s = tcg_gen_shls_vec, 3268 .fniv_v = tcg_gen_shlv_vec, 3269 .fno = { 3270 gen_helper_gvec_shl8i, 3271 gen_helper_gvec_shl16i, 3272 gen_helper_gvec_shl32i, 3273 gen_helper_gvec_shl64i, 3274 }, 3275 .s_list = { INDEX_op_shls_vec, 0 }, 3276 .v_list = { INDEX_op_shlv_vec, 0 }, 3277 }; 3278 3279 tcg_debug_assert(vece <= MO_64); 3280 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3281 } 3282 3283 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3284 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3285 { 3286 static const GVecGen2sh g = { 3287 .fni4 = tcg_gen_shr_i32, 3288 .fni8 = tcg_gen_shr_i64, 3289 .fniv_s = tcg_gen_shrs_vec, 3290 .fniv_v = tcg_gen_shrv_vec, 3291 .fno = { 3292 gen_helper_gvec_shr8i, 3293 gen_helper_gvec_shr16i, 3294 gen_helper_gvec_shr32i, 3295 gen_helper_gvec_shr64i, 3296 }, 3297 .s_list = { INDEX_op_shrs_vec, 0 }, 3298 .v_list = { INDEX_op_shrv_vec, 0 }, 3299 }; 3300 3301 tcg_debug_assert(vece <= MO_64); 3302 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3303 } 3304 3305 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 3306 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3307 { 3308 static const GVecGen2sh g = { 3309 .fni4 = tcg_gen_sar_i32, 3310 .fni8 = tcg_gen_sar_i64, 3311 .fniv_s = tcg_gen_sars_vec, 3312 .fniv_v = tcg_gen_sarv_vec, 3313 .fno = { 3314 gen_helper_gvec_sar8i, 3315 gen_helper_gvec_sar16i, 3316 gen_helper_gvec_sar32i, 3317 gen_helper_gvec_sar64i, 3318 }, 3319 .s_list = { INDEX_op_sars_vec, 0 }, 3320 .v_list = { INDEX_op_sarv_vec, 0 }, 3321 }; 3322 3323 tcg_debug_assert(vece <= MO_64); 3324 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3325 } 3326 3327 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, 3328 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3329 { 3330 static const GVecGen2sh g = { 3331 .fni4 = tcg_gen_rotl_i32, 3332 .fni8 = tcg_gen_rotl_i64, 3333 .fniv_s = tcg_gen_rotls_vec, 3334 .fniv_v = tcg_gen_rotlv_vec, 3335 .fno = { 3336 gen_helper_gvec_rotl8i, 3337 gen_helper_gvec_rotl16i, 3338 gen_helper_gvec_rotl32i, 3339 gen_helper_gvec_rotl64i, 3340 }, 3341 .s_list = { INDEX_op_rotls_vec, 0 }, 3342 .v_list = { INDEX_op_rotlv_vec, 0 }, 3343 }; 3344 3345 tcg_debug_assert(vece <= MO_64); 3346 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3347 } 3348 3349 void 
tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                   TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i32 tmp = tcg_temp_ebb_new_i32();

    tcg_gen_neg_i32(tmp, shift);
    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i32(tmp);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, doing so in the
 * generic vector expansion is awkward, so the modulo is folded in
 * here.  If the target naturally includes the modulo as part of
 * the operation, great!  If the target has some other behaviour
 * for out-of-range shifts, then it could not use this function
 * anyway, and would need to do its own expansion with custom
 * functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3431 */ 3432 3433 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3434 TCGv_vec a, TCGv_vec b) 3435 { 3436 TCGv_vec t = tcg_temp_new_vec_matching(d); 3437 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3438 3439 tcg_gen_and_vec(vece, t, b, m); 3440 tcg_gen_shrv_vec(vece, d, a, t); 3441 tcg_temp_free_vec(t); 3442 } 3443 3444 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3445 { 3446 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3447 3448 tcg_gen_andi_i32(t, b, 31); 3449 tcg_gen_shr_i32(d, a, t); 3450 tcg_temp_free_i32(t); 3451 } 3452 3453 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3454 { 3455 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3456 3457 tcg_gen_andi_i64(t, b, 63); 3458 tcg_gen_shr_i64(d, a, t); 3459 tcg_temp_free_i64(t); 3460 } 3461 3462 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3463 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3464 { 3465 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3466 static const GVecGen3 g[4] = { 3467 { .fniv = tcg_gen_shrv_mod_vec, 3468 .fno = gen_helper_gvec_shr8v, 3469 .opt_opc = vecop_list, 3470 .vece = MO_8 }, 3471 { .fniv = tcg_gen_shrv_mod_vec, 3472 .fno = gen_helper_gvec_shr16v, 3473 .opt_opc = vecop_list, 3474 .vece = MO_16 }, 3475 { .fni4 = tcg_gen_shr_mod_i32, 3476 .fniv = tcg_gen_shrv_mod_vec, 3477 .fno = gen_helper_gvec_shr32v, 3478 .opt_opc = vecop_list, 3479 .vece = MO_32 }, 3480 { .fni8 = tcg_gen_shr_mod_i64, 3481 .fniv = tcg_gen_shrv_mod_vec, 3482 .fno = gen_helper_gvec_shr64v, 3483 .opt_opc = vecop_list, 3484 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3485 .vece = MO_64 }, 3486 }; 3487 3488 tcg_debug_assert(vece <= MO_64); 3489 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3490 } 3491 3492 /* 3493 * Similarly for arithmetic right shifts. 
3494 */ 3495 3496 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3497 TCGv_vec a, TCGv_vec b) 3498 { 3499 TCGv_vec t = tcg_temp_new_vec_matching(d); 3500 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3501 3502 tcg_gen_and_vec(vece, t, b, m); 3503 tcg_gen_sarv_vec(vece, d, a, t); 3504 tcg_temp_free_vec(t); 3505 } 3506 3507 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3508 { 3509 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3510 3511 tcg_gen_andi_i32(t, b, 31); 3512 tcg_gen_sar_i32(d, a, t); 3513 tcg_temp_free_i32(t); 3514 } 3515 3516 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3517 { 3518 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3519 3520 tcg_gen_andi_i64(t, b, 63); 3521 tcg_gen_sar_i64(d, a, t); 3522 tcg_temp_free_i64(t); 3523 } 3524 3525 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3526 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3527 { 3528 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3529 static const GVecGen3 g[4] = { 3530 { .fniv = tcg_gen_sarv_mod_vec, 3531 .fno = gen_helper_gvec_sar8v, 3532 .opt_opc = vecop_list, 3533 .vece = MO_8 }, 3534 { .fniv = tcg_gen_sarv_mod_vec, 3535 .fno = gen_helper_gvec_sar16v, 3536 .opt_opc = vecop_list, 3537 .vece = MO_16 }, 3538 { .fni4 = tcg_gen_sar_mod_i32, 3539 .fniv = tcg_gen_sarv_mod_vec, 3540 .fno = gen_helper_gvec_sar32v, 3541 .opt_opc = vecop_list, 3542 .vece = MO_32 }, 3543 { .fni8 = tcg_gen_sar_mod_i64, 3544 .fniv = tcg_gen_sarv_mod_vec, 3545 .fno = gen_helper_gvec_sar64v, 3546 .opt_opc = vecop_list, 3547 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3548 .vece = MO_64 }, 3549 }; 3550 3551 tcg_debug_assert(vece <= MO_64); 3552 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3553 } 3554 3555 /* 3556 * Similarly for rotates. 
3557 */ 3558 3559 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3560 TCGv_vec a, TCGv_vec b) 3561 { 3562 TCGv_vec t = tcg_temp_new_vec_matching(d); 3563 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3564 3565 tcg_gen_and_vec(vece, t, b, m); 3566 tcg_gen_rotlv_vec(vece, d, a, t); 3567 tcg_temp_free_vec(t); 3568 } 3569 3570 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3571 { 3572 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3573 3574 tcg_gen_andi_i32(t, b, 31); 3575 tcg_gen_rotl_i32(d, a, t); 3576 tcg_temp_free_i32(t); 3577 } 3578 3579 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3580 { 3581 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3582 3583 tcg_gen_andi_i64(t, b, 63); 3584 tcg_gen_rotl_i64(d, a, t); 3585 tcg_temp_free_i64(t); 3586 } 3587 3588 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3589 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3590 { 3591 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3592 static const GVecGen3 g[4] = { 3593 { .fniv = tcg_gen_rotlv_mod_vec, 3594 .fno = gen_helper_gvec_rotl8v, 3595 .opt_opc = vecop_list, 3596 .vece = MO_8 }, 3597 { .fniv = tcg_gen_rotlv_mod_vec, 3598 .fno = gen_helper_gvec_rotl16v, 3599 .opt_opc = vecop_list, 3600 .vece = MO_16 }, 3601 { .fni4 = tcg_gen_rotl_mod_i32, 3602 .fniv = tcg_gen_rotlv_mod_vec, 3603 .fno = gen_helper_gvec_rotl32v, 3604 .opt_opc = vecop_list, 3605 .vece = MO_32 }, 3606 { .fni8 = tcg_gen_rotl_mod_i64, 3607 .fniv = tcg_gen_rotlv_mod_vec, 3608 .fno = gen_helper_gvec_rotl64v, 3609 .opt_opc = vecop_list, 3610 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3611 .vece = MO_64 }, 3612 }; 3613 3614 tcg_debug_assert(vece <= MO_64); 3615 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3616 } 3617 3618 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3619 TCGv_vec a, TCGv_vec b) 3620 { 3621 TCGv_vec t = tcg_temp_new_vec_matching(d); 3622 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3623 3624 tcg_gen_and_vec(vece, t, b, m); 3625 tcg_gen_rotrv_vec(vece, d, a, t); 3626 tcg_temp_free_vec(t); 3627 } 3628 3629 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3630 { 3631 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3632 3633 tcg_gen_andi_i32(t, b, 31); 3634 tcg_gen_rotr_i32(d, a, t); 3635 tcg_temp_free_i32(t); 3636 } 3637 3638 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3639 { 3640 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3641 3642 tcg_gen_andi_i64(t, b, 63); 3643 tcg_gen_rotr_i64(d, a, t); 3644 tcg_temp_free_i64(t); 3645 } 3646 3647 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3648 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3649 { 3650 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3651 static const GVecGen3 g[4] = { 3652 { .fniv = tcg_gen_rotrv_mod_vec, 3653 .fno = gen_helper_gvec_rotr8v, 3654 .opt_opc = vecop_list, 3655 .vece = MO_8 }, 3656 { .fniv = tcg_gen_rotrv_mod_vec, 3657 .fno = gen_helper_gvec_rotr16v, 3658 .opt_opc = vecop_list, 3659 .vece = MO_16 }, 3660 { .fni4 = tcg_gen_rotr_mod_i32, 3661 .fniv = tcg_gen_rotrv_mod_vec, 3662 .fno = gen_helper_gvec_rotr32v, 3663 .opt_opc = vecop_list, 3664 .vece = MO_32 }, 3665 { .fni8 = tcg_gen_rotr_mod_i64, 3666 .fniv = tcg_gen_rotrv_mod_vec, 3667 .fno = gen_helper_gvec_rotr64v, 3668 .opt_opc = vecop_list, 3669 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3670 .vece = MO_64 }, 3671 }; 3672 3673 tcg_debug_assert(vece <= MO_64); 3674 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, 
maxsz, &g[vece]); 3675 } 3676 3677 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 3678 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3679 uint32_t oprsz, TCGCond cond) 3680 { 3681 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3682 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3683 uint32_t i; 3684 3685 for (i = 0; i < oprsz; i += 4) { 3686 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 3687 tcg_gen_ld_i32(t1, tcg_env, bofs + i); 3688 tcg_gen_negsetcond_i32(cond, t0, t0, t1); 3689 tcg_gen_st_i32(t0, tcg_env, dofs + i); 3690 } 3691 tcg_temp_free_i32(t1); 3692 tcg_temp_free_i32(t0); 3693 } 3694 3695 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3696 uint32_t oprsz, TCGCond cond) 3697 { 3698 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3699 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 3700 uint32_t i; 3701 3702 for (i = 0; i < oprsz; i += 8) { 3703 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 3704 tcg_gen_ld_i64(t1, tcg_env, bofs + i); 3705 tcg_gen_negsetcond_i64(cond, t0, t0, t1); 3706 tcg_gen_st_i64(t0, tcg_env, dofs + i); 3707 } 3708 tcg_temp_free_i64(t1); 3709 tcg_temp_free_i64(t0); 3710 } 3711 3712 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3713 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3714 TCGType type, TCGCond cond) 3715 { 3716 for (uint32_t i = 0; i < oprsz; i += tysz) { 3717 TCGv_vec t0 = tcg_temp_new_vec(type); 3718 TCGv_vec t1 = tcg_temp_new_vec(type); 3719 TCGv_vec t2 = tcg_temp_new_vec(type); 3720 3721 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3722 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 3723 tcg_gen_cmp_vec(cond, vece, t2, t0, t1); 3724 tcg_gen_st_vec(t2, tcg_env, dofs + i); 3725 } 3726 } 3727 3728 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3729 uint32_t aofs, uint32_t bofs, 3730 uint32_t oprsz, uint32_t maxsz) 3731 { 3732 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3733 static gen_helper_gvec_3 * const eq_fn[4] = { 3734 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3735 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3736 }; 3737 static gen_helper_gvec_3 * const ne_fn[4] = { 3738 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3739 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3740 }; 3741 static gen_helper_gvec_3 * const lt_fn[4] = { 3742 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3743 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3744 }; 3745 static gen_helper_gvec_3 * const le_fn[4] = { 3746 gen_helper_gvec_le8, gen_helper_gvec_le16, 3747 gen_helper_gvec_le32, gen_helper_gvec_le64 3748 }; 3749 static gen_helper_gvec_3 * const ltu_fn[4] = { 3750 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3751 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3752 }; 3753 static gen_helper_gvec_3 * const leu_fn[4] = { 3754 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3755 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3756 }; 3757 static gen_helper_gvec_3 * const * const fns[16] = { 3758 [TCG_COND_EQ] = eq_fn, 3759 [TCG_COND_NE] = ne_fn, 3760 [TCG_COND_LT] = lt_fn, 3761 [TCG_COND_LE] = le_fn, 3762 [TCG_COND_LTU] = ltu_fn, 3763 [TCG_COND_LEU] = leu_fn, 3764 }; 3765 3766 const TCGOpcode *hold_list; 3767 TCGType type; 3768 uint32_t some; 3769 3770 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3771 check_overlap_3(dofs, aofs, bofs, maxsz); 3772 3773 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3774 do_dup(MO_8, dofs, oprsz, maxsz, 3775 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3776 return; 3777 } 3778 3779 /* 3780 * Implement inline with a vector type, if possible. 
3781 * Prefer integer when 64-bit host and 64-bit comparison. 3782 */ 3783 hold_list = tcg_swap_vecop_list(cmp_list); 3784 type = choose_vector_type(cmp_list, vece, oprsz, 3785 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3786 switch (type) { 3787 case TCG_TYPE_V256: 3788 /* Recall that ARM SVE allows vector sizes that are not a 3789 * power of 2, but always a multiple of 16. The intent is 3790 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3791 */ 3792 some = QEMU_ALIGN_DOWN(oprsz, 32); 3793 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3794 if (some == oprsz) { 3795 break; 3796 } 3797 dofs += some; 3798 aofs += some; 3799 bofs += some; 3800 oprsz -= some; 3801 maxsz -= some; 3802 /* fallthru */ 3803 case TCG_TYPE_V128: 3804 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3805 break; 3806 case TCG_TYPE_V64: 3807 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3808 break; 3809 3810 case 0: 3811 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3812 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3813 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3814 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3815 } else { 3816 gen_helper_gvec_3 * const *fn = fns[cond]; 3817 3818 if (fn == NULL) { 3819 uint32_t tmp; 3820 tmp = aofs, aofs = bofs, bofs = tmp; 3821 cond = tcg_swap_cond(cond); 3822 fn = fns[cond]; 3823 assert(fn != NULL); 3824 } 3825 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3826 oprsz = maxsz; 3827 } 3828 break; 3829 3830 default: 3831 g_assert_not_reached(); 3832 } 3833 tcg_swap_vecop_list(hold_list); 3834 3835 if (oprsz < maxsz) { 3836 expand_clr(dofs + oprsz, maxsz - oprsz); 3837 } 3838 } 3839 3840 static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3841 uint32_t oprsz, uint32_t tysz, TCGType type, 3842 TCGCond cond, TCGv_vec c) 3843 { 3844 TCGv_vec t0 = tcg_temp_new_vec(type); 3845 TCGv_vec t1 = tcg_temp_new_vec(type); 3846 uint32_t i; 3847 3848 for (i = 0; i < oprsz; i += tysz) { 3849 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 3850 tcg_gen_cmp_vec(cond, vece, t0, t1, c); 3851 tcg_gen_st_vec(t0, tcg_env, dofs + i); 3852 } 3853 } 3854 3855 void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs, 3856 uint32_t aofs, TCGv_i64 c, 3857 uint32_t oprsz, uint32_t maxsz) 3858 { 3859 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3860 static gen_helper_gvec_2i * const eq_fn[4] = { 3861 gen_helper_gvec_eqs8, gen_helper_gvec_eqs16, 3862 gen_helper_gvec_eqs32, gen_helper_gvec_eqs64 3863 }; 3864 static gen_helper_gvec_2i * const lt_fn[4] = { 3865 gen_helper_gvec_lts8, gen_helper_gvec_lts16, 3866 gen_helper_gvec_lts32, gen_helper_gvec_lts64 3867 }; 3868 static gen_helper_gvec_2i * const le_fn[4] = { 3869 gen_helper_gvec_les8, gen_helper_gvec_les16, 3870 gen_helper_gvec_les32, gen_helper_gvec_les64 3871 }; 3872 static gen_helper_gvec_2i * const ltu_fn[4] = { 3873 gen_helper_gvec_ltus8, gen_helper_gvec_ltus16, 3874 gen_helper_gvec_ltus32, gen_helper_gvec_ltus64 3875 }; 3876 static gen_helper_gvec_2i * const leu_fn[4] = { 3877 gen_helper_gvec_leus8, gen_helper_gvec_leus16, 3878 gen_helper_gvec_leus32, gen_helper_gvec_leus64 3879 }; 3880 static gen_helper_gvec_2i * const * const fns[16] = { 3881 [TCG_COND_EQ] = eq_fn, 3882 [TCG_COND_LT] = lt_fn, 3883 [TCG_COND_LE] = le_fn, 3884 [TCG_COND_LTU] = ltu_fn, 3885 [TCG_COND_LEU] = leu_fn, 3886 }; 3887 3888 TCGType type; 3889 3890 check_size_align(oprsz, maxsz, dofs | aofs); 3891 check_overlap_2(dofs, aofs, 
maxsz); 3892 3893 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3894 do_dup(MO_8, dofs, oprsz, maxsz, 3895 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3896 return; 3897 } 3898 3899 /* 3900 * Implement inline with a vector type, if possible. 3901 * Prefer integer when 64-bit host and 64-bit comparison. 3902 */ 3903 type = choose_vector_type(cmp_list, vece, oprsz, 3904 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3905 if (type != 0) { 3906 const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list); 3907 TCGv_vec t_vec = tcg_temp_new_vec(type); 3908 uint32_t some; 3909 3910 tcg_gen_dup_i64_vec(vece, t_vec, c); 3911 switch (type) { 3912 case TCG_TYPE_V256: 3913 some = QEMU_ALIGN_DOWN(oprsz, 32); 3914 expand_cmps_vec(vece, dofs, aofs, some, 32, 3915 TCG_TYPE_V256, cond, t_vec); 3916 aofs += some; 3917 dofs += some; 3918 oprsz -= some; 3919 maxsz -= some; 3920 /* fallthru */ 3921 3922 case TCG_TYPE_V128: 3923 some = QEMU_ALIGN_DOWN(oprsz, 16); 3924 expand_cmps_vec(vece, dofs, aofs, some, 16, 3925 TCG_TYPE_V128, cond, t_vec); 3926 break; 3927 3928 case TCG_TYPE_V64: 3929 some = QEMU_ALIGN_DOWN(oprsz, 8); 3930 expand_cmps_vec(vece, dofs, aofs, some, 8, 3931 TCG_TYPE_V64, cond, t_vec); 3932 break; 3933 3934 default: 3935 g_assert_not_reached(); 3936 } 3937 tcg_temp_free_vec(t_vec); 3938 tcg_swap_vecop_list(hold_list); 3939 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3940 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3941 uint32_t i; 3942 3943 for (i = 0; i < oprsz; i += 8) { 3944 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 3945 tcg_gen_negsetcond_i64(cond, t0, t0, c); 3946 tcg_gen_st_i64(t0, tcg_env, dofs + i); 3947 } 3948 tcg_temp_free_i64(t0); 3949 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3950 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3951 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3952 uint32_t i; 3953 3954 tcg_gen_extrl_i64_i32(t1, c); 3955 for (i = 0; i < oprsz; i += 4) { 3956 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 3957 tcg_gen_negsetcond_i32(cond, t0, t0, t1); 3958 tcg_gen_st_i32(t0, tcg_env, dofs + i); 3959 } 3960 tcg_temp_free_i32(t0); 3961 tcg_temp_free_i32(t1); 3962 } else { 3963 gen_helper_gvec_2i * const *fn = fns[cond]; 3964 bool inv = false; 3965 3966 if (fn == NULL) { 3967 cond = tcg_invert_cond(cond); 3968 fn = fns[cond]; 3969 assert(fn != NULL); 3970 inv = true; 3971 } 3972 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]); 3973 return; 3974 } 3975 3976 if (oprsz < maxsz) { 3977 expand_clr(dofs + oprsz, maxsz - oprsz); 3978 } 3979 } 3980 3981 void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs, 3982 uint32_t aofs, int64_t c, 3983 uint32_t oprsz, uint32_t maxsz) 3984 { 3985 TCGv_i64 tmp = tcg_constant_i64(c); 3986 tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz); 3987 } 3988 3989 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3990 { 3991 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3992 3993 tcg_gen_and_i64(t, b, a); 3994 tcg_gen_andc_i64(d, c, a); 3995 tcg_gen_or_i64(d, d, t); 3996 tcg_temp_free_i64(t); 3997 } 3998 3999 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 4000 uint32_t bofs, uint32_t cofs, 4001 uint32_t oprsz, uint32_t maxsz) 4002 { 4003 static const GVecGen4 g = { 4004 .fni8 = tcg_gen_bitsel_i64, 4005 .fniv = tcg_gen_bitsel_vec, 4006 .fno = gen_helper_gvec_bitsel, 4007 }; 4008 4009 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 4010 } 4011
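/*
 * Illustrative sketch, compiled out: the bitwise select expanded above
 * computes d = (a & b) | (~a & c) for every bit position, i.e. each set
 * bit of A selects the corresponding bit of B and each clear bit selects
 * the bit of C.  The hypothetical helper below restates the identity in
 * plain host C, using the same operand order as tcg_gen_bitsel_i64.
 */
#if 0
static uint64_t bitsel_demo(uint64_t a, uint64_t b, uint64_t c)
{
    return (b & a) | (c & ~a);
}
/* e.g. bitsel_demo(0xff00ff00ff00ff00ull, 0x1111111111111111ull,
                    0x2222222222222222ull) == 0x1122112211221122ull */
#endif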