1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg/tcg.h" 22 #include "tcg/tcg-temp-internal.h" 23 #include "tcg/tcg-op-common.h" 24 #include "tcg/tcg-op-gvec-common.h" 25 #include "tcg/tcg-gvec-desc.h" 26 #include "tcg-has.h" 27 28 #define MAX_UNROLL 4 29 30 #ifdef CONFIG_DEBUG_TCG 31 static const TCGOpcode vecop_list_empty[1] = { 0 }; 32 #else 33 #define vecop_list_empty NULL 34 #endif 35 36 37 /* Verify vector size and alignment rules. OFS should be the OR of all 38 of the operand offsets so that we can check them all at once. */ 39 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 40 { 41 uint32_t max_align; 42 43 switch (oprsz) { 44 case 8: 45 case 16: 46 case 32: 47 tcg_debug_assert(oprsz <= maxsz); 48 break; 49 default: 50 tcg_debug_assert(oprsz == maxsz); 51 break; 52 } 53 tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS)); 54 55 max_align = maxsz >= 16 ? 15 : 7; 56 tcg_debug_assert((maxsz & max_align) == 0); 57 tcg_debug_assert((ofs & max_align) == 0); 58 } 59 60 /* 61 * Verify vector overlap rules for two operands. 62 * When dbase and abase are not the same pointer, we cannot check for 63 * overlap at compile-time, but the runtime restrictions remain. 64 */ 65 static void check_overlap_2(TCGv_ptr dbase, uint32_t d, 66 TCGv_ptr abase, uint32_t a, uint32_t s) 67 { 68 tcg_debug_assert(dbase != abase || d == a || d + s <= a || a + s <= d); 69 } 70 71 /* Verify vector overlap rules for three operands. */ 72 static void check_overlap_3(TCGv_ptr dbase, uint32_t d, 73 TCGv_ptr abase, uint32_t a, 74 TCGv_ptr bbase, uint32_t b, uint32_t s) 75 { 76 check_overlap_2(dbase, d, abase, a, s); 77 check_overlap_2(dbase, d, bbase, b, s); 78 check_overlap_2(abase, a, bbase, b, s); 79 } 80 81 /* Verify vector overlap rules for four operands. */ 82 static void check_overlap_4(TCGv_ptr dbase, uint32_t d, 83 TCGv_ptr abase, uint32_t a, 84 TCGv_ptr bbase, uint32_t b, 85 TCGv_ptr cbase, uint32_t c, uint32_t s) 86 { 87 check_overlap_2(dbase, d, abase, a, s); 88 check_overlap_2(dbase, d, bbase, b, s); 89 check_overlap_2(dbase, d, cbase, c, s); 90 check_overlap_2(abase, a, bbase, b, s); 91 check_overlap_2(abase, a, cbase, c, s); 92 check_overlap_2(bbase, b, cbase, c, s); 93 } 94 95 /* Create a descriptor from components. */ 96 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 97 { 98 uint32_t desc = 0; 99 100 check_size_align(oprsz, maxsz, 0); 101 102 /* 103 * We want to check that 'data' will fit into SIMD_DATA_BITS. 104 * However, some callers want to treat the data as a signed 105 * value (which they can later get back with simd_data()) 106 * and some want to treat it as an unsigned value. 107 * So here we assert only that the data will fit into the 108 * field in at least one way. 
This means that some invalid 109 * values from the caller will not be detected, e.g. if the 110 * caller wants to handle the value as a signed integer but 111 * incorrectly passes us 1 << (SIMD_DATA_BITS - 1). 112 */ 113 tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS) || 114 data == extract32(data, 0, SIMD_DATA_BITS)); 115 116 oprsz = (oprsz / 8) - 1; 117 maxsz = (maxsz / 8) - 1; 118 119 /* 120 * We have just asserted in check_size_align that either 121 * oprsz is {8,16,32} or matches maxsz. Encode the final 122 * case with '2', as that would otherwise map to 24. 123 */ 124 if (oprsz == maxsz) { 125 oprsz = 2; 126 } 127 128 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 129 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 130 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 131 132 return desc; 133 } 134 135 /* Generate a call to a gvec-style helper with two vector operands. */ 136 static void expand_2_ool(TCGv_ptr dbase, uint32_t dofs, 137 TCGv_ptr abase, uint32_t aofs, 138 uint32_t oprsz, uint32_t maxsz, 139 int32_t data, gen_helper_gvec_2 *fn) 140 { 141 TCGv_ptr a0, a1; 142 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 143 144 a0 = tcg_temp_ebb_new_ptr(); 145 a1 = tcg_temp_ebb_new_ptr(); 146 147 tcg_gen_addi_ptr(a0, dbase, dofs); 148 tcg_gen_addi_ptr(a1, abase, aofs); 149 150 fn(a0, a1, desc); 151 152 tcg_temp_free_ptr(a0); 153 tcg_temp_free_ptr(a1); 154 } 155 156 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 157 uint32_t oprsz, uint32_t maxsz, int32_t data, 158 gen_helper_gvec_2 *fn) 159 { 160 expand_2_ool(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, data, fn); 161 } 162 163 /* Generate a call to a gvec-style helper with two vector operands 164 and one scalar operand. */ 165 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 166 uint32_t oprsz, uint32_t maxsz, int32_t data, 167 gen_helper_gvec_2i *fn) 168 { 169 TCGv_ptr a0, a1; 170 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 171 172 a0 = tcg_temp_ebb_new_ptr(); 173 a1 = tcg_temp_ebb_new_ptr(); 174 175 tcg_gen_addi_ptr(a0, tcg_env, dofs); 176 tcg_gen_addi_ptr(a1, tcg_env, aofs); 177 178 fn(a0, a1, c, desc); 179 180 tcg_temp_free_ptr(a0); 181 tcg_temp_free_ptr(a1); 182 } 183 184 /* Generate a call to a gvec-style helper with three vector operands. */ 185 static void expand_3_ool(TCGv_ptr dbase, uint32_t dofs, 186 TCGv_ptr abase, uint32_t aofs, 187 TCGv_ptr bbase, uint32_t bofs, 188 uint32_t oprsz, uint32_t maxsz, 189 int32_t data, gen_helper_gvec_3 *fn) 190 { 191 TCGv_ptr a0, a1, a2; 192 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 193 194 a0 = tcg_temp_ebb_new_ptr(); 195 a1 = tcg_temp_ebb_new_ptr(); 196 a2 = tcg_temp_ebb_new_ptr(); 197 198 tcg_gen_addi_ptr(a0, dbase, dofs); 199 tcg_gen_addi_ptr(a1, abase, aofs); 200 tcg_gen_addi_ptr(a2, bbase, bofs); 201 202 fn(a0, a1, a2, desc); 203 204 tcg_temp_free_ptr(a0); 205 tcg_temp_free_ptr(a1); 206 tcg_temp_free_ptr(a2); 207 } 208 209 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 210 uint32_t oprsz, uint32_t maxsz, int32_t data, 211 gen_helper_gvec_3 *fn) 212 { 213 expand_3_ool(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, 214 oprsz, maxsz, data, fn); 215 } 216 217 /* Generate a call to a gvec-style helper with four vector operands. 
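   As with the two- and three-operand forms above, the helper receives a
   host pointer per operand plus the 32-bit descriptor built by simd_desc():
   e.g. oprsz == maxsz == 16 stores 2 in the OPRSZ field (the "matches
   maxsz" encoding) and 16 / 8 - 1 == 1 in the MAXSZ field, while DATA is
   recoverable in the helper via simd_data().  A minimal sketch of a
   target-side call, assuming a hypothetical 16-byte helper
   gen_helper_gvec_example:

       tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 16, 16, 0,
                          gen_helper_gvec_example);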
*/ 218 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 219 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 220 int32_t data, gen_helper_gvec_4 *fn) 221 { 222 TCGv_ptr a0, a1, a2, a3; 223 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 224 225 a0 = tcg_temp_ebb_new_ptr(); 226 a1 = tcg_temp_ebb_new_ptr(); 227 a2 = tcg_temp_ebb_new_ptr(); 228 a3 = tcg_temp_ebb_new_ptr(); 229 230 tcg_gen_addi_ptr(a0, tcg_env, dofs); 231 tcg_gen_addi_ptr(a1, tcg_env, aofs); 232 tcg_gen_addi_ptr(a2, tcg_env, bofs); 233 tcg_gen_addi_ptr(a3, tcg_env, cofs); 234 235 fn(a0, a1, a2, a3, desc); 236 237 tcg_temp_free_ptr(a0); 238 tcg_temp_free_ptr(a1); 239 tcg_temp_free_ptr(a2); 240 tcg_temp_free_ptr(a3); 241 } 242 243 /* Generate a call to a gvec-style helper with five vector operands. */ 244 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 245 uint32_t cofs, uint32_t xofs, uint32_t oprsz, 246 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) 247 { 248 TCGv_ptr a0, a1, a2, a3, a4; 249 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 250 251 a0 = tcg_temp_ebb_new_ptr(); 252 a1 = tcg_temp_ebb_new_ptr(); 253 a2 = tcg_temp_ebb_new_ptr(); 254 a3 = tcg_temp_ebb_new_ptr(); 255 a4 = tcg_temp_ebb_new_ptr(); 256 257 tcg_gen_addi_ptr(a0, tcg_env, dofs); 258 tcg_gen_addi_ptr(a1, tcg_env, aofs); 259 tcg_gen_addi_ptr(a2, tcg_env, bofs); 260 tcg_gen_addi_ptr(a3, tcg_env, cofs); 261 tcg_gen_addi_ptr(a4, tcg_env, xofs); 262 263 fn(a0, a1, a2, a3, a4, desc); 264 265 tcg_temp_free_ptr(a0); 266 tcg_temp_free_ptr(a1); 267 tcg_temp_free_ptr(a2); 268 tcg_temp_free_ptr(a3); 269 tcg_temp_free_ptr(a4); 270 } 271 272 /* Generate a call to a gvec-style helper with three vector operands 273 and an extra pointer operand. */ 274 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, 275 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 276 int32_t data, gen_helper_gvec_2_ptr *fn) 277 { 278 TCGv_ptr a0, a1; 279 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 280 281 a0 = tcg_temp_ebb_new_ptr(); 282 a1 = tcg_temp_ebb_new_ptr(); 283 284 tcg_gen_addi_ptr(a0, tcg_env, dofs); 285 tcg_gen_addi_ptr(a1, tcg_env, aofs); 286 287 fn(a0, a1, ptr, desc); 288 289 tcg_temp_free_ptr(a0); 290 tcg_temp_free_ptr(a1); 291 } 292 293 /* Generate a call to a gvec-style helper with three vector operands 294 and an extra pointer operand. */ 295 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 296 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 297 int32_t data, gen_helper_gvec_3_ptr *fn) 298 { 299 TCGv_ptr a0, a1, a2; 300 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 301 302 a0 = tcg_temp_ebb_new_ptr(); 303 a1 = tcg_temp_ebb_new_ptr(); 304 a2 = tcg_temp_ebb_new_ptr(); 305 306 tcg_gen_addi_ptr(a0, tcg_env, dofs); 307 tcg_gen_addi_ptr(a1, tcg_env, aofs); 308 tcg_gen_addi_ptr(a2, tcg_env, bofs); 309 310 fn(a0, a1, a2, ptr, desc); 311 312 tcg_temp_free_ptr(a0); 313 tcg_temp_free_ptr(a1); 314 tcg_temp_free_ptr(a2); 315 } 316 317 /* Generate a call to a gvec-style helper with four vector operands 318 and an extra pointer operand. 
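   The pointer is passed to the helper unchanged, after the four operand
   addresses and before the descriptor; targets typically use it to hand
   the helper tcg_env or a float_status block.  A minimal sketch, with
   "fpst" and gen_helper_gvec_example_fp as hypothetical stand-ins for
   target-specific names:

       tcg_gen_gvec_4_ptr(dofs, aofs, bofs, cofs, fpst, 16, 16, 0,
                          gen_helper_gvec_example_fp);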
*/ 319 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 320 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, 321 uint32_t maxsz, int32_t data, 322 gen_helper_gvec_4_ptr *fn) 323 { 324 TCGv_ptr a0, a1, a2, a3; 325 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 326 327 a0 = tcg_temp_ebb_new_ptr(); 328 a1 = tcg_temp_ebb_new_ptr(); 329 a2 = tcg_temp_ebb_new_ptr(); 330 a3 = tcg_temp_ebb_new_ptr(); 331 332 tcg_gen_addi_ptr(a0, tcg_env, dofs); 333 tcg_gen_addi_ptr(a1, tcg_env, aofs); 334 tcg_gen_addi_ptr(a2, tcg_env, bofs); 335 tcg_gen_addi_ptr(a3, tcg_env, cofs); 336 337 fn(a0, a1, a2, a3, ptr, desc); 338 339 tcg_temp_free_ptr(a0); 340 tcg_temp_free_ptr(a1); 341 tcg_temp_free_ptr(a2); 342 tcg_temp_free_ptr(a3); 343 } 344 345 /* Generate a call to a gvec-style helper with five vector operands 346 and an extra pointer operand. */ 347 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 348 uint32_t cofs, uint32_t eofs, TCGv_ptr ptr, 349 uint32_t oprsz, uint32_t maxsz, int32_t data, 350 gen_helper_gvec_5_ptr *fn) 351 { 352 TCGv_ptr a0, a1, a2, a3, a4; 353 TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data)); 354 355 a0 = tcg_temp_ebb_new_ptr(); 356 a1 = tcg_temp_ebb_new_ptr(); 357 a2 = tcg_temp_ebb_new_ptr(); 358 a3 = tcg_temp_ebb_new_ptr(); 359 a4 = tcg_temp_ebb_new_ptr(); 360 361 tcg_gen_addi_ptr(a0, tcg_env, dofs); 362 tcg_gen_addi_ptr(a1, tcg_env, aofs); 363 tcg_gen_addi_ptr(a2, tcg_env, bofs); 364 tcg_gen_addi_ptr(a3, tcg_env, cofs); 365 tcg_gen_addi_ptr(a4, tcg_env, eofs); 366 367 fn(a0, a1, a2, a3, a4, ptr, desc); 368 369 tcg_temp_free_ptr(a0); 370 tcg_temp_free_ptr(a1); 371 tcg_temp_free_ptr(a2); 372 tcg_temp_free_ptr(a3); 373 tcg_temp_free_ptr(a4); 374 } 375 376 /* Return true if we want to implement something of OPRSZ bytes 377 in units of LNSZ. This limits the expansion of inline code. */ 378 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) 379 { 380 uint32_t q, r; 381 382 if (oprsz < lnsz) { 383 return false; 384 } 385 386 q = oprsz / lnsz; 387 r = oprsz % lnsz; 388 tcg_debug_assert((r & 7) == 0); 389 390 if (lnsz < 16) { 391 /* For sizes below 16, accept no remainder. */ 392 if (r != 0) { 393 return false; 394 } 395 } else { 396 /* 397 * Recall that ARM SVE allows vector sizes that are not a 398 * power of 2, but always a multiple of 16. The intent is 399 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 400 * In addition, expand_clr needs to handle a multiple of 8. 401 * Thus we can handle the tail with one more operation per 402 * diminishing power of 2. 403 */ 404 q += ctpop32(r); 405 } 406 407 return q <= MAX_UNROLL; 408 } 409 410 static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz); 411 412 /* Duplicate C as per VECE. */ 413 uint64_t (dup_const)(unsigned vece, uint64_t c) 414 { 415 switch (vece) { 416 case MO_8: 417 return 0x0101010101010101ull * (uint8_t)c; 418 case MO_16: 419 return 0x0001000100010001ull * (uint16_t)c; 420 case MO_32: 421 return 0x0000000100000001ull * (uint32_t)c; 422 case MO_64: 423 return c; 424 default: 425 g_assert_not_reached(); 426 } 427 } 428 429 /* Duplicate IN into OUT as per VECE. 
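   Informally: with VECE == MO_8 and IN == 0xab, the zero-extend and
   multiply by 0x01010101 below yield OUT == 0xabababab; with MO_16 the
   deposit copies the low half into the high half; MO_32 (and MO_64 in
   the i64 variant) is a plain move.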
*/ 430 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) 431 { 432 switch (vece) { 433 case MO_8: 434 tcg_gen_ext8u_i32(out, in); 435 tcg_gen_muli_i32(out, out, 0x01010101); 436 break; 437 case MO_16: 438 tcg_gen_deposit_i32(out, in, in, 16, 16); 439 break; 440 case MO_32: 441 tcg_gen_mov_i32(out, in); 442 break; 443 default: 444 g_assert_not_reached(); 445 } 446 } 447 448 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) 449 { 450 switch (vece) { 451 case MO_8: 452 tcg_gen_ext8u_i64(out, in); 453 tcg_gen_muli_i64(out, out, 0x0101010101010101ull); 454 break; 455 case MO_16: 456 tcg_gen_ext16u_i64(out, in); 457 tcg_gen_muli_i64(out, out, 0x0001000100010001ull); 458 break; 459 case MO_32: 460 tcg_gen_deposit_i64(out, in, in, 32, 32); 461 break; 462 case MO_64: 463 tcg_gen_mov_i64(out, in); 464 break; 465 default: 466 g_assert_not_reached(); 467 } 468 } 469 470 /* Select a supported vector type for implementing an operation on SIZE 471 * bytes. If OP is 0, assume that the real operation to be performed is 472 * required by all backends. Otherwise, make sure than OP can be performed 473 * on elements of size VECE in the selected type. Do not select V64 if 474 * PREFER_I64 is true. Return 0 if no vector type is selected. 475 */ 476 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 477 uint32_t size, bool prefer_i64) 478 { 479 /* 480 * Recall that ARM SVE allows vector sizes that are not a 481 * power of 2, but always a multiple of 16. The intent is 482 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 483 * It is hard to imagine a case in which v256 is supported 484 * but v128 is not, but check anyway. 485 * In addition, expand_clr needs to handle a multiple of 8. 486 */ 487 if (TCG_TARGET_HAS_v256 && 488 check_size_impl(size, 32) && 489 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 490 (!(size & 16) || 491 (TCG_TARGET_HAS_v128 && 492 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 493 (!(size & 8) || 494 (TCG_TARGET_HAS_v64 && 495 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 496 return TCG_TYPE_V256; 497 } 498 if (TCG_TARGET_HAS_v128 && 499 check_size_impl(size, 16) && 500 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 501 (!(size & 8) || 502 (TCG_TARGET_HAS_v64 && 503 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 504 return TCG_TYPE_V128; 505 } 506 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 507 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 508 return TCG_TYPE_V64; 509 } 510 return 0; 511 } 512 513 static void do_dup_store(TCGType type, TCGv_ptr dbase, uint32_t dofs, 514 uint32_t oprsz, uint32_t maxsz, TCGv_vec t_vec) 515 { 516 uint32_t i = 0; 517 518 tcg_debug_assert(oprsz >= 8); 519 520 /* 521 * This may be expand_clr for the tail of an operation, e.g. 522 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 523 * are misaligned wrt the maximum vector size, so do that first. 524 */ 525 if (dofs & 8) { 526 tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64); 527 i += 8; 528 } 529 530 switch (type) { 531 case TCG_TYPE_V256: 532 /* 533 * Recall that ARM SVE allows vector sizes that are not a 534 * power of 2, but always a multiple of 16. The intent is 535 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
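     * Concretely, with oprsz == 80 and dofs 16-byte aligned (so the
     * "dofs & 8" store above was skipped), the V256 loop below covers
     * bytes [0, 64) and the V128 loop covers bytes [64, 80).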
536 */ 537 for (; i + 32 <= oprsz; i += 32) { 538 tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V256); 539 } 540 /* fallthru */ 541 case TCG_TYPE_V128: 542 for (; i + 16 <= oprsz; i += 16) { 543 tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V128); 544 } 545 break; 546 case TCG_TYPE_V64: 547 for (; i < oprsz; i += 8) { 548 tcg_gen_stl_vec(t_vec, dbase, dofs + i, TCG_TYPE_V64); 549 } 550 break; 551 default: 552 g_assert_not_reached(); 553 } 554 555 if (oprsz < maxsz) { 556 expand_clr(dbase, dofs + oprsz, maxsz - oprsz); 557 } 558 } 559 560 /* 561 * Set OPRSZ bytes at DBASE + DOFS to replications of IN_32, IN_64 or IN_C. 562 * Only one of IN_32 or IN_64 may be set; 563 * IN_C is used if IN_32 and IN_64 are unset. 564 */ 565 static void do_dup(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 566 uint32_t oprsz, uint32_t maxsz, 567 TCGv_i32 in_32, TCGv_i64 in_64, uint64_t in_c) 568 { 569 TCGType type; 570 TCGv_i64 t_64; 571 TCGv_i32 t_32, t_desc; 572 TCGv_ptr t_ptr; 573 uint32_t i; 574 575 assert(vece <= (in_32 ? MO_32 : MO_64)); 576 assert(in_32 == NULL || in_64 == NULL); 577 578 /* If we're storing 0, expand oprsz to maxsz. */ 579 if (in_32 == NULL && in_64 == NULL) { 580 in_c = dup_const(vece, in_c); 581 if (in_c == 0) { 582 oprsz = maxsz; 583 vece = MO_8; 584 } else if (in_c == dup_const(MO_8, in_c)) { 585 vece = MO_8; 586 } 587 } 588 589 /* Implement inline with a vector type, if possible. 590 * Prefer integer when 64-bit host and no variable dup. 591 */ 592 type = choose_vector_type(NULL, vece, oprsz, 593 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 594 && (in_64 == NULL || vece == MO_64))); 595 if (type != 0) { 596 TCGv_vec t_vec = tcg_temp_new_vec(type); 597 598 if (in_32) { 599 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 600 } else if (in_64) { 601 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 602 } else { 603 tcg_gen_dupi_vec(vece, t_vec, in_c); 604 } 605 do_dup_store(type, dbase, dofs, oprsz, maxsz, t_vec); 606 return; 607 } 608 609 /* Otherwise, inline with an integer type, unless "large". */ 610 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 611 t_64 = NULL; 612 t_32 = NULL; 613 614 if (in_32) { 615 /* We are given a 32-bit variable input. For a 64-bit host, 616 use a 64-bit operation unless the 32-bit operation would 617 be simple enough. */ 618 if (TCG_TARGET_REG_BITS == 64 619 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 620 t_64 = tcg_temp_ebb_new_i64(); 621 tcg_gen_extu_i32_i64(t_64, in_32); 622 tcg_gen_dup_i64(vece, t_64, t_64); 623 } else { 624 t_32 = tcg_temp_ebb_new_i32(); 625 tcg_gen_dup_i32(vece, t_32, in_32); 626 } 627 } else if (in_64) { 628 /* We are given a 64-bit variable input. */ 629 t_64 = tcg_temp_ebb_new_i64(); 630 tcg_gen_dup_i64(vece, t_64, in_64); 631 } else { 632 /* We are given a constant input. */ 633 /* For 64-bit hosts, use 64-bit constants for "simple" constants 634 or when we'd need too many 32-bit stores, or when a 64-bit 635 constant is really required. */ 636 if (vece == MO_64 637 || (TCG_TARGET_REG_BITS == 64 638 && (in_c == 0 || in_c == -1 639 || !check_size_impl(oprsz, 4)))) { 640 t_64 = tcg_constant_i64(in_c); 641 } else { 642 t_32 = tcg_constant_i32(in_c); 643 } 644 } 645 646 /* Implement inline if we picked an implementation size above. 
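   At most one of t_32/t_64 is non-NULL here.  For example, when no host
   vector type was chosen, a MO_16 constant 0x1234 on a 32-bit host has
   already been widened by dup_const above to 0x1234123412341234, whose
   low half feeds tcg_constant_i32, so oprsz == 16 becomes four 32-bit
   stores of 0x12341234.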
*/ 647 if (t_32) { 648 for (i = 0; i < oprsz; i += 4) { 649 tcg_gen_st_i32(t_32, dbase, dofs + i); 650 } 651 tcg_temp_free_i32(t_32); 652 goto done; 653 } 654 if (t_64) { 655 for (i = 0; i < oprsz; i += 8) { 656 tcg_gen_st_i64(t_64, dbase, dofs + i); 657 } 658 tcg_temp_free_i64(t_64); 659 goto done; 660 } 661 } 662 663 /* Otherwise implement out of line. */ 664 t_ptr = tcg_temp_ebb_new_ptr(); 665 tcg_gen_addi_ptr(t_ptr, dbase, dofs); 666 667 /* 668 * This may be expand_clr for the tail of an operation, e.g. 669 * oprsz == 8 && maxsz == 64. The size of the clear is misaligned 670 * wrt simd_desc and will assert. Simply pass all replicated byte 671 * stores through to memset. 672 */ 673 if (oprsz == maxsz && vece == MO_8) { 674 TCGv_ptr t_size = tcg_constant_ptr(oprsz); 675 TCGv_i32 t_val; 676 677 if (in_32) { 678 t_val = in_32; 679 } else if (in_64) { 680 t_val = tcg_temp_ebb_new_i32(); 681 tcg_gen_extrl_i64_i32(t_val, in_64); 682 } else { 683 t_val = tcg_constant_i32(in_c); 684 } 685 gen_helper_memset(t_ptr, t_ptr, t_val, t_size); 686 687 if (in_64) { 688 tcg_temp_free_i32(t_val); 689 } 690 tcg_temp_free_ptr(t_ptr); 691 return; 692 } 693 694 t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0)); 695 696 if (vece == MO_64) { 697 if (in_64) { 698 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 699 } else { 700 t_64 = tcg_constant_i64(in_c); 701 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 702 } 703 } else { 704 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 705 static dup_fn * const fns[3] = { 706 gen_helper_gvec_dup8, 707 gen_helper_gvec_dup16, 708 gen_helper_gvec_dup32 709 }; 710 711 if (in_32) { 712 fns[vece](t_ptr, t_desc, in_32); 713 } else if (in_64) { 714 t_32 = tcg_temp_ebb_new_i32(); 715 tcg_gen_extrl_i64_i32(t_32, in_64); 716 fns[vece](t_ptr, t_desc, t_32); 717 tcg_temp_free_i32(t_32); 718 } else { 719 if (vece == MO_8) { 720 in_c &= 0xff; 721 } else if (vece == MO_16) { 722 in_c &= 0xffff; 723 } 724 t_32 = tcg_constant_i32(in_c); 725 fns[vece](t_ptr, t_desc, t_32); 726 } 727 } 728 729 tcg_temp_free_ptr(t_ptr); 730 return; 731 732 done: 733 if (oprsz < maxsz) { 734 expand_clr(dbase, dofs + oprsz, maxsz - oprsz); 735 } 736 } 737 738 /* Likewise, but with zero. */ 739 static void expand_clr(TCGv_ptr dbase, uint32_t dofs, uint32_t maxsz) 740 { 741 do_dup(MO_8, dbase, dofs, maxsz, maxsz, NULL, NULL, 0); 742 } 743 744 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. 
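   When LOAD_DEST is set, the current destination element is preloaded
   into the temp passed as FNI's output argument, so accumulating
   operations can read it; otherwise that temp is write-only.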
*/ 745 static void expand_2_i32(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase, 746 uint32_t aofs, uint32_t oprsz, bool load_dest, 747 void (*fni)(TCGv_i32, TCGv_i32)) 748 { 749 TCGv_i32 t0 = tcg_temp_new_i32(); 750 TCGv_i32 t1 = tcg_temp_new_i32(); 751 uint32_t i; 752 753 for (i = 0; i < oprsz; i += 4) { 754 tcg_gen_ld_i32(t0, abase, aofs + i); 755 if (load_dest) { 756 tcg_gen_ld_i32(t1, dbase, dofs + i); 757 } 758 fni(t1, t0); 759 tcg_gen_st_i32(t1, dbase, dofs + i); 760 } 761 tcg_temp_free_i32(t0); 762 tcg_temp_free_i32(t1); 763 } 764 765 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 766 int32_t c, bool load_dest, 767 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 768 { 769 TCGv_i32 t0 = tcg_temp_new_i32(); 770 TCGv_i32 t1 = tcg_temp_new_i32(); 771 uint32_t i; 772 773 for (i = 0; i < oprsz; i += 4) { 774 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 775 if (load_dest) { 776 tcg_gen_ld_i32(t1, tcg_env, dofs + i); 777 } 778 fni(t1, t0, c); 779 tcg_gen_st_i32(t1, tcg_env, dofs + i); 780 } 781 tcg_temp_free_i32(t0); 782 tcg_temp_free_i32(t1); 783 } 784 785 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 786 TCGv_i32 c, bool scalar_first, 787 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 788 { 789 TCGv_i32 t0 = tcg_temp_new_i32(); 790 TCGv_i32 t1 = tcg_temp_new_i32(); 791 uint32_t i; 792 793 for (i = 0; i < oprsz; i += 4) { 794 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 795 if (scalar_first) { 796 fni(t1, c, t0); 797 } else { 798 fni(t1, t0, c); 799 } 800 tcg_gen_st_i32(t1, tcg_env, dofs + i); 801 } 802 tcg_temp_free_i32(t0); 803 tcg_temp_free_i32(t1); 804 } 805 806 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 807 static void expand_3_i32(TCGv_ptr dbase, uint32_t dofs, 808 TCGv_ptr abase, uint32_t aofs, 809 TCGv_ptr bbase, uint32_t bofs, 810 uint32_t oprsz, bool load_dest, 811 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 812 { 813 TCGv_i32 t0 = tcg_temp_new_i32(); 814 TCGv_i32 t1 = tcg_temp_new_i32(); 815 TCGv_i32 t2 = tcg_temp_new_i32(); 816 uint32_t i; 817 818 for (i = 0; i < oprsz; i += 4) { 819 tcg_gen_ld_i32(t0, abase, aofs + i); 820 tcg_gen_ld_i32(t1, bbase, bofs + i); 821 if (load_dest) { 822 tcg_gen_ld_i32(t2, dbase, dofs + i); 823 } 824 fni(t2, t0, t1); 825 tcg_gen_st_i32(t2, dbase, dofs + i); 826 } 827 tcg_temp_free_i32(t2); 828 tcg_temp_free_i32(t1); 829 tcg_temp_free_i32(t0); 830 } 831 832 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 833 uint32_t oprsz, int32_t c, 834 bool load_dest, bool write_aofs, 835 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t)) 836 { 837 TCGv_i32 t0 = tcg_temp_new_i32(); 838 TCGv_i32 t1 = tcg_temp_new_i32(); 839 TCGv_i32 t2 = tcg_temp_new_i32(); 840 uint32_t i; 841 842 for (i = 0; i < oprsz; i += 4) { 843 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 844 tcg_gen_ld_i32(t1, tcg_env, bofs + i); 845 if (load_dest) { 846 tcg_gen_ld_i32(t2, tcg_env, dofs + i); 847 } 848 fni(t2, t0, t1, c); 849 tcg_gen_st_i32(t2, tcg_env, dofs + i); 850 if (write_aofs) { 851 tcg_gen_st_i32(t0, tcg_env, aofs + i); 852 } 853 } 854 tcg_temp_free_i32(t0); 855 tcg_temp_free_i32(t1); 856 tcg_temp_free_i32(t2); 857 } 858 859 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
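   In fact four operands are involved: FNI writes D from inputs A, B and
   C, and A is additionally stored back when WRITE_AOFS is set.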
*/ 860 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 861 uint32_t cofs, uint32_t oprsz, bool write_aofs, 862 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) 863 { 864 TCGv_i32 t0 = tcg_temp_new_i32(); 865 TCGv_i32 t1 = tcg_temp_new_i32(); 866 TCGv_i32 t2 = tcg_temp_new_i32(); 867 TCGv_i32 t3 = tcg_temp_new_i32(); 868 uint32_t i; 869 870 for (i = 0; i < oprsz; i += 4) { 871 tcg_gen_ld_i32(t1, tcg_env, aofs + i); 872 tcg_gen_ld_i32(t2, tcg_env, bofs + i); 873 tcg_gen_ld_i32(t3, tcg_env, cofs + i); 874 fni(t0, t1, t2, t3); 875 tcg_gen_st_i32(t0, tcg_env, dofs + i); 876 if (write_aofs) { 877 tcg_gen_st_i32(t1, tcg_env, aofs + i); 878 } 879 } 880 tcg_temp_free_i32(t3); 881 tcg_temp_free_i32(t2); 882 tcg_temp_free_i32(t1); 883 tcg_temp_free_i32(t0); 884 } 885 886 static void expand_4i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 887 uint32_t cofs, uint32_t oprsz, int32_t c, 888 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32, 889 int32_t)) 890 { 891 TCGv_i32 t0 = tcg_temp_new_i32(); 892 TCGv_i32 t1 = tcg_temp_new_i32(); 893 TCGv_i32 t2 = tcg_temp_new_i32(); 894 TCGv_i32 t3 = tcg_temp_new_i32(); 895 uint32_t i; 896 897 for (i = 0; i < oprsz; i += 4) { 898 tcg_gen_ld_i32(t1, tcg_env, aofs + i); 899 tcg_gen_ld_i32(t2, tcg_env, bofs + i); 900 tcg_gen_ld_i32(t3, tcg_env, cofs + i); 901 fni(t0, t1, t2, t3, c); 902 tcg_gen_st_i32(t0, tcg_env, dofs + i); 903 } 904 tcg_temp_free_i32(t3); 905 tcg_temp_free_i32(t2); 906 tcg_temp_free_i32(t1); 907 tcg_temp_free_i32(t0); 908 } 909 910 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ 911 static void expand_2_i64(TCGv_ptr dbase, uint32_t dofs, TCGv_ptr abase, 912 uint32_t aofs, uint32_t oprsz, bool load_dest, 913 void (*fni)(TCGv_i64, TCGv_i64)) 914 { 915 TCGv_i64 t0 = tcg_temp_new_i64(); 916 TCGv_i64 t1 = tcg_temp_new_i64(); 917 uint32_t i; 918 919 for (i = 0; i < oprsz; i += 8) { 920 tcg_gen_ld_i64(t0, abase, aofs + i); 921 if (load_dest) { 922 tcg_gen_ld_i64(t1, dbase, dofs + i); 923 } 924 fni(t1, t0); 925 tcg_gen_st_i64(t1, dbase, dofs + i); 926 } 927 tcg_temp_free_i64(t0); 928 tcg_temp_free_i64(t1); 929 } 930 931 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 932 int64_t c, bool load_dest, 933 void (*fni)(TCGv_i64, TCGv_i64, int64_t)) 934 { 935 TCGv_i64 t0 = tcg_temp_new_i64(); 936 TCGv_i64 t1 = tcg_temp_new_i64(); 937 uint32_t i; 938 939 for (i = 0; i < oprsz; i += 8) { 940 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 941 if (load_dest) { 942 tcg_gen_ld_i64(t1, tcg_env, dofs + i); 943 } 944 fni(t1, t0, c); 945 tcg_gen_st_i64(t1, tcg_env, dofs + i); 946 } 947 tcg_temp_free_i64(t0); 948 tcg_temp_free_i64(t1); 949 } 950 951 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 952 TCGv_i64 c, bool scalar_first, 953 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 954 { 955 TCGv_i64 t0 = tcg_temp_new_i64(); 956 TCGv_i64 t1 = tcg_temp_new_i64(); 957 uint32_t i; 958 959 for (i = 0; i < oprsz; i += 8) { 960 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 961 if (scalar_first) { 962 fni(t1, c, t0); 963 } else { 964 fni(t1, t0, c); 965 } 966 tcg_gen_st_i64(t1, tcg_env, dofs + i); 967 } 968 tcg_temp_free_i64(t0); 969 tcg_temp_free_i64(t1); 970 } 971 972 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. 
*/ 973 static void expand_3_i64(TCGv_ptr dbase, uint32_t dofs, 974 TCGv_ptr abase, uint32_t aofs, 975 TCGv_ptr bbase, uint32_t bofs, 976 uint32_t oprsz, bool load_dest, 977 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 978 { 979 TCGv_i64 t0 = tcg_temp_new_i64(); 980 TCGv_i64 t1 = tcg_temp_new_i64(); 981 TCGv_i64 t2 = tcg_temp_new_i64(); 982 uint32_t i; 983 984 for (i = 0; i < oprsz; i += 8) { 985 tcg_gen_ld_i64(t0, abase, aofs + i); 986 tcg_gen_ld_i64(t1, bbase, bofs + i); 987 if (load_dest) { 988 tcg_gen_ld_i64(t2, dbase, dofs + i); 989 } 990 fni(t2, t0, t1); 991 tcg_gen_st_i64(t2, dbase, dofs + i); 992 } 993 tcg_temp_free_i64(t2); 994 tcg_temp_free_i64(t1); 995 tcg_temp_free_i64(t0); 996 } 997 998 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 999 uint32_t oprsz, int64_t c, 1000 bool load_dest, bool write_aofs, 1001 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t)) 1002 { 1003 TCGv_i64 t0 = tcg_temp_new_i64(); 1004 TCGv_i64 t1 = tcg_temp_new_i64(); 1005 TCGv_i64 t2 = tcg_temp_new_i64(); 1006 uint32_t i; 1007 1008 for (i = 0; i < oprsz; i += 8) { 1009 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 1010 tcg_gen_ld_i64(t1, tcg_env, bofs + i); 1011 if (load_dest) { 1012 tcg_gen_ld_i64(t2, tcg_env, dofs + i); 1013 } 1014 fni(t2, t0, t1, c); 1015 tcg_gen_st_i64(t2, tcg_env, dofs + i); 1016 if (write_aofs) { 1017 tcg_gen_st_i64(t0, tcg_env, aofs + i); 1018 } 1019 } 1020 tcg_temp_free_i64(t0); 1021 tcg_temp_free_i64(t1); 1022 tcg_temp_free_i64(t2); 1023 } 1024 1025 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ 1026 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1027 uint32_t cofs, uint32_t oprsz, bool write_aofs, 1028 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 1029 { 1030 TCGv_i64 t0 = tcg_temp_new_i64(); 1031 TCGv_i64 t1 = tcg_temp_new_i64(); 1032 TCGv_i64 t2 = tcg_temp_new_i64(); 1033 TCGv_i64 t3 = tcg_temp_new_i64(); 1034 uint32_t i; 1035 1036 for (i = 0; i < oprsz; i += 8) { 1037 tcg_gen_ld_i64(t1, tcg_env, aofs + i); 1038 tcg_gen_ld_i64(t2, tcg_env, bofs + i); 1039 tcg_gen_ld_i64(t3, tcg_env, cofs + i); 1040 fni(t0, t1, t2, t3); 1041 tcg_gen_st_i64(t0, tcg_env, dofs + i); 1042 if (write_aofs) { 1043 tcg_gen_st_i64(t1, tcg_env, aofs + i); 1044 } 1045 } 1046 tcg_temp_free_i64(t3); 1047 tcg_temp_free_i64(t2); 1048 tcg_temp_free_i64(t1); 1049 tcg_temp_free_i64(t0); 1050 } 1051 1052 static void expand_4i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1053 uint32_t cofs, uint32_t oprsz, int64_t c, 1054 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, 1055 int64_t)) 1056 { 1057 TCGv_i64 t0 = tcg_temp_new_i64(); 1058 TCGv_i64 t1 = tcg_temp_new_i64(); 1059 TCGv_i64 t2 = tcg_temp_new_i64(); 1060 TCGv_i64 t3 = tcg_temp_new_i64(); 1061 uint32_t i; 1062 1063 for (i = 0; i < oprsz; i += 8) { 1064 tcg_gen_ld_i64(t1, tcg_env, aofs + i); 1065 tcg_gen_ld_i64(t2, tcg_env, bofs + i); 1066 tcg_gen_ld_i64(t3, tcg_env, cofs + i); 1067 fni(t0, t1, t2, t3, c); 1068 tcg_gen_st_i64(t0, tcg_env, dofs + i); 1069 } 1070 tcg_temp_free_i64(t3); 1071 tcg_temp_free_i64(t2); 1072 tcg_temp_free_i64(t1); 1073 tcg_temp_free_i64(t0); 1074 } 1075 1076 /* Expand OPSZ bytes worth of two-operand operations using host vectors. 
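   TYSZ is the byte width of TYPE (8 for V64, 16 for V128, 32 for V256);
   in practice OPRSZ is a multiple of TYSZ by the time this is called, so
   each iteration handles exactly one host vector.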
*/ 1077 static void expand_2_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 1078 TCGv_ptr abase, uint32_t aofs, 1079 uint32_t oprsz, uint32_t tysz, TCGType type, 1080 bool load_dest, 1081 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 1082 { 1083 for (uint32_t i = 0; i < oprsz; i += tysz) { 1084 TCGv_vec t0 = tcg_temp_new_vec(type); 1085 TCGv_vec t1 = tcg_temp_new_vec(type); 1086 1087 tcg_gen_ld_vec(t0, abase, aofs + i); 1088 if (load_dest) { 1089 tcg_gen_ld_vec(t1, dbase, dofs + i); 1090 } 1091 fni(vece, t1, t0); 1092 tcg_gen_st_vec(t1, dbase, dofs + i); 1093 } 1094 } 1095 1096 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 1097 using host vectors. */ 1098 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1099 uint32_t oprsz, uint32_t tysz, TCGType type, 1100 int64_t c, bool load_dest, 1101 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 1102 { 1103 for (uint32_t i = 0; i < oprsz; i += tysz) { 1104 TCGv_vec t0 = tcg_temp_new_vec(type); 1105 TCGv_vec t1 = tcg_temp_new_vec(type); 1106 1107 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1108 if (load_dest) { 1109 tcg_gen_ld_vec(t1, tcg_env, dofs + i); 1110 } 1111 fni(vece, t1, t0, c); 1112 tcg_gen_st_vec(t1, tcg_env, dofs + i); 1113 } 1114 } 1115 1116 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1117 uint32_t oprsz, uint32_t tysz, TCGType type, 1118 TCGv_vec c, bool scalar_first, 1119 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1120 { 1121 for (uint32_t i = 0; i < oprsz; i += tysz) { 1122 TCGv_vec t0 = tcg_temp_new_vec(type); 1123 TCGv_vec t1 = tcg_temp_new_vec(type); 1124 1125 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1126 if (scalar_first) { 1127 fni(vece, t1, c, t0); 1128 } else { 1129 fni(vece, t1, t0, c); 1130 } 1131 tcg_gen_st_vec(t1, tcg_env, dofs + i); 1132 } 1133 } 1134 1135 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 1136 static void expand_3_vec(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 1137 TCGv_ptr abase, uint32_t aofs, 1138 TCGv_ptr bbase, uint32_t bofs, uint32_t oprsz, 1139 uint32_t tysz, TCGType type, bool load_dest, 1140 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1141 { 1142 for (uint32_t i = 0; i < oprsz; i += tysz) { 1143 TCGv_vec t0 = tcg_temp_new_vec(type); 1144 TCGv_vec t1 = tcg_temp_new_vec(type); 1145 TCGv_vec t2 = tcg_temp_new_vec(type); 1146 1147 tcg_gen_ld_vec(t0, abase, aofs + i); 1148 tcg_gen_ld_vec(t1, bbase, bofs + i); 1149 if (load_dest) { 1150 tcg_gen_ld_vec(t2, dbase, dofs + i); 1151 } 1152 fni(vece, t2, t0, t1); 1153 tcg_gen_st_vec(t2, dbase, dofs + i); 1154 } 1155 } 1156 1157 /* 1158 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1159 * using host vectors. 
1160 */ 1161 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1162 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1163 TCGType type, int64_t c, 1164 bool load_dest, bool write_aofs, 1165 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1166 int64_t)) 1167 { 1168 for (uint32_t i = 0; i < oprsz; i += tysz) { 1169 TCGv_vec t0 = tcg_temp_new_vec(type); 1170 TCGv_vec t1 = tcg_temp_new_vec(type); 1171 TCGv_vec t2 = tcg_temp_new_vec(type); 1172 1173 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 1174 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 1175 if (load_dest) { 1176 tcg_gen_ld_vec(t2, tcg_env, dofs + i); 1177 } 1178 fni(vece, t2, t0, t1, c); 1179 tcg_gen_st_vec(t2, tcg_env, dofs + i); 1180 if (write_aofs) { 1181 tcg_gen_st_vec(t0, tcg_env, aofs + i); 1182 } 1183 } 1184 } 1185 1186 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */ 1187 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1188 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1189 uint32_t tysz, TCGType type, bool write_aofs, 1190 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1191 TCGv_vec, TCGv_vec)) 1192 { 1193 for (uint32_t i = 0; i < oprsz; i += tysz) { 1194 TCGv_vec t0 = tcg_temp_new_vec(type); 1195 TCGv_vec t1 = tcg_temp_new_vec(type); 1196 TCGv_vec t2 = tcg_temp_new_vec(type); 1197 TCGv_vec t3 = tcg_temp_new_vec(type); 1198 1199 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 1200 tcg_gen_ld_vec(t2, tcg_env, bofs + i); 1201 tcg_gen_ld_vec(t3, tcg_env, cofs + i); 1202 fni(vece, t0, t1, t2, t3); 1203 tcg_gen_st_vec(t0, tcg_env, dofs + i); 1204 if (write_aofs) { 1205 tcg_gen_st_vec(t1, tcg_env, aofs + i); 1206 } 1207 } 1208 } 1209 1210 /* 1211 * Expand OPSZ bytes worth of four-vector operands and an immediate operand 1212 * using host vectors. 1213 */ 1214 static void expand_4i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1215 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1216 uint32_t tysz, TCGType type, int64_t c, 1217 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1218 TCGv_vec, TCGv_vec, int64_t)) 1219 { 1220 for (uint32_t i = 0; i < oprsz; i += tysz) { 1221 TCGv_vec t0 = tcg_temp_new_vec(type); 1222 TCGv_vec t1 = tcg_temp_new_vec(type); 1223 TCGv_vec t2 = tcg_temp_new_vec(type); 1224 TCGv_vec t3 = tcg_temp_new_vec(type); 1225 1226 tcg_gen_ld_vec(t1, tcg_env, aofs + i); 1227 tcg_gen_ld_vec(t2, tcg_env, bofs + i); 1228 tcg_gen_ld_vec(t3, tcg_env, cofs + i); 1229 fni(vece, t0, t1, t2, t3, c); 1230 tcg_gen_st_vec(t0, tcg_env, dofs + i); 1231 } 1232 } 1233 1234 /* Expand a vector two-operand operation. */ 1235 void tcg_gen_gvec_2_var(TCGv_ptr dbase, uint32_t dofs, 1236 TCGv_ptr abase, uint32_t aofs, 1237 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1238 { 1239 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1240 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1241 TCGType type; 1242 uint32_t some; 1243 1244 check_size_align(oprsz, maxsz, dofs | aofs); 1245 check_overlap_2(dbase, dofs, abase, aofs, maxsz); 1246 1247 type = 0; 1248 if (g->fniv) { 1249 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1250 } 1251 switch (type) { 1252 case TCG_TYPE_V256: 1253 /* Recall that ARM SVE allows vector sizes that are not a 1254 * power of 2, but always a multiple of 16. The intent is 1255 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
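     * For oprsz == 80, QEMU_ALIGN_DOWN(80, 32) == 64, so the V256
     * expansion covers the first 64 bytes and the V128 case below,
     * reached by fallthrough, covers the remaining 16.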
1256 */ 1257 some = QEMU_ALIGN_DOWN(oprsz, 32); 1258 expand_2_vec(g->vece, dbase, dofs, abase, aofs, some, 32, 1259 TCG_TYPE_V256, g->load_dest, g->fniv); 1260 if (some == oprsz) { 1261 break; 1262 } 1263 dofs += some; 1264 aofs += some; 1265 oprsz -= some; 1266 maxsz -= some; 1267 /* fallthru */ 1268 case TCG_TYPE_V128: 1269 expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 16, 1270 TCG_TYPE_V128, g->load_dest, g->fniv); 1271 break; 1272 case TCG_TYPE_V64: 1273 expand_2_vec(g->vece, dbase, dofs, abase, aofs, oprsz, 8, 1274 TCG_TYPE_V64, g->load_dest, g->fniv); 1275 break; 1276 1277 case 0: 1278 if (g->fni8 && check_size_impl(oprsz, 8)) { 1279 expand_2_i64(dbase, dofs, abase, aofs, 1280 oprsz, g->load_dest, g->fni8); 1281 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1282 expand_2_i32(dbase, dofs, abase, aofs, 1283 oprsz, g->load_dest, g->fni4); 1284 } else { 1285 assert(g->fno != NULL); 1286 expand_2_ool(dbase, dofs, abase, aofs, 1287 oprsz, maxsz, g->data, g->fno); 1288 oprsz = maxsz; 1289 } 1290 break; 1291 1292 default: 1293 g_assert_not_reached(); 1294 } 1295 tcg_swap_vecop_list(hold_list); 1296 1297 if (oprsz < maxsz) { 1298 expand_clr(dbase, dofs + oprsz, maxsz - oprsz); 1299 } 1300 } 1301 1302 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1303 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1304 { 1305 tcg_gen_gvec_2_var(tcg_env, dofs, tcg_env, aofs, oprsz, maxsz, g); 1306 } 1307 1308 /* Expand a vector operation with two vectors and an immediate. */ 1309 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1310 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1311 { 1312 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1313 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1314 TCGType type; 1315 uint32_t some; 1316 1317 check_size_align(oprsz, maxsz, dofs | aofs); 1318 check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz); 1319 1320 type = 0; 1321 if (g->fniv) { 1322 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1323 } 1324 switch (type) { 1325 case TCG_TYPE_V256: 1326 /* Recall that ARM SVE allows vector sizes that are not a 1327 * power of 2, but always a multiple of 16. The intent is 1328 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1329 */ 1330 some = QEMU_ALIGN_DOWN(oprsz, 32); 1331 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1332 c, g->load_dest, g->fniv); 1333 if (some == oprsz) { 1334 break; 1335 } 1336 dofs += some; 1337 aofs += some; 1338 oprsz -= some; 1339 maxsz -= some; 1340 /* fallthru */ 1341 case TCG_TYPE_V128: 1342 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1343 c, g->load_dest, g->fniv); 1344 break; 1345 case TCG_TYPE_V64: 1346 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1347 c, g->load_dest, g->fniv); 1348 break; 1349 1350 case 0: 1351 if (g->fni8 && check_size_impl(oprsz, 8)) { 1352 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1353 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1354 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1355 } else { 1356 if (g->fno) { 1357 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1358 } else { 1359 TCGv_i64 tcg_c = tcg_constant_i64(c); 1360 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1361 maxsz, c, g->fnoi); 1362 } 1363 oprsz = maxsz; 1364 } 1365 break; 1366 1367 default: 1368 g_assert_not_reached(); 1369 } 1370 tcg_swap_vecop_list(hold_list); 1371 1372 if (oprsz < maxsz) { 1373 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1374 } 1375 } 1376 1377 /* Expand a vector operation with two vectors and a scalar. */ 1378 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1379 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1380 { 1381 TCGType type; 1382 1383 check_size_align(oprsz, maxsz, dofs | aofs); 1384 check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz); 1385 1386 type = 0; 1387 if (g->fniv) { 1388 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1389 } 1390 if (type != 0) { 1391 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1392 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1393 TCGv_vec t_vec = tcg_temp_new_vec(type); 1394 uint32_t some; 1395 1396 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1397 1398 switch (type) { 1399 case TCG_TYPE_V256: 1400 /* Recall that ARM SVE allows vector sizes that are not a 1401 * power of 2, but always a multiple of 16. The intent is 1402 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1403 */ 1404 some = QEMU_ALIGN_DOWN(oprsz, 32); 1405 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1406 t_vec, g->scalar_first, g->fniv); 1407 if (some == oprsz) { 1408 break; 1409 } 1410 dofs += some; 1411 aofs += some; 1412 oprsz -= some; 1413 maxsz -= some; 1414 /* fallthru */ 1415 1416 case TCG_TYPE_V128: 1417 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1418 t_vec, g->scalar_first, g->fniv); 1419 break; 1420 1421 case TCG_TYPE_V64: 1422 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1423 t_vec, g->scalar_first, g->fniv); 1424 break; 1425 1426 default: 1427 g_assert_not_reached(); 1428 } 1429 tcg_temp_free_vec(t_vec); 1430 tcg_swap_vecop_list(hold_list); 1431 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1432 TCGv_i64 t64 = tcg_temp_new_i64(); 1433 1434 tcg_gen_dup_i64(g->vece, t64, c); 1435 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1436 tcg_temp_free_i64(t64); 1437 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1438 TCGv_i32 t32 = tcg_temp_new_i32(); 1439 1440 tcg_gen_extrl_i64_i32(t32, c); 1441 tcg_gen_dup_i32(g->vece, t32, t32); 1442 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1443 tcg_temp_free_i32(t32); 1444 } else { 1445 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1446 return; 1447 } 1448 1449 if (oprsz < maxsz) { 1450 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1451 } 1452 } 1453 1454 /* Expand a vector three-operand operation. */ 1455 void tcg_gen_gvec_3_var(TCGv_ptr dbase, uint32_t dofs, 1456 TCGv_ptr abase, uint32_t aofs, 1457 TCGv_ptr bbase, uint32_t bofs, 1458 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1459 { 1460 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1461 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1462 TCGType type; 1463 uint32_t some; 1464 1465 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1466 check_overlap_3(dbase, dofs, abase, aofs, bbase, bofs, maxsz); 1467 1468 type = 0; 1469 if (g->fniv) { 1470 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1471 } 1472 switch (type) { 1473 case TCG_TYPE_V256: 1474 /* Recall that ARM SVE allows vector sizes that are not a 1475 * power of 2, but always a multiple of 16. The intent is 1476 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1477 */ 1478 some = QEMU_ALIGN_DOWN(oprsz, 32); 1479 expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs, 1480 some, 32, TCG_TYPE_V256, g->load_dest, g->fniv); 1481 if (some == oprsz) { 1482 break; 1483 } 1484 dofs += some; 1485 aofs += some; 1486 bofs += some; 1487 oprsz -= some; 1488 maxsz -= some; 1489 /* fallthru */ 1490 case TCG_TYPE_V128: 1491 expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs, 1492 oprsz, 16, TCG_TYPE_V128, g->load_dest, g->fniv); 1493 break; 1494 case TCG_TYPE_V64: 1495 expand_3_vec(g->vece, dbase, dofs, abase, aofs, bbase, bofs, 1496 oprsz, 8, TCG_TYPE_V64, g->load_dest, g->fniv); 1497 break; 1498 1499 case 0: 1500 if (g->fni8 && check_size_impl(oprsz, 8)) { 1501 expand_3_i64(dbase, dofs, abase, aofs, bbase, bofs, 1502 oprsz, g->load_dest, g->fni8); 1503 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1504 expand_3_i32(dbase, dofs, abase, aofs, bbase, bofs, 1505 oprsz, g->load_dest, g->fni4); 1506 } else { 1507 assert(g->fno != NULL); 1508 expand_3_ool(dbase, dofs, abase, aofs, bbase, bofs, 1509 oprsz, maxsz, g->data, g->fno); 1510 oprsz = maxsz; 1511 } 1512 break; 1513 1514 default: 1515 g_assert_not_reached(); 1516 } 1517 tcg_swap_vecop_list(hold_list); 1518 1519 if (oprsz < maxsz) { 1520 expand_clr(dbase, dofs + oprsz, maxsz - oprsz); 1521 } 1522 } 1523 1524 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1525 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1526 { 1527 tcg_gen_gvec_3_var(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, 1528 oprsz, maxsz, g); 1529 } 1530 1531 /* Expand a vector operation with three vectors and an immediate. */ 1532 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1533 uint32_t oprsz, uint32_t maxsz, int64_t c, 1534 const GVecGen3i *g) 1535 { 1536 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1537 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1538 TCGType type; 1539 uint32_t some; 1540 1541 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1542 check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz); 1543 1544 type = 0; 1545 if (g->fniv) { 1546 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1547 } 1548 switch (type) { 1549 case TCG_TYPE_V256: 1550 /* 1551 * Recall that ARM SVE allows vector sizes that are not a 1552 * power of 2, but always a multiple of 16. The intent is 1553 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1554 */ 1555 some = QEMU_ALIGN_DOWN(oprsz, 32); 1556 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1557 c, g->load_dest, g->write_aofs, g->fniv); 1558 if (some == oprsz) { 1559 break; 1560 } 1561 dofs += some; 1562 aofs += some; 1563 bofs += some; 1564 oprsz -= some; 1565 maxsz -= some; 1566 /* fallthru */ 1567 case TCG_TYPE_V128: 1568 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1569 c, g->load_dest, g->write_aofs, g->fniv); 1570 break; 1571 case TCG_TYPE_V64: 1572 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1573 c, g->load_dest, g->write_aofs, g->fniv); 1574 break; 1575 1576 case 0: 1577 if (g->fni8 && check_size_impl(oprsz, 8)) { 1578 expand_3i_i64(dofs, aofs, bofs, oprsz, c, 1579 g->load_dest, g->write_aofs, g->fni8); 1580 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1581 expand_3i_i32(dofs, aofs, bofs, oprsz, c, 1582 g->load_dest, g->write_aofs, g->fni4); 1583 } else { 1584 assert(g->fno != NULL); 1585 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1586 oprsz = maxsz; 1587 } 1588 break; 1589 1590 default: 1591 g_assert_not_reached(); 1592 } 1593 tcg_swap_vecop_list(hold_list); 1594 1595 if (oprsz < maxsz) { 1596 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1597 } 1598 } 1599 1600 /* Expand a vector four-operand operation. */ 1601 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1602 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1603 { 1604 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1605 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1606 TCGType type; 1607 uint32_t some; 1608 1609 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1610 check_overlap_4(tcg_env, dofs, tcg_env, aofs, 1611 tcg_env, bofs, tcg_env, cofs, maxsz); 1612 1613 type = 0; 1614 if (g->fniv) { 1615 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1616 } 1617 switch (type) { 1618 case TCG_TYPE_V256: 1619 /* Recall that ARM SVE allows vector sizes that are not a 1620 * power of 2, but always a multiple of 16. The intent is 1621 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1622 */ 1623 some = QEMU_ALIGN_DOWN(oprsz, 32); 1624 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1625 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1626 if (some == oprsz) { 1627 break; 1628 } 1629 dofs += some; 1630 aofs += some; 1631 bofs += some; 1632 cofs += some; 1633 oprsz -= some; 1634 maxsz -= some; 1635 /* fallthru */ 1636 case TCG_TYPE_V128: 1637 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1638 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1639 break; 1640 case TCG_TYPE_V64: 1641 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1642 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1643 break; 1644 1645 case 0: 1646 if (g->fni8 && check_size_impl(oprsz, 8)) { 1647 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1648 g->write_aofs, g->fni8); 1649 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1650 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1651 g->write_aofs, g->fni4); 1652 } else { 1653 assert(g->fno != NULL); 1654 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1655 oprsz, maxsz, g->data, g->fno); 1656 oprsz = maxsz; 1657 } 1658 break; 1659 1660 default: 1661 g_assert_not_reached(); 1662 } 1663 tcg_swap_vecop_list(hold_list); 1664 1665 if (oprsz < maxsz) { 1666 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1667 } 1668 } 1669 1670 /* Expand a vector four-operand operation. 
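   This variant additionally takes an immediate C, forwarded to each
   per-element callback and, for the out-of-line fallback, passed as the
   descriptor's DATA field.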
*/ 1671 void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1672 uint32_t oprsz, uint32_t maxsz, int64_t c, 1673 const GVecGen4i *g) 1674 { 1675 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1676 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1677 TCGType type; 1678 uint32_t some; 1679 1680 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1681 check_overlap_4(tcg_env, dofs, tcg_env, aofs, 1682 tcg_env, bofs, tcg_env, cofs, maxsz); 1683 1684 type = 0; 1685 if (g->fniv) { 1686 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1687 } 1688 switch (type) { 1689 case TCG_TYPE_V256: 1690 /* 1691 * Recall that ARM SVE allows vector sizes that are not a 1692 * power of 2, but always a multiple of 16. The intent is 1693 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1694 */ 1695 some = QEMU_ALIGN_DOWN(oprsz, 32); 1696 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, some, 1697 32, TCG_TYPE_V256, c, g->fniv); 1698 if (some == oprsz) { 1699 break; 1700 } 1701 dofs += some; 1702 aofs += some; 1703 bofs += some; 1704 cofs += some; 1705 oprsz -= some; 1706 maxsz -= some; 1707 /* fallthru */ 1708 case TCG_TYPE_V128: 1709 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1710 16, TCG_TYPE_V128, c, g->fniv); 1711 break; 1712 case TCG_TYPE_V64: 1713 expand_4i_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1714 8, TCG_TYPE_V64, c, g->fniv); 1715 break; 1716 1717 case 0: 1718 if (g->fni8 && check_size_impl(oprsz, 8)) { 1719 expand_4i_i64(dofs, aofs, bofs, cofs, oprsz, c, g->fni8); 1720 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1721 expand_4i_i32(dofs, aofs, bofs, cofs, oprsz, c, g->fni4); 1722 } else { 1723 assert(g->fno != NULL); 1724 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1725 oprsz, maxsz, c, g->fno); 1726 oprsz = maxsz; 1727 } 1728 break; 1729 1730 default: 1731 g_assert_not_reached(); 1732 } 1733 tcg_swap_vecop_list(hold_list); 1734 1735 if (oprsz < maxsz) { 1736 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1737 } 1738 } 1739 1740 /* 1741 * Expand specific vector operations. 
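 *
 * Each wrapper below fills in a GVecGen* descriptor: an integer fallback
 * (fni4/fni8), a host-vector implementation (fniv), an out-of-line
 * helper (fno), and, when the vector opcode is optional, the opcode
 * list the backend must support (opt_opc) plus the element size (vece),
 * as the add wrappers further below do.  A minimal sketch of the
 * pattern, with tcg_gen_frob_{i64,vec} and gen_helper_gvec_frob as
 * hypothetical stand-ins:
 *
 *     static const GVecGen2 g = {
 *         .fni8 = tcg_gen_frob_i64,
 *         .fniv = tcg_gen_frob_vec,
 *         .fno = gen_helper_gvec_frob,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);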
1742 */ 1743 1744 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1745 { 1746 tcg_gen_mov_vec(a, b); 1747 } 1748 1749 void tcg_gen_gvec_mov_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 1750 TCGv_ptr abase, uint32_t aofs, 1751 uint32_t oprsz, uint32_t maxsz) 1752 { 1753 static const GVecGen2 g = { 1754 .fni8 = tcg_gen_mov_i64, 1755 .fniv = vec_mov2, 1756 .fno = gen_helper_gvec_mov, 1757 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1758 }; 1759 1760 if (dofs == aofs && dbase == abase) { 1761 check_size_align(oprsz, maxsz, dofs); 1762 if (oprsz < maxsz) { 1763 expand_clr(dbase, dofs + oprsz, maxsz - oprsz); 1764 } 1765 return; 1766 } 1767 1768 tcg_gen_gvec_2_var(dbase, dofs, abase, aofs, oprsz, maxsz, &g); 1769 } 1770 1771 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1772 uint32_t oprsz, uint32_t maxsz) 1773 { 1774 tcg_gen_gvec_mov_var(vece, tcg_env, dofs, tcg_env, aofs, oprsz, maxsz); 1775 } 1776 1777 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1778 uint32_t maxsz, TCGv_i32 in) 1779 { 1780 check_size_align(oprsz, maxsz, dofs); 1781 tcg_debug_assert(vece <= MO_32); 1782 do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0); 1783 } 1784 1785 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1786 uint32_t maxsz, TCGv_i64 in) 1787 { 1788 check_size_align(oprsz, maxsz, dofs); 1789 tcg_debug_assert(vece <= MO_64); 1790 do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0); 1791 } 1792 1793 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1794 uint32_t oprsz, uint32_t maxsz) 1795 { 1796 check_size_align(oprsz, maxsz, dofs); 1797 if (vece <= MO_64) { 1798 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1799 if (type != 0) { 1800 TCGv_vec t_vec = tcg_temp_new_vec(type); 1801 tcg_gen_dup_mem_vec(vece, t_vec, tcg_env, aofs); 1802 do_dup_store(type, tcg_env, dofs, oprsz, maxsz, t_vec); 1803 } else if (vece <= MO_32) { 1804 TCGv_i32 in = tcg_temp_ebb_new_i32(); 1805 switch (vece) { 1806 case MO_8: 1807 tcg_gen_ld8u_i32(in, tcg_env, aofs); 1808 break; 1809 case MO_16: 1810 tcg_gen_ld16u_i32(in, tcg_env, aofs); 1811 break; 1812 default: 1813 tcg_gen_ld_i32(in, tcg_env, aofs); 1814 break; 1815 } 1816 do_dup(vece, tcg_env, dofs, oprsz, maxsz, in, NULL, 0); 1817 tcg_temp_free_i32(in); 1818 } else { 1819 TCGv_i64 in = tcg_temp_ebb_new_i64(); 1820 tcg_gen_ld_i64(in, tcg_env, aofs); 1821 do_dup(vece, tcg_env, dofs, oprsz, maxsz, NULL, in, 0); 1822 tcg_temp_free_i64(in); 1823 } 1824 } else if (vece == 4) { 1825 /* 128-bit duplicate. */ 1826 int i; 1827 1828 tcg_debug_assert(oprsz >= 16); 1829 if (TCG_TARGET_HAS_v128) { 1830 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1831 1832 tcg_gen_ld_vec(in, tcg_env, aofs); 1833 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1834 tcg_gen_st_vec(in, tcg_env, dofs + i); 1835 } 1836 } else { 1837 TCGv_i64 in0 = tcg_temp_ebb_new_i64(); 1838 TCGv_i64 in1 = tcg_temp_ebb_new_i64(); 1839 1840 tcg_gen_ld_i64(in0, tcg_env, aofs); 1841 tcg_gen_ld_i64(in1, tcg_env, aofs + 8); 1842 for (i = (aofs == dofs) * 16; i < oprsz; i += 16) { 1843 tcg_gen_st_i64(in0, tcg_env, dofs + i); 1844 tcg_gen_st_i64(in1, tcg_env, dofs + i + 8); 1845 } 1846 tcg_temp_free_i64(in0); 1847 tcg_temp_free_i64(in1); 1848 } 1849 if (oprsz < maxsz) { 1850 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1851 } 1852 } else if (vece == 5) { 1853 /* 256-bit duplicate. 
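   Depending on host support this uses one V256 store, two V128 stores,
   or four i64 stores per 32 bytes of output.  When aofs == dofs the
   loop index starts at 32 (16 in the 128-bit case above), since the
   first copy of the element is already in place.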
*/ 1854 int i; 1855 1856 tcg_debug_assert(oprsz >= 32); 1857 tcg_debug_assert(oprsz % 32 == 0); 1858 if (TCG_TARGET_HAS_v256) { 1859 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256); 1860 1861 tcg_gen_ld_vec(in, tcg_env, aofs); 1862 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1863 tcg_gen_st_vec(in, tcg_env, dofs + i); 1864 } 1865 } else if (TCG_TARGET_HAS_v128) { 1866 TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128); 1867 TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128); 1868 1869 tcg_gen_ld_vec(in0, tcg_env, aofs); 1870 tcg_gen_ld_vec(in1, tcg_env, aofs + 16); 1871 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1872 tcg_gen_st_vec(in0, tcg_env, dofs + i); 1873 tcg_gen_st_vec(in1, tcg_env, dofs + i + 16); 1874 } 1875 } else { 1876 TCGv_i64 in[4]; 1877 int j; 1878 1879 for (j = 0; j < 4; ++j) { 1880 in[j] = tcg_temp_ebb_new_i64(); 1881 tcg_gen_ld_i64(in[j], tcg_env, aofs + j * 8); 1882 } 1883 for (i = (aofs == dofs) * 32; i < oprsz; i += 32) { 1884 for (j = 0; j < 4; ++j) { 1885 tcg_gen_st_i64(in[j], tcg_env, dofs + i + j * 8); 1886 } 1887 } 1888 for (j = 0; j < 4; ++j) { 1889 tcg_temp_free_i64(in[j]); 1890 } 1891 } 1892 if (oprsz < maxsz) { 1893 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 1894 } 1895 } else { 1896 g_assert_not_reached(); 1897 } 1898 } 1899 1900 void tcg_gen_gvec_dup_imm_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 1901 uint32_t oprsz, uint32_t maxsz, uint64_t x) 1902 { 1903 check_size_align(oprsz, maxsz, dofs); 1904 do_dup(vece, dbase, dofs, oprsz, maxsz, NULL, NULL, x); 1905 } 1906 1907 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1908 uint32_t maxsz, uint64_t x) 1909 { 1910 tcg_gen_gvec_dup_imm_var(vece, tcg_env, dofs, oprsz, maxsz, x); 1911 } 1912 1913 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1914 uint32_t oprsz, uint32_t maxsz) 1915 { 1916 static const GVecGen2 g = { 1917 .fni8 = tcg_gen_not_i64, 1918 .fniv = tcg_gen_not_vec, 1919 .fno = gen_helper_gvec_not, 1920 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1921 }; 1922 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1923 } 1924 1925 /* Perform a vector addition using normal addition and a mask. The mask 1926 should be the sign bit of each lane. This 6-operation form is more 1927 efficient than separate additions when there are 4 or more lanes in 1928 the 64-bit operation. 
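   The idea: clearing each lane's sign bit in both inputs guarantees that
   no carry can propagate out of a lane during the full-width add; the
   discarded sign bits are then added back, carry-free, as the XOR of the
   original sign bits.  For two 8-bit lanes held in 16 bits,
   0x00ff + 0x0001 gives 0x0000 rather than letting 0x0100 leak into the
   neighbouring lane.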
*/ 1929 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1930 { 1931 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1932 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1933 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 1934 1935 tcg_gen_andc_i64(t1, a, m); 1936 tcg_gen_andc_i64(t2, b, m); 1937 tcg_gen_xor_i64(t3, a, b); 1938 tcg_gen_add_i64(d, t1, t2); 1939 tcg_gen_and_i64(t3, t3, m); 1940 tcg_gen_xor_i64(d, d, t3); 1941 1942 tcg_temp_free_i64(t1); 1943 tcg_temp_free_i64(t2); 1944 tcg_temp_free_i64(t3); 1945 } 1946 1947 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1948 { 1949 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 1950 gen_addv_mask(d, a, b, m); 1951 } 1952 1953 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1954 { 1955 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 1956 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1957 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1958 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 1959 1960 tcg_gen_andc_i32(t1, a, m); 1961 tcg_gen_andc_i32(t2, b, m); 1962 tcg_gen_xor_i32(t3, a, b); 1963 tcg_gen_add_i32(d, t1, t2); 1964 tcg_gen_and_i32(t3, t3, m); 1965 tcg_gen_xor_i32(d, d, t3); 1966 1967 tcg_temp_free_i32(t1); 1968 tcg_temp_free_i32(t2); 1969 tcg_temp_free_i32(t3); 1970 } 1971 1972 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1973 { 1974 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 1975 gen_addv_mask(d, a, b, m); 1976 } 1977 1978 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1979 { 1980 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 1981 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 1982 1983 tcg_gen_andi_i32(t1, a, ~0xffff); 1984 tcg_gen_add_i32(t2, a, b); 1985 tcg_gen_add_i32(t1, t1, b); 1986 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 1987 1988 tcg_temp_free_i32(t1); 1989 tcg_temp_free_i32(t2); 1990 } 1991 1992 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1993 { 1994 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 1995 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 1996 1997 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1998 tcg_gen_add_i64(t2, a, b); 1999 tcg_gen_add_i64(t1, t1, b); 2000 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2001 2002 tcg_temp_free_i64(t1); 2003 tcg_temp_free_i64(t2); 2004 } 2005 2006 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 2007 2008 void tcg_gen_gvec_add_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 2009 TCGv_ptr abase, uint32_t aofs, 2010 TCGv_ptr bbase, uint32_t bofs, 2011 uint32_t oprsz, uint32_t maxsz) 2012 { 2013 static const GVecGen3 g[4] = { 2014 { .fni8 = tcg_gen_vec_add8_i64, 2015 .fniv = tcg_gen_add_vec, 2016 .fno = gen_helper_gvec_add8, 2017 .opt_opc = vecop_list_add, 2018 .vece = MO_8 }, 2019 { .fni8 = tcg_gen_vec_add16_i64, 2020 .fniv = tcg_gen_add_vec, 2021 .fno = gen_helper_gvec_add16, 2022 .opt_opc = vecop_list_add, 2023 .vece = MO_16 }, 2024 { .fni4 = tcg_gen_add_i32, 2025 .fniv = tcg_gen_add_vec, 2026 .fno = gen_helper_gvec_add32, 2027 .opt_opc = vecop_list_add, 2028 .vece = MO_32 }, 2029 { .fni8 = tcg_gen_add_i64, 2030 .fniv = tcg_gen_add_vec, 2031 .fno = gen_helper_gvec_add64, 2032 .opt_opc = vecop_list_add, 2033 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2034 .vece = MO_64 }, 2035 }; 2036 2037 tcg_debug_assert(vece <= MO_64); 2038 tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs, 2039 oprsz, maxsz, &g[vece]); 2040 } 2041 2042 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 2043 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2044 { 2045 tcg_gen_gvec_add_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, 
bofs, 2046 oprsz, maxsz); 2047 } 2048 2049 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 2050 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2051 { 2052 static const GVecGen2s g[4] = { 2053 { .fni8 = tcg_gen_vec_add8_i64, 2054 .fniv = tcg_gen_add_vec, 2055 .fno = gen_helper_gvec_adds8, 2056 .opt_opc = vecop_list_add, 2057 .vece = MO_8 }, 2058 { .fni8 = tcg_gen_vec_add16_i64, 2059 .fniv = tcg_gen_add_vec, 2060 .fno = gen_helper_gvec_adds16, 2061 .opt_opc = vecop_list_add, 2062 .vece = MO_16 }, 2063 { .fni4 = tcg_gen_add_i32, 2064 .fniv = tcg_gen_add_vec, 2065 .fno = gen_helper_gvec_adds32, 2066 .opt_opc = vecop_list_add, 2067 .vece = MO_32 }, 2068 { .fni8 = tcg_gen_add_i64, 2069 .fniv = tcg_gen_add_vec, 2070 .fno = gen_helper_gvec_adds64, 2071 .opt_opc = vecop_list_add, 2072 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2073 .vece = MO_64 }, 2074 }; 2075 2076 tcg_debug_assert(vece <= MO_64); 2077 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2078 } 2079 2080 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 2081 int64_t c, uint32_t oprsz, uint32_t maxsz) 2082 { 2083 TCGv_i64 tmp = tcg_constant_i64(c); 2084 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 2085 } 2086 2087 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 2088 2089 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 2090 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2091 { 2092 static const GVecGen2s g[4] = { 2093 { .fni8 = tcg_gen_vec_sub8_i64, 2094 .fniv = tcg_gen_sub_vec, 2095 .fno = gen_helper_gvec_subs8, 2096 .opt_opc = vecop_list_sub, 2097 .vece = MO_8 }, 2098 { .fni8 = tcg_gen_vec_sub16_i64, 2099 .fniv = tcg_gen_sub_vec, 2100 .fno = gen_helper_gvec_subs16, 2101 .opt_opc = vecop_list_sub, 2102 .vece = MO_16 }, 2103 { .fni4 = tcg_gen_sub_i32, 2104 .fniv = tcg_gen_sub_vec, 2105 .fno = gen_helper_gvec_subs32, 2106 .opt_opc = vecop_list_sub, 2107 .vece = MO_32 }, 2108 { .fni8 = tcg_gen_sub_i64, 2109 .fniv = tcg_gen_sub_vec, 2110 .fno = gen_helper_gvec_subs64, 2111 .opt_opc = vecop_list_sub, 2112 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2113 .vece = MO_64 }, 2114 }; 2115 2116 tcg_debug_assert(vece <= MO_64); 2117 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2118 } 2119 2120 /* Perform a vector subtraction using normal subtraction and a mask. 2121 Compare gen_addv_mask above. 
*/ 2122 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 2123 { 2124 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2125 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2126 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2127 2128 tcg_gen_or_i64(t1, a, m); 2129 tcg_gen_andc_i64(t2, b, m); 2130 tcg_gen_eqv_i64(t3, a, b); 2131 tcg_gen_sub_i64(d, t1, t2); 2132 tcg_gen_and_i64(t3, t3, m); 2133 tcg_gen_xor_i64(d, d, t3); 2134 2135 tcg_temp_free_i64(t1); 2136 tcg_temp_free_i64(t2); 2137 tcg_temp_free_i64(t3); 2138 } 2139 2140 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2141 { 2142 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2143 gen_subv_mask(d, a, b, m); 2144 } 2145 2146 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2147 { 2148 TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80)); 2149 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2150 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2151 TCGv_i32 t3 = tcg_temp_ebb_new_i32(); 2152 2153 tcg_gen_or_i32(t1, a, m); 2154 tcg_gen_andc_i32(t2, b, m); 2155 tcg_gen_eqv_i32(t3, a, b); 2156 tcg_gen_sub_i32(d, t1, t2); 2157 tcg_gen_and_i32(t3, t3, m); 2158 tcg_gen_xor_i32(d, d, t3); 2159 2160 tcg_temp_free_i32(t1); 2161 tcg_temp_free_i32(t2); 2162 tcg_temp_free_i32(t3); 2163 } 2164 2165 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2166 { 2167 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2168 gen_subv_mask(d, a, b, m); 2169 } 2170 2171 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2172 { 2173 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 2174 TCGv_i32 t2 = tcg_temp_ebb_new_i32(); 2175 2176 tcg_gen_andi_i32(t1, b, ~0xffff); 2177 tcg_gen_sub_i32(t2, a, b); 2178 tcg_gen_sub_i32(t1, a, t1); 2179 tcg_gen_deposit_i32(d, t1, t2, 0, 16); 2180 2181 tcg_temp_free_i32(t1); 2182 tcg_temp_free_i32(t2); 2183 } 2184 2185 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2186 { 2187 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2188 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2189 2190 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2191 tcg_gen_sub_i64(t2, a, b); 2192 tcg_gen_sub_i64(t1, a, t1); 2193 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2194 2195 tcg_temp_free_i64(t1); 2196 tcg_temp_free_i64(t2); 2197 } 2198 2199 void tcg_gen_gvec_sub_var(unsigned vece, TCGv_ptr dbase, uint32_t dofs, 2200 TCGv_ptr abase, uint32_t aofs, 2201 TCGv_ptr bbase, uint32_t bofs, 2202 uint32_t oprsz, uint32_t maxsz) 2203 { 2204 static const GVecGen3 g[4] = { 2205 { .fni8 = tcg_gen_vec_sub8_i64, 2206 .fniv = tcg_gen_sub_vec, 2207 .fno = gen_helper_gvec_sub8, 2208 .opt_opc = vecop_list_sub, 2209 .vece = MO_8 }, 2210 { .fni8 = tcg_gen_vec_sub16_i64, 2211 .fniv = tcg_gen_sub_vec, 2212 .fno = gen_helper_gvec_sub16, 2213 .opt_opc = vecop_list_sub, 2214 .vece = MO_16 }, 2215 { .fni4 = tcg_gen_sub_i32, 2216 .fniv = tcg_gen_sub_vec, 2217 .fno = gen_helper_gvec_sub32, 2218 .opt_opc = vecop_list_sub, 2219 .vece = MO_32 }, 2220 { .fni8 = tcg_gen_sub_i64, 2221 .fniv = tcg_gen_sub_vec, 2222 .fno = gen_helper_gvec_sub64, 2223 .opt_opc = vecop_list_sub, 2224 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2225 .vece = MO_64 }, 2226 }; 2227 2228 tcg_debug_assert(vece <= MO_64); 2229 tcg_gen_gvec_3_var(dbase, dofs, abase, aofs, bbase, bofs, 2230 oprsz, maxsz, &g[vece]); 2231 } 2232 2233 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 2234 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2235 { 2236 tcg_gen_gvec_sub_var(vece, tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, 2237 oprsz, maxsz); 2238 } 2239 2240 static const TCGOpcode 
vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 2241 2242 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 2243 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2244 { 2245 static const GVecGen3 g[4] = { 2246 { .fniv = tcg_gen_mul_vec, 2247 .fno = gen_helper_gvec_mul8, 2248 .opt_opc = vecop_list_mul, 2249 .vece = MO_8 }, 2250 { .fniv = tcg_gen_mul_vec, 2251 .fno = gen_helper_gvec_mul16, 2252 .opt_opc = vecop_list_mul, 2253 .vece = MO_16 }, 2254 { .fni4 = tcg_gen_mul_i32, 2255 .fniv = tcg_gen_mul_vec, 2256 .fno = gen_helper_gvec_mul32, 2257 .opt_opc = vecop_list_mul, 2258 .vece = MO_32 }, 2259 { .fni8 = tcg_gen_mul_i64, 2260 .fniv = tcg_gen_mul_vec, 2261 .fno = gen_helper_gvec_mul64, 2262 .opt_opc = vecop_list_mul, 2263 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2264 .vece = MO_64 }, 2265 }; 2266 2267 tcg_debug_assert(vece <= MO_64); 2268 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2269 } 2270 2271 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 2272 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2273 { 2274 static const GVecGen2s g[4] = { 2275 { .fniv = tcg_gen_mul_vec, 2276 .fno = gen_helper_gvec_muls8, 2277 .opt_opc = vecop_list_mul, 2278 .vece = MO_8 }, 2279 { .fniv = tcg_gen_mul_vec, 2280 .fno = gen_helper_gvec_muls16, 2281 .opt_opc = vecop_list_mul, 2282 .vece = MO_16 }, 2283 { .fni4 = tcg_gen_mul_i32, 2284 .fniv = tcg_gen_mul_vec, 2285 .fno = gen_helper_gvec_muls32, 2286 .opt_opc = vecop_list_mul, 2287 .vece = MO_32 }, 2288 { .fni8 = tcg_gen_mul_i64, 2289 .fniv = tcg_gen_mul_vec, 2290 .fno = gen_helper_gvec_muls64, 2291 .opt_opc = vecop_list_mul, 2292 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2293 .vece = MO_64 }, 2294 }; 2295 2296 tcg_debug_assert(vece <= MO_64); 2297 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 2298 } 2299 2300 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 2301 int64_t c, uint32_t oprsz, uint32_t maxsz) 2302 { 2303 TCGv_i64 tmp = tcg_constant_i64(c); 2304 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 2305 } 2306 2307 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2308 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2309 { 2310 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 2311 static const GVecGen3 g[4] = { 2312 { .fniv = tcg_gen_ssadd_vec, 2313 .fno = gen_helper_gvec_ssadd8, 2314 .opt_opc = vecop_list, 2315 .vece = MO_8 }, 2316 { .fniv = tcg_gen_ssadd_vec, 2317 .fno = gen_helper_gvec_ssadd16, 2318 .opt_opc = vecop_list, 2319 .vece = MO_16 }, 2320 { .fniv = tcg_gen_ssadd_vec, 2321 .fno = gen_helper_gvec_ssadd32, 2322 .opt_opc = vecop_list, 2323 .vece = MO_32 }, 2324 { .fniv = tcg_gen_ssadd_vec, 2325 .fno = gen_helper_gvec_ssadd64, 2326 .opt_opc = vecop_list, 2327 .vece = MO_64 }, 2328 }; 2329 tcg_debug_assert(vece <= MO_64); 2330 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2331 } 2332 2333 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2334 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2335 { 2336 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2337 static const GVecGen3 g[4] = { 2338 { .fniv = tcg_gen_sssub_vec, 2339 .fno = gen_helper_gvec_sssub8, 2340 .opt_opc = vecop_list, 2341 .vece = MO_8 }, 2342 { .fniv = tcg_gen_sssub_vec, 2343 .fno = gen_helper_gvec_sssub16, 2344 .opt_opc = vecop_list, 2345 .vece = MO_16 }, 2346 { .fniv = tcg_gen_sssub_vec, 2347 .fno = gen_helper_gvec_sssub32, 2348 .opt_opc = vecop_list, 2349 .vece = MO_32 }, 2350 { .fniv = tcg_gen_sssub_vec, 2351 .fno = 
gen_helper_gvec_sssub64, 2352 .opt_opc = vecop_list, 2353 .vece = MO_64 }, 2354 }; 2355 tcg_debug_assert(vece <= MO_64); 2356 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2357 } 2358 2359 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2360 { 2361 TCGv_i32 max = tcg_constant_i32(-1); 2362 tcg_gen_add_i32(d, a, b); 2363 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2364 } 2365 2366 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2367 { 2368 TCGv_i64 max = tcg_constant_i64(-1); 2369 tcg_gen_add_i64(d, a, b); 2370 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2371 } 2372 2373 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2374 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2375 { 2376 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2377 static const GVecGen3 g[4] = { 2378 { .fniv = tcg_gen_usadd_vec, 2379 .fno = gen_helper_gvec_usadd8, 2380 .opt_opc = vecop_list, 2381 .vece = MO_8 }, 2382 { .fniv = tcg_gen_usadd_vec, 2383 .fno = gen_helper_gvec_usadd16, 2384 .opt_opc = vecop_list, 2385 .vece = MO_16 }, 2386 { .fni4 = tcg_gen_usadd_i32, 2387 .fniv = tcg_gen_usadd_vec, 2388 .fno = gen_helper_gvec_usadd32, 2389 .opt_opc = vecop_list, 2390 .vece = MO_32 }, 2391 { .fni8 = tcg_gen_usadd_i64, 2392 .fniv = tcg_gen_usadd_vec, 2393 .fno = gen_helper_gvec_usadd64, 2394 .opt_opc = vecop_list, 2395 .vece = MO_64 } 2396 }; 2397 tcg_debug_assert(vece <= MO_64); 2398 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2399 } 2400 2401 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2402 { 2403 TCGv_i32 min = tcg_constant_i32(0); 2404 tcg_gen_sub_i32(d, a, b); 2405 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2406 } 2407 2408 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2409 { 2410 TCGv_i64 min = tcg_constant_i64(0); 2411 tcg_gen_sub_i64(d, a, b); 2412 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2413 } 2414 2415 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2416 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2417 { 2418 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2419 static const GVecGen3 g[4] = { 2420 { .fniv = tcg_gen_ussub_vec, 2421 .fno = gen_helper_gvec_ussub8, 2422 .opt_opc = vecop_list, 2423 .vece = MO_8 }, 2424 { .fniv = tcg_gen_ussub_vec, 2425 .fno = gen_helper_gvec_ussub16, 2426 .opt_opc = vecop_list, 2427 .vece = MO_16 }, 2428 { .fni4 = tcg_gen_ussub_i32, 2429 .fniv = tcg_gen_ussub_vec, 2430 .fno = gen_helper_gvec_ussub32, 2431 .opt_opc = vecop_list, 2432 .vece = MO_32 }, 2433 { .fni8 = tcg_gen_ussub_i64, 2434 .fniv = tcg_gen_ussub_vec, 2435 .fno = gen_helper_gvec_ussub64, 2436 .opt_opc = vecop_list, 2437 .vece = MO_64 } 2438 }; 2439 tcg_debug_assert(vece <= MO_64); 2440 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2441 } 2442 2443 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2444 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2445 { 2446 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2447 static const GVecGen3 g[4] = { 2448 { .fniv = tcg_gen_smin_vec, 2449 .fno = gen_helper_gvec_smin8, 2450 .opt_opc = vecop_list, 2451 .vece = MO_8 }, 2452 { .fniv = tcg_gen_smin_vec, 2453 .fno = gen_helper_gvec_smin16, 2454 .opt_opc = vecop_list, 2455 .vece = MO_16 }, 2456 { .fni4 = tcg_gen_smin_i32, 2457 .fniv = tcg_gen_smin_vec, 2458 .fno = gen_helper_gvec_smin32, 2459 .opt_opc = vecop_list, 2460 .vece = MO_32 }, 2461 { .fni8 = tcg_gen_smin_i64, 2462 .fniv = 
tcg_gen_smin_vec, 2463 .fno = gen_helper_gvec_smin64, 2464 .opt_opc = vecop_list, 2465 .vece = MO_64 } 2466 }; 2467 tcg_debug_assert(vece <= MO_64); 2468 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2469 } 2470 2471 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2472 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2473 { 2474 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2475 static const GVecGen3 g[4] = { 2476 { .fniv = tcg_gen_umin_vec, 2477 .fno = gen_helper_gvec_umin8, 2478 .opt_opc = vecop_list, 2479 .vece = MO_8 }, 2480 { .fniv = tcg_gen_umin_vec, 2481 .fno = gen_helper_gvec_umin16, 2482 .opt_opc = vecop_list, 2483 .vece = MO_16 }, 2484 { .fni4 = tcg_gen_umin_i32, 2485 .fniv = tcg_gen_umin_vec, 2486 .fno = gen_helper_gvec_umin32, 2487 .opt_opc = vecop_list, 2488 .vece = MO_32 }, 2489 { .fni8 = tcg_gen_umin_i64, 2490 .fniv = tcg_gen_umin_vec, 2491 .fno = gen_helper_gvec_umin64, 2492 .opt_opc = vecop_list, 2493 .vece = MO_64 } 2494 }; 2495 tcg_debug_assert(vece <= MO_64); 2496 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2497 } 2498 2499 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2500 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2501 { 2502 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2503 static const GVecGen3 g[4] = { 2504 { .fniv = tcg_gen_smax_vec, 2505 .fno = gen_helper_gvec_smax8, 2506 .opt_opc = vecop_list, 2507 .vece = MO_8 }, 2508 { .fniv = tcg_gen_smax_vec, 2509 .fno = gen_helper_gvec_smax16, 2510 .opt_opc = vecop_list, 2511 .vece = MO_16 }, 2512 { .fni4 = tcg_gen_smax_i32, 2513 .fniv = tcg_gen_smax_vec, 2514 .fno = gen_helper_gvec_smax32, 2515 .opt_opc = vecop_list, 2516 .vece = MO_32 }, 2517 { .fni8 = tcg_gen_smax_i64, 2518 .fniv = tcg_gen_smax_vec, 2519 .fno = gen_helper_gvec_smax64, 2520 .opt_opc = vecop_list, 2521 .vece = MO_64 } 2522 }; 2523 tcg_debug_assert(vece <= MO_64); 2524 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2525 } 2526 2527 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2528 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2529 { 2530 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2531 static const GVecGen3 g[4] = { 2532 { .fniv = tcg_gen_umax_vec, 2533 .fno = gen_helper_gvec_umax8, 2534 .opt_opc = vecop_list, 2535 .vece = MO_8 }, 2536 { .fniv = tcg_gen_umax_vec, 2537 .fno = gen_helper_gvec_umax16, 2538 .opt_opc = vecop_list, 2539 .vece = MO_16 }, 2540 { .fni4 = tcg_gen_umax_i32, 2541 .fniv = tcg_gen_umax_vec, 2542 .fno = gen_helper_gvec_umax32, 2543 .opt_opc = vecop_list, 2544 .vece = MO_32 }, 2545 { .fni8 = tcg_gen_umax_i64, 2546 .fniv = tcg_gen_umax_vec, 2547 .fno = gen_helper_gvec_umax64, 2548 .opt_opc = vecop_list, 2549 .vece = MO_64 } 2550 }; 2551 tcg_debug_assert(vece <= MO_64); 2552 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2553 } 2554 2555 /* Perform a vector negation using normal negation and a mask. 2556 Compare gen_subv_mask above. 
*/ 2557 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2558 { 2559 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2560 TCGv_i64 t3 = tcg_temp_ebb_new_i64(); 2561 2562 tcg_gen_andc_i64(t3, m, b); 2563 tcg_gen_andc_i64(t2, b, m); 2564 tcg_gen_sub_i64(d, m, t2); 2565 tcg_gen_xor_i64(d, d, t3); 2566 2567 tcg_temp_free_i64(t2); 2568 tcg_temp_free_i64(t3); 2569 } 2570 2571 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2572 { 2573 TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80)); 2574 gen_negv_mask(d, b, m); 2575 } 2576 2577 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2578 { 2579 TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000)); 2580 gen_negv_mask(d, b, m); 2581 } 2582 2583 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2584 { 2585 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 2586 TCGv_i64 t2 = tcg_temp_ebb_new_i64(); 2587 2588 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2589 tcg_gen_neg_i64(t2, b); 2590 tcg_gen_neg_i64(t1, t1); 2591 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2592 2593 tcg_temp_free_i64(t1); 2594 tcg_temp_free_i64(t2); 2595 } 2596 2597 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2598 uint32_t oprsz, uint32_t maxsz) 2599 { 2600 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2601 static const GVecGen2 g[4] = { 2602 { .fni8 = tcg_gen_vec_neg8_i64, 2603 .fniv = tcg_gen_neg_vec, 2604 .fno = gen_helper_gvec_neg8, 2605 .opt_opc = vecop_list, 2606 .vece = MO_8 }, 2607 { .fni8 = tcg_gen_vec_neg16_i64, 2608 .fniv = tcg_gen_neg_vec, 2609 .fno = gen_helper_gvec_neg16, 2610 .opt_opc = vecop_list, 2611 .vece = MO_16 }, 2612 { .fni4 = tcg_gen_neg_i32, 2613 .fniv = tcg_gen_neg_vec, 2614 .fno = gen_helper_gvec_neg32, 2615 .opt_opc = vecop_list, 2616 .vece = MO_32 }, 2617 { .fni8 = tcg_gen_neg_i64, 2618 .fniv = tcg_gen_neg_vec, 2619 .fno = gen_helper_gvec_neg64, 2620 .opt_opc = vecop_list, 2621 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2622 .vece = MO_64 }, 2623 }; 2624 2625 tcg_debug_assert(vece <= MO_64); 2626 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2627 } 2628 2629 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2630 { 2631 TCGv_i64 t = tcg_temp_ebb_new_i64(); 2632 int nbit = 8 << vece; 2633 2634 /* Create -1 for each negative element. */ 2635 tcg_gen_shri_i64(t, b, nbit - 1); 2636 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2637 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2638 2639 /* 2640 * Invert (via xor -1) and add one. 2641 * Because of the ordering the msb is cleared, 2642 * so we never have carry into the next element. 
2643 */ 2644 tcg_gen_xor_i64(d, b, t); 2645 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2646 tcg_gen_add_i64(d, d, t); 2647 2648 tcg_temp_free_i64(t); 2649 } 2650 2651 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2652 { 2653 gen_absv_mask(d, b, MO_8); 2654 } 2655 2656 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2657 { 2658 gen_absv_mask(d, b, MO_16); 2659 } 2660 2661 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2662 uint32_t oprsz, uint32_t maxsz) 2663 { 2664 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2665 static const GVecGen2 g[4] = { 2666 { .fni8 = tcg_gen_vec_abs8_i64, 2667 .fniv = tcg_gen_abs_vec, 2668 .fno = gen_helper_gvec_abs8, 2669 .opt_opc = vecop_list, 2670 .vece = MO_8 }, 2671 { .fni8 = tcg_gen_vec_abs16_i64, 2672 .fniv = tcg_gen_abs_vec, 2673 .fno = gen_helper_gvec_abs16, 2674 .opt_opc = vecop_list, 2675 .vece = MO_16 }, 2676 { .fni4 = tcg_gen_abs_i32, 2677 .fniv = tcg_gen_abs_vec, 2678 .fno = gen_helper_gvec_abs32, 2679 .opt_opc = vecop_list, 2680 .vece = MO_32 }, 2681 { .fni8 = tcg_gen_abs_i64, 2682 .fniv = tcg_gen_abs_vec, 2683 .fno = gen_helper_gvec_abs64, 2684 .opt_opc = vecop_list, 2685 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2686 .vece = MO_64 }, 2687 }; 2688 2689 tcg_debug_assert(vece <= MO_64); 2690 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2691 } 2692 2693 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2694 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2695 { 2696 static const GVecGen3 g = { 2697 .fni8 = tcg_gen_and_i64, 2698 .fniv = tcg_gen_and_vec, 2699 .fno = gen_helper_gvec_and, 2700 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2701 }; 2702 2703 if (aofs == bofs) { 2704 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2705 } else { 2706 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2707 } 2708 } 2709 2710 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2711 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2712 { 2713 static const GVecGen3 g = { 2714 .fni8 = tcg_gen_or_i64, 2715 .fniv = tcg_gen_or_vec, 2716 .fno = gen_helper_gvec_or, 2717 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2718 }; 2719 2720 if (aofs == bofs) { 2721 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2722 } else { 2723 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2724 } 2725 } 2726 2727 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2728 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2729 { 2730 static const GVecGen3 g = { 2731 .fni8 = tcg_gen_xor_i64, 2732 .fniv = tcg_gen_xor_vec, 2733 .fno = gen_helper_gvec_xor, 2734 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2735 }; 2736 2737 if (aofs == bofs) { 2738 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2739 } else { 2740 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2741 } 2742 } 2743 2744 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2745 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2746 { 2747 static const GVecGen3 g = { 2748 .fni8 = tcg_gen_andc_i64, 2749 .fniv = tcg_gen_andc_vec, 2750 .fno = gen_helper_gvec_andc, 2751 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2752 }; 2753 2754 if (aofs == bofs) { 2755 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2756 } else { 2757 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2758 } 2759 } 2760 2761 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2762 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2763 { 2764 static const GVecGen3 g = { 2765 .fni8 = tcg_gen_orc_i64, 2766 .fniv = tcg_gen_orc_vec, 2767 
.fno = gen_helper_gvec_orc, 2768 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2769 }; 2770 2771 if (aofs == bofs) { 2772 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2773 } else { 2774 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2775 } 2776 } 2777 2778 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2779 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2780 { 2781 static const GVecGen3 g = { 2782 .fni8 = tcg_gen_nand_i64, 2783 .fniv = tcg_gen_nand_vec, 2784 .fno = gen_helper_gvec_nand, 2785 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2786 }; 2787 2788 if (aofs == bofs) { 2789 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2790 } else { 2791 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2792 } 2793 } 2794 2795 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2796 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2797 { 2798 static const GVecGen3 g = { 2799 .fni8 = tcg_gen_nor_i64, 2800 .fniv = tcg_gen_nor_vec, 2801 .fno = gen_helper_gvec_nor, 2802 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2803 }; 2804 2805 if (aofs == bofs) { 2806 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2807 } else { 2808 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2809 } 2810 } 2811 2812 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2813 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2814 { 2815 static const GVecGen3 g = { 2816 .fni8 = tcg_gen_eqv_i64, 2817 .fniv = tcg_gen_eqv_vec, 2818 .fno = gen_helper_gvec_eqv, 2819 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2820 }; 2821 2822 if (aofs == bofs) { 2823 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2824 } else { 2825 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2826 } 2827 } 2828 2829 static const GVecGen2s gop_ands = { 2830 .fni8 = tcg_gen_and_i64, 2831 .fniv = tcg_gen_and_vec, 2832 .fno = gen_helper_gvec_ands, 2833 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2834 .vece = MO_64 2835 }; 2836 2837 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2838 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2839 { 2840 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2841 tcg_gen_dup_i64(vece, tmp, c); 2842 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2843 tcg_temp_free_i64(tmp); 2844 } 2845 2846 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2847 int64_t c, uint32_t oprsz, uint32_t maxsz) 2848 { 2849 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2850 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2851 } 2852 2853 void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs, 2854 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2855 { 2856 static GVecGen2s g = { 2857 .fni8 = tcg_gen_andc_i64, 2858 .fniv = tcg_gen_andc_vec, 2859 .fno = gen_helper_gvec_andcs, 2860 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2861 .vece = MO_64 2862 }; 2863 2864 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2865 tcg_gen_dup_i64(vece, tmp, c); 2866 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &g); 2867 tcg_temp_free_i64(tmp); 2868 } 2869 2870 static const GVecGen2s gop_xors = { 2871 .fni8 = tcg_gen_xor_i64, 2872 .fniv = tcg_gen_xor_vec, 2873 .fno = gen_helper_gvec_xors, 2874 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2875 .vece = MO_64 2876 }; 2877 2878 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2879 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2880 { 2881 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2882 tcg_gen_dup_i64(vece, tmp, c); 2883 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2884 tcg_temp_free_i64(tmp); 2885 } 2886 
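/*
 * Illustrative sketch (not part of the build): how a front end might
 * use the replicated-immediate forms above.  The wrapper name and the
 * 32-byte operand size are hypothetical; only tcg_gen_gvec_andi() and
 * dup_const() are real.  Clearing the sign bit of every 16-bit lane
 * of a 32-byte register slot:
 *
 *     static void gen_clear_sign16(uint32_t dofs)
 *     {
 *         tcg_gen_gvec_andi(MO_16, dofs, dofs, 0x7fff, 32, 32);
 *     }
 *
 * dup_const(MO_16, 0x7fff) replicates the immediate to
 * 0x7fff7fff7fff7fff before the shared gop_ands expansion runs, so a
 * single 64-bit constant serves every lane width.
 */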
2887 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2888 int64_t c, uint32_t oprsz, uint32_t maxsz) 2889 { 2890 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2891 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2892 } 2893 2894 static const GVecGen2s gop_ors = { 2895 .fni8 = tcg_gen_or_i64, 2896 .fniv = tcg_gen_or_vec, 2897 .fno = gen_helper_gvec_ors, 2898 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2899 .vece = MO_64 2900 }; 2901 2902 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2903 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2904 { 2905 TCGv_i64 tmp = tcg_temp_ebb_new_i64(); 2906 tcg_gen_dup_i64(vece, tmp, c); 2907 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2908 tcg_temp_free_i64(tmp); 2909 } 2910 2911 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2912 int64_t c, uint32_t oprsz, uint32_t maxsz) 2913 { 2914 TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c)); 2915 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2916 } 2917 2918 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2919 { 2920 uint64_t mask = dup_const(MO_8, 0xff << c); 2921 tcg_gen_shli_i64(d, a, c); 2922 tcg_gen_andi_i64(d, d, mask); 2923 } 2924 2925 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2926 { 2927 uint64_t mask = dup_const(MO_16, 0xffff << c); 2928 tcg_gen_shli_i64(d, a, c); 2929 tcg_gen_andi_i64(d, d, mask); 2930 } 2931 2932 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2933 { 2934 uint32_t mask = dup_const(MO_8, 0xff << c); 2935 tcg_gen_shli_i32(d, a, c); 2936 tcg_gen_andi_i32(d, d, mask); 2937 } 2938 2939 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2940 { 2941 uint32_t mask = dup_const(MO_16, 0xffff << c); 2942 tcg_gen_shli_i32(d, a, c); 2943 tcg_gen_andi_i32(d, d, mask); 2944 } 2945 2946 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2947 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2948 { 2949 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2950 static const GVecGen2i g[4] = { 2951 { .fni8 = tcg_gen_vec_shl8i_i64, 2952 .fniv = tcg_gen_shli_vec, 2953 .fno = gen_helper_gvec_shl8i, 2954 .opt_opc = vecop_list, 2955 .vece = MO_8 }, 2956 { .fni8 = tcg_gen_vec_shl16i_i64, 2957 .fniv = tcg_gen_shli_vec, 2958 .fno = gen_helper_gvec_shl16i, 2959 .opt_opc = vecop_list, 2960 .vece = MO_16 }, 2961 { .fni4 = tcg_gen_shli_i32, 2962 .fniv = tcg_gen_shli_vec, 2963 .fno = gen_helper_gvec_shl32i, 2964 .opt_opc = vecop_list, 2965 .vece = MO_32 }, 2966 { .fni8 = tcg_gen_shli_i64, 2967 .fniv = tcg_gen_shli_vec, 2968 .fno = gen_helper_gvec_shl64i, 2969 .opt_opc = vecop_list, 2970 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2971 .vece = MO_64 }, 2972 }; 2973 2974 tcg_debug_assert(vece <= MO_64); 2975 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2976 if (shift == 0) { 2977 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2978 } else { 2979 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2980 } 2981 } 2982 2983 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2984 { 2985 uint64_t mask = dup_const(MO_8, 0xff >> c); 2986 tcg_gen_shri_i64(d, a, c); 2987 tcg_gen_andi_i64(d, d, mask); 2988 } 2989 2990 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2991 { 2992 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2993 tcg_gen_shri_i64(d, a, c); 2994 tcg_gen_andi_i64(d, d, mask); 2995 } 2996 2997 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 2998 { 2999 uint32_t mask = 
dup_const(MO_8, 0xff >> c); 3000 tcg_gen_shri_i32(d, a, c); 3001 tcg_gen_andi_i32(d, d, mask); 3002 } 3003 3004 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 3005 { 3006 uint32_t mask = dup_const(MO_16, 0xffff >> c); 3007 tcg_gen_shri_i32(d, a, c); 3008 tcg_gen_andi_i32(d, d, mask); 3009 } 3010 3011 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 3012 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3013 { 3014 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 3015 static const GVecGen2i g[4] = { 3016 { .fni8 = tcg_gen_vec_shr8i_i64, 3017 .fniv = tcg_gen_shri_vec, 3018 .fno = gen_helper_gvec_shr8i, 3019 .opt_opc = vecop_list, 3020 .vece = MO_8 }, 3021 { .fni8 = tcg_gen_vec_shr16i_i64, 3022 .fniv = tcg_gen_shri_vec, 3023 .fno = gen_helper_gvec_shr16i, 3024 .opt_opc = vecop_list, 3025 .vece = MO_16 }, 3026 { .fni4 = tcg_gen_shri_i32, 3027 .fniv = tcg_gen_shri_vec, 3028 .fno = gen_helper_gvec_shr32i, 3029 .opt_opc = vecop_list, 3030 .vece = MO_32 }, 3031 { .fni8 = tcg_gen_shri_i64, 3032 .fniv = tcg_gen_shri_vec, 3033 .fno = gen_helper_gvec_shr64i, 3034 .opt_opc = vecop_list, 3035 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3036 .vece = MO_64 }, 3037 }; 3038 3039 tcg_debug_assert(vece <= MO_64); 3040 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3041 if (shift == 0) { 3042 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3043 } else { 3044 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3045 } 3046 } 3047 3048 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3049 { 3050 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 3051 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 3052 TCGv_i64 s = tcg_temp_ebb_new_i64(); 3053 3054 tcg_gen_shri_i64(d, a, c); 3055 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 3056 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 3057 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 3058 tcg_gen_or_i64(d, d, s); /* include sign extension */ 3059 tcg_temp_free_i64(s); 3060 } 3061 3062 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3063 { 3064 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 3065 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 3066 TCGv_i64 s = tcg_temp_ebb_new_i64(); 3067 3068 tcg_gen_shri_i64(d, a, c); 3069 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 3070 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 3071 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 3072 tcg_gen_or_i64(d, d, s); /* include sign extension */ 3073 tcg_temp_free_i64(s); 3074 } 3075 3076 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 3077 { 3078 uint32_t s_mask = dup_const(MO_8, 0x80 >> c); 3079 uint32_t c_mask = dup_const(MO_8, 0xff >> c); 3080 TCGv_i32 s = tcg_temp_ebb_new_i32(); 3081 3082 tcg_gen_shri_i32(d, a, c); 3083 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 3084 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 3085 tcg_gen_andi_i32(d, d, c_mask); /* clear out bits above sign */ 3086 tcg_gen_or_i32(d, d, s); /* include sign extension */ 3087 tcg_temp_free_i32(s); 3088 } 3089 3090 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c) 3091 { 3092 uint32_t s_mask = dup_const(MO_16, 0x8000 >> c); 3093 uint32_t c_mask = dup_const(MO_16, 0xffff >> c); 3094 TCGv_i32 s = tcg_temp_ebb_new_i32(); 3095 3096 tcg_gen_shri_i32(d, a, c); 3097 tcg_gen_andi_i32(s, d, s_mask); /* isolate (shifted) sign bit */ 3098 tcg_gen_andi_i32(d, 
d, c_mask); /* clear out bits above sign */ 3099 tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */ 3100 tcg_gen_or_i32(d, d, s); /* include sign extension */ 3101 tcg_temp_free_i32(s); 3102 } 3103 3104 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 3105 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3106 { 3107 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 3108 static const GVecGen2i g[4] = { 3109 { .fni8 = tcg_gen_vec_sar8i_i64, 3110 .fniv = tcg_gen_sari_vec, 3111 .fno = gen_helper_gvec_sar8i, 3112 .opt_opc = vecop_list, 3113 .vece = MO_8 }, 3114 { .fni8 = tcg_gen_vec_sar16i_i64, 3115 .fniv = tcg_gen_sari_vec, 3116 .fno = gen_helper_gvec_sar16i, 3117 .opt_opc = vecop_list, 3118 .vece = MO_16 }, 3119 { .fni4 = tcg_gen_sari_i32, 3120 .fniv = tcg_gen_sari_vec, 3121 .fno = gen_helper_gvec_sar32i, 3122 .opt_opc = vecop_list, 3123 .vece = MO_32 }, 3124 { .fni8 = tcg_gen_sari_i64, 3125 .fniv = tcg_gen_sari_vec, 3126 .fno = gen_helper_gvec_sar64i, 3127 .opt_opc = vecop_list, 3128 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3129 .vece = MO_64 }, 3130 }; 3131 3132 tcg_debug_assert(vece <= MO_64); 3133 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3134 if (shift == 0) { 3135 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3136 } else { 3137 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3138 } 3139 } 3140 3141 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3142 { 3143 uint64_t mask = dup_const(MO_8, 0xff << c); 3144 3145 tcg_gen_shli_i64(d, a, c); 3146 tcg_gen_shri_i64(a, a, 8 - c); 3147 tcg_gen_andi_i64(d, d, mask); 3148 tcg_gen_andi_i64(a, a, ~mask); 3149 tcg_gen_or_i64(d, d, a); 3150 } 3151 3152 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 3153 { 3154 uint64_t mask = dup_const(MO_16, 0xffff << c); 3155 3156 tcg_gen_shli_i64(d, a, c); 3157 tcg_gen_shri_i64(a, a, 16 - c); 3158 tcg_gen_andi_i64(d, d, mask); 3159 tcg_gen_andi_i64(a, a, ~mask); 3160 tcg_gen_or_i64(d, d, a); 3161 } 3162 3163 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 3164 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3165 { 3166 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 3167 static const GVecGen2i g[4] = { 3168 { .fni8 = tcg_gen_vec_rotl8i_i64, 3169 .fniv = tcg_gen_rotli_vec, 3170 .fno = gen_helper_gvec_rotl8i, 3171 .opt_opc = vecop_list, 3172 .vece = MO_8 }, 3173 { .fni8 = tcg_gen_vec_rotl16i_i64, 3174 .fniv = tcg_gen_rotli_vec, 3175 .fno = gen_helper_gvec_rotl16i, 3176 .opt_opc = vecop_list, 3177 .vece = MO_16 }, 3178 { .fni4 = tcg_gen_rotli_i32, 3179 .fniv = tcg_gen_rotli_vec, 3180 .fno = gen_helper_gvec_rotl32i, 3181 .opt_opc = vecop_list, 3182 .vece = MO_32 }, 3183 { .fni8 = tcg_gen_rotli_i64, 3184 .fniv = tcg_gen_rotli_vec, 3185 .fno = gen_helper_gvec_rotl64i, 3186 .opt_opc = vecop_list, 3187 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3188 .vece = MO_64 }, 3189 }; 3190 3191 tcg_debug_assert(vece <= MO_64); 3192 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3193 if (shift == 0) { 3194 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 3195 } else { 3196 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 3197 } 3198 } 3199 3200 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 3201 int64_t shift, uint32_t oprsz, uint32_t maxsz) 3202 { 3203 tcg_debug_assert(vece <= MO_64); 3204 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 3205 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 3206 oprsz, maxsz); 3207 } 3208 3209 /* 3210 
* Specialized generation vector shifts by a non-constant scalar. 3211 */ 3212 3213 typedef struct { 3214 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 3215 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 3216 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 3217 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 3218 gen_helper_gvec_2 *fno[4]; 3219 TCGOpcode s_list[2]; 3220 TCGOpcode v_list[2]; 3221 } GVecGen2sh; 3222 3223 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3224 uint32_t oprsz, uint32_t tysz, TCGType type, 3225 TCGv_i32 shift, 3226 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 3227 { 3228 for (uint32_t i = 0; i < oprsz; i += tysz) { 3229 TCGv_vec t0 = tcg_temp_new_vec(type); 3230 TCGv_vec t1 = tcg_temp_new_vec(type); 3231 3232 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3233 fni(vece, t1, t0, shift); 3234 tcg_gen_st_vec(t1, tcg_env, dofs + i); 3235 } 3236 } 3237 3238 static void 3239 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 3240 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 3241 { 3242 TCGType type; 3243 uint32_t some; 3244 3245 check_size_align(oprsz, maxsz, dofs | aofs); 3246 check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz); 3247 3248 /* If the backend has a scalar expansion, great. */ 3249 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 3250 if (type) { 3251 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3252 switch (type) { 3253 case TCG_TYPE_V256: 3254 some = QEMU_ALIGN_DOWN(oprsz, 32); 3255 expand_2sh_vec(vece, dofs, aofs, some, 32, 3256 TCG_TYPE_V256, shift, g->fniv_s); 3257 if (some == oprsz) { 3258 break; 3259 } 3260 dofs += some; 3261 aofs += some; 3262 oprsz -= some; 3263 maxsz -= some; 3264 /* fallthru */ 3265 case TCG_TYPE_V128: 3266 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 3267 TCG_TYPE_V128, shift, g->fniv_s); 3268 break; 3269 case TCG_TYPE_V64: 3270 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 3271 TCG_TYPE_V64, shift, g->fniv_s); 3272 break; 3273 default: 3274 g_assert_not_reached(); 3275 } 3276 tcg_swap_vecop_list(hold_list); 3277 goto clear_tail; 3278 } 3279 3280 /* If the backend supports variable vector shifts, also cool. */ 3281 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 3282 if (type) { 3283 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 3284 TCGv_vec v_shift = tcg_temp_new_vec(type); 3285 3286 if (vece == MO_64) { 3287 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3288 tcg_gen_extu_i32_i64(sh64, shift); 3289 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 3290 tcg_temp_free_i64(sh64); 3291 } else { 3292 tcg_gen_dup_i32_vec(vece, v_shift, shift); 3293 } 3294 3295 switch (type) { 3296 case TCG_TYPE_V256: 3297 some = QEMU_ALIGN_DOWN(oprsz, 32); 3298 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 3299 v_shift, false, g->fniv_v); 3300 if (some == oprsz) { 3301 break; 3302 } 3303 dofs += some; 3304 aofs += some; 3305 oprsz -= some; 3306 maxsz -= some; 3307 /* fallthru */ 3308 case TCG_TYPE_V128: 3309 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 3310 v_shift, false, g->fniv_v); 3311 break; 3312 case TCG_TYPE_V64: 3313 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 3314 v_shift, false, g->fniv_v); 3315 break; 3316 default: 3317 g_assert_not_reached(); 3318 } 3319 tcg_temp_free_vec(v_shift); 3320 tcg_swap_vecop_list(hold_list); 3321 goto clear_tail; 3322 } 3323 3324 /* Otherwise fall back to integral... 
*/ 3325 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3326 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 3327 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3328 TCGv_i64 sh64 = tcg_temp_ebb_new_i64(); 3329 tcg_gen_extu_i32_i64(sh64, shift); 3330 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 3331 tcg_temp_free_i64(sh64); 3332 } else { 3333 TCGv_ptr a0 = tcg_temp_ebb_new_ptr(); 3334 TCGv_ptr a1 = tcg_temp_ebb_new_ptr(); 3335 TCGv_i32 desc = tcg_temp_ebb_new_i32(); 3336 3337 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 3338 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 3339 tcg_gen_addi_ptr(a0, tcg_env, dofs); 3340 tcg_gen_addi_ptr(a1, tcg_env, aofs); 3341 3342 g->fno[vece](a0, a1, desc); 3343 3344 tcg_temp_free_ptr(a0); 3345 tcg_temp_free_ptr(a1); 3346 tcg_temp_free_i32(desc); 3347 return; 3348 } 3349 3350 clear_tail: 3351 if (oprsz < maxsz) { 3352 expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz); 3353 } 3354 } 3355 3356 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 3357 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3358 { 3359 static const GVecGen2sh g = { 3360 .fni4 = tcg_gen_shl_i32, 3361 .fni8 = tcg_gen_shl_i64, 3362 .fniv_s = tcg_gen_shls_vec, 3363 .fniv_v = tcg_gen_shlv_vec, 3364 .fno = { 3365 gen_helper_gvec_shl8i, 3366 gen_helper_gvec_shl16i, 3367 gen_helper_gvec_shl32i, 3368 gen_helper_gvec_shl64i, 3369 }, 3370 .s_list = { INDEX_op_shls_vec, 0 }, 3371 .v_list = { INDEX_op_shlv_vec, 0 }, 3372 }; 3373 3374 tcg_debug_assert(vece <= MO_64); 3375 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3376 } 3377 3378 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 3379 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3380 { 3381 static const GVecGen2sh g = { 3382 .fni4 = tcg_gen_shr_i32, 3383 .fni8 = tcg_gen_shr_i64, 3384 .fniv_s = tcg_gen_shrs_vec, 3385 .fniv_v = tcg_gen_shrv_vec, 3386 .fno = { 3387 gen_helper_gvec_shr8i, 3388 gen_helper_gvec_shr16i, 3389 gen_helper_gvec_shr32i, 3390 gen_helper_gvec_shr64i, 3391 }, 3392 .s_list = { INDEX_op_shrs_vec, 0 }, 3393 .v_list = { INDEX_op_shrv_vec, 0 }, 3394 }; 3395 3396 tcg_debug_assert(vece <= MO_64); 3397 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3398 } 3399 3400 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 3401 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3402 { 3403 static const GVecGen2sh g = { 3404 .fni4 = tcg_gen_sar_i32, 3405 .fni8 = tcg_gen_sar_i64, 3406 .fniv_s = tcg_gen_sars_vec, 3407 .fniv_v = tcg_gen_sarv_vec, 3408 .fno = { 3409 gen_helper_gvec_sar8i, 3410 gen_helper_gvec_sar16i, 3411 gen_helper_gvec_sar32i, 3412 gen_helper_gvec_sar64i, 3413 }, 3414 .s_list = { INDEX_op_sars_vec, 0 }, 3415 .v_list = { INDEX_op_sarv_vec, 0 }, 3416 }; 3417 3418 tcg_debug_assert(vece <= MO_64); 3419 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3420 } 3421 3422 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs, 3423 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 3424 { 3425 static const GVecGen2sh g = { 3426 .fni4 = tcg_gen_rotl_i32, 3427 .fni8 = tcg_gen_rotl_i64, 3428 .fniv_s = tcg_gen_rotls_vec, 3429 .fniv_v = tcg_gen_rotlv_vec, 3430 .fno = { 3431 gen_helper_gvec_rotl8i, 3432 gen_helper_gvec_rotl16i, 3433 gen_helper_gvec_rotl32i, 3434 gen_helper_gvec_rotl64i, 3435 }, 3436 .s_list = { INDEX_op_rotls_vec, 0 }, 3437 .v_list = { INDEX_op_rotlv_vec, 0 }, 3438 }; 3439 3440 tcg_debug_assert(vece <= MO_64); 3441 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3442 } 3443 3444 
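/*
 * Illustrative sketch (not part of the build): shifting every 32-bit
 * lane of a 16-byte operand left by an amount held in a TCGv_i32.
 * The wrapper name and the fixed 16-byte size are hypothetical; the
 * masking mirrors what tcg_gen_gvec_rotrs() below does for rotates.
 * do_gvec_shifts() then picks, in order, a native shift-by-scalar op,
 * a shift-by-vector op with the scalar broadcast, an integral
 * expansion, or the out-of-line helper.
 *
 *     static void gen_example_shls32(uint32_t dofs, uint32_t aofs,
 *                                    TCGv_i32 amount)
 *     {
 *         TCGv_i32 t = tcg_temp_ebb_new_i32();
 *
 *         tcg_gen_andi_i32(t, amount, 31);
 *         tcg_gen_gvec_shls(MO_32, dofs, aofs, t, 16, 16);
 *         tcg_temp_free_i32(t);
 *     }
 */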
void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i32 tmp = tcg_temp_ebb_new_i32();

    tcg_gen_neg_i32(tmp, shift);
    tcg_gen_andi_i32(tmp, tmp, (8 << vece) - 1);
    tcg_gen_gvec_rotls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i32(tmp);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end to
 * include the modulo as part of the expansion, vector shifts fold the
 * modulo in here.  If the target naturally includes the modulo as
 * part of the operation, great!  If the target has some other
 * behaviour for out-of-range shifts, then it could not use this
 * function anyway, and would need to do its own expansion with
 * custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);

    tcg_gen_and_vec(vece, t, b, m);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_ebb_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3526 */ 3527 3528 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3529 TCGv_vec a, TCGv_vec b) 3530 { 3531 TCGv_vec t = tcg_temp_new_vec_matching(d); 3532 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3533 3534 tcg_gen_and_vec(vece, t, b, m); 3535 tcg_gen_shrv_vec(vece, d, a, t); 3536 tcg_temp_free_vec(t); 3537 } 3538 3539 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3540 { 3541 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3542 3543 tcg_gen_andi_i32(t, b, 31); 3544 tcg_gen_shr_i32(d, a, t); 3545 tcg_temp_free_i32(t); 3546 } 3547 3548 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3549 { 3550 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3551 3552 tcg_gen_andi_i64(t, b, 63); 3553 tcg_gen_shr_i64(d, a, t); 3554 tcg_temp_free_i64(t); 3555 } 3556 3557 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3558 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3559 { 3560 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3561 static const GVecGen3 g[4] = { 3562 { .fniv = tcg_gen_shrv_mod_vec, 3563 .fno = gen_helper_gvec_shr8v, 3564 .opt_opc = vecop_list, 3565 .vece = MO_8 }, 3566 { .fniv = tcg_gen_shrv_mod_vec, 3567 .fno = gen_helper_gvec_shr16v, 3568 .opt_opc = vecop_list, 3569 .vece = MO_16 }, 3570 { .fni4 = tcg_gen_shr_mod_i32, 3571 .fniv = tcg_gen_shrv_mod_vec, 3572 .fno = gen_helper_gvec_shr32v, 3573 .opt_opc = vecop_list, 3574 .vece = MO_32 }, 3575 { .fni8 = tcg_gen_shr_mod_i64, 3576 .fniv = tcg_gen_shrv_mod_vec, 3577 .fno = gen_helper_gvec_shr64v, 3578 .opt_opc = vecop_list, 3579 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3580 .vece = MO_64 }, 3581 }; 3582 3583 tcg_debug_assert(vece <= MO_64); 3584 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3585 } 3586 3587 /* 3588 * Similarly for arithmetic right shifts. 
3589 */ 3590 3591 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3592 TCGv_vec a, TCGv_vec b) 3593 { 3594 TCGv_vec t = tcg_temp_new_vec_matching(d); 3595 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3596 3597 tcg_gen_and_vec(vece, t, b, m); 3598 tcg_gen_sarv_vec(vece, d, a, t); 3599 tcg_temp_free_vec(t); 3600 } 3601 3602 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3603 { 3604 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3605 3606 tcg_gen_andi_i32(t, b, 31); 3607 tcg_gen_sar_i32(d, a, t); 3608 tcg_temp_free_i32(t); 3609 } 3610 3611 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3612 { 3613 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3614 3615 tcg_gen_andi_i64(t, b, 63); 3616 tcg_gen_sar_i64(d, a, t); 3617 tcg_temp_free_i64(t); 3618 } 3619 3620 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3621 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3622 { 3623 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3624 static const GVecGen3 g[4] = { 3625 { .fniv = tcg_gen_sarv_mod_vec, 3626 .fno = gen_helper_gvec_sar8v, 3627 .opt_opc = vecop_list, 3628 .vece = MO_8 }, 3629 { .fniv = tcg_gen_sarv_mod_vec, 3630 .fno = gen_helper_gvec_sar16v, 3631 .opt_opc = vecop_list, 3632 .vece = MO_16 }, 3633 { .fni4 = tcg_gen_sar_mod_i32, 3634 .fniv = tcg_gen_sarv_mod_vec, 3635 .fno = gen_helper_gvec_sar32v, 3636 .opt_opc = vecop_list, 3637 .vece = MO_32 }, 3638 { .fni8 = tcg_gen_sar_mod_i64, 3639 .fniv = tcg_gen_sarv_mod_vec, 3640 .fno = gen_helper_gvec_sar64v, 3641 .opt_opc = vecop_list, 3642 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3643 .vece = MO_64 }, 3644 }; 3645 3646 tcg_debug_assert(vece <= MO_64); 3647 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3648 } 3649 3650 /* 3651 * Similarly for rotates. 
3652 */ 3653 3654 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3655 TCGv_vec a, TCGv_vec b) 3656 { 3657 TCGv_vec t = tcg_temp_new_vec_matching(d); 3658 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3659 3660 tcg_gen_and_vec(vece, t, b, m); 3661 tcg_gen_rotlv_vec(vece, d, a, t); 3662 tcg_temp_free_vec(t); 3663 } 3664 3665 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3666 { 3667 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3668 3669 tcg_gen_andi_i32(t, b, 31); 3670 tcg_gen_rotl_i32(d, a, t); 3671 tcg_temp_free_i32(t); 3672 } 3673 3674 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3675 { 3676 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3677 3678 tcg_gen_andi_i64(t, b, 63); 3679 tcg_gen_rotl_i64(d, a, t); 3680 tcg_temp_free_i64(t); 3681 } 3682 3683 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3684 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3685 { 3686 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3687 static const GVecGen3 g[4] = { 3688 { .fniv = tcg_gen_rotlv_mod_vec, 3689 .fno = gen_helper_gvec_rotl8v, 3690 .opt_opc = vecop_list, 3691 .vece = MO_8 }, 3692 { .fniv = tcg_gen_rotlv_mod_vec, 3693 .fno = gen_helper_gvec_rotl16v, 3694 .opt_opc = vecop_list, 3695 .vece = MO_16 }, 3696 { .fni4 = tcg_gen_rotl_mod_i32, 3697 .fniv = tcg_gen_rotlv_mod_vec, 3698 .fno = gen_helper_gvec_rotl32v, 3699 .opt_opc = vecop_list, 3700 .vece = MO_32 }, 3701 { .fni8 = tcg_gen_rotl_mod_i64, 3702 .fniv = tcg_gen_rotlv_mod_vec, 3703 .fno = gen_helper_gvec_rotl64v, 3704 .opt_opc = vecop_list, 3705 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3706 .vece = MO_64 }, 3707 }; 3708 3709 tcg_debug_assert(vece <= MO_64); 3710 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3711 } 3712 3713 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3714 TCGv_vec a, TCGv_vec b) 3715 { 3716 TCGv_vec t = tcg_temp_new_vec_matching(d); 3717 TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1); 3718 3719 tcg_gen_and_vec(vece, t, b, m); 3720 tcg_gen_rotrv_vec(vece, d, a, t); 3721 tcg_temp_free_vec(t); 3722 } 3723 3724 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3725 { 3726 TCGv_i32 t = tcg_temp_ebb_new_i32(); 3727 3728 tcg_gen_andi_i32(t, b, 31); 3729 tcg_gen_rotr_i32(d, a, t); 3730 tcg_temp_free_i32(t); 3731 } 3732 3733 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3734 { 3735 TCGv_i64 t = tcg_temp_ebb_new_i64(); 3736 3737 tcg_gen_andi_i64(t, b, 63); 3738 tcg_gen_rotr_i64(d, a, t); 3739 tcg_temp_free_i64(t); 3740 } 3741 3742 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3743 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3744 { 3745 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3746 static const GVecGen3 g[4] = { 3747 { .fniv = tcg_gen_rotrv_mod_vec, 3748 .fno = gen_helper_gvec_rotr8v, 3749 .opt_opc = vecop_list, 3750 .vece = MO_8 }, 3751 { .fniv = tcg_gen_rotrv_mod_vec, 3752 .fno = gen_helper_gvec_rotr16v, 3753 .opt_opc = vecop_list, 3754 .vece = MO_16 }, 3755 { .fni4 = tcg_gen_rotr_mod_i32, 3756 .fniv = tcg_gen_rotrv_mod_vec, 3757 .fno = gen_helper_gvec_rotr32v, 3758 .opt_opc = vecop_list, 3759 .vece = MO_32 }, 3760 { .fni8 = tcg_gen_rotr_mod_i64, 3761 .fniv = tcg_gen_rotrv_mod_vec, 3762 .fno = gen_helper_gvec_rotr64v, 3763 .opt_opc = vecop_list, 3764 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3765 .vece = MO_64 }, 3766 }; 3767 3768 tcg_debug_assert(vece <= MO_64); 3769 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, 
maxsz, &g[vece]); 3770 } 3771 3772 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 3773 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3774 uint32_t oprsz, TCGCond cond) 3775 { 3776 TCGv_i32 t0 = tcg_temp_ebb_new_i32(); 3777 TCGv_i32 t1 = tcg_temp_ebb_new_i32(); 3778 uint32_t i; 3779 3780 for (i = 0; i < oprsz; i += 4) { 3781 tcg_gen_ld_i32(t0, tcg_env, aofs + i); 3782 tcg_gen_ld_i32(t1, tcg_env, bofs + i); 3783 tcg_gen_negsetcond_i32(cond, t0, t0, t1); 3784 tcg_gen_st_i32(t0, tcg_env, dofs + i); 3785 } 3786 tcg_temp_free_i32(t1); 3787 tcg_temp_free_i32(t0); 3788 } 3789 3790 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3791 uint32_t oprsz, TCGCond cond) 3792 { 3793 TCGv_i64 t0 = tcg_temp_ebb_new_i64(); 3794 TCGv_i64 t1 = tcg_temp_ebb_new_i64(); 3795 uint32_t i; 3796 3797 for (i = 0; i < oprsz; i += 8) { 3798 tcg_gen_ld_i64(t0, tcg_env, aofs + i); 3799 tcg_gen_ld_i64(t1, tcg_env, bofs + i); 3800 tcg_gen_negsetcond_i64(cond, t0, t0, t1); 3801 tcg_gen_st_i64(t0, tcg_env, dofs + i); 3802 } 3803 tcg_temp_free_i64(t1); 3804 tcg_temp_free_i64(t0); 3805 } 3806 3807 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3808 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3809 TCGType type, TCGCond cond) 3810 { 3811 for (uint32_t i = 0; i < oprsz; i += tysz) { 3812 TCGv_vec t0 = tcg_temp_new_vec(type); 3813 TCGv_vec t1 = tcg_temp_new_vec(type); 3814 TCGv_vec t2 = tcg_temp_new_vec(type); 3815 3816 tcg_gen_ld_vec(t0, tcg_env, aofs + i); 3817 tcg_gen_ld_vec(t1, tcg_env, bofs + i); 3818 tcg_gen_cmp_vec(cond, vece, t2, t0, t1); 3819 tcg_gen_st_vec(t2, tcg_env, dofs + i); 3820 } 3821 } 3822 3823 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3824 uint32_t aofs, uint32_t bofs, 3825 uint32_t oprsz, uint32_t maxsz) 3826 { 3827 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3828 static gen_helper_gvec_3 * const eq_fn[4] = { 3829 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3830 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3831 }; 3832 static gen_helper_gvec_3 * const ne_fn[4] = { 3833 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3834 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3835 }; 3836 static gen_helper_gvec_3 * const lt_fn[4] = { 3837 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3838 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3839 }; 3840 static gen_helper_gvec_3 * const le_fn[4] = { 3841 gen_helper_gvec_le8, gen_helper_gvec_le16, 3842 gen_helper_gvec_le32, gen_helper_gvec_le64 3843 }; 3844 static gen_helper_gvec_3 * const ltu_fn[4] = { 3845 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3846 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3847 }; 3848 static gen_helper_gvec_3 * const leu_fn[4] = { 3849 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3850 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3851 }; 3852 static gen_helper_gvec_3 * const * const fns[16] = { 3853 [TCG_COND_EQ] = eq_fn, 3854 [TCG_COND_NE] = ne_fn, 3855 [TCG_COND_LT] = lt_fn, 3856 [TCG_COND_LE] = le_fn, 3857 [TCG_COND_LTU] = ltu_fn, 3858 [TCG_COND_LEU] = leu_fn, 3859 }; 3860 3861 const TCGOpcode *hold_list; 3862 TCGType type; 3863 uint32_t some; 3864 3865 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3866 check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz); 3867 3868 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3869 do_dup(MO_8, tcg_env, dofs, oprsz, maxsz, 3870 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3871 return; 3872 } 3873 3874 /* 3875 * Implement inline with a vector type, if 
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(tcg_env, dofs, tcg_env, aofs, tcg_env, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}
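
/*
 * Usage sketch (illustrative only; the offsets are hypothetical): an
 * element-wise unsigned "less than" over two 256-byte registers, writing
 * an all-ones/all-zeroes byte mask to the destination, would be
 *
 *     tcg_gen_gvec_cmp(TCG_COND_LTU, MO_8, d_ofs, a_ofs, b_ofs, 256, 256);
 *
 * On a host with 256-bit vectors this expands to eight 32-byte steps via
 * the V256 case; an SVE-style oprsz that is only a multiple of 16 takes
 * as many V256 steps as fit and falls through to V128 for the tail.
 */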
static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                            uint32_t oprsz, uint32_t tysz, TCGType type,
                            TCGCond cond, TCGv_vec c)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, tcg_env, aofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t1, c);
        tcg_gen_st_vec(t0, tcg_env, dofs + i);
    }
}

void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
                       uint32_t aofs, TCGv_i64 c,
                       uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_2i * const eq_fn[4] = {
        gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
        gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
    };
    static gen_helper_gvec_2i * const lt_fn[4] = {
        gen_helper_gvec_lts8, gen_helper_gvec_lts16,
        gen_helper_gvec_lts32, gen_helper_gvec_lts64
    };
    static gen_helper_gvec_2i * const le_fn[4] = {
        gen_helper_gvec_les8, gen_helper_gvec_les16,
        gen_helper_gvec_les32, gen_helper_gvec_les64
    };
    static gen_helper_gvec_2i * const ltu_fn[4] = {
        gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
        gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
    };
    static gen_helper_gvec_2i * const leu_fn[4] = {
        gen_helper_gvec_leus8, gen_helper_gvec_leus16,
        gen_helper_gvec_leus32, gen_helper_gvec_leus64
    };
    static gen_helper_gvec_2i * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(tcg_env, dofs, tcg_env, aofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, tcg_env, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    if (type != 0) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(vece, t_vec, c);
        switch (type) {
        case TCG_TYPE_V256:
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_cmps_vec(vece, dofs, aofs, some, 32,
                            TCG_TYPE_V256, cond, t_vec);
            aofs += some;
            dofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            some = QEMU_ALIGN_DOWN(oprsz, 16);
            expand_cmps_vec(vece, dofs, aofs, some, 16,
                            TCG_TYPE_V128, cond, t_vec);
            break;

        case TCG_TYPE_V64:
            some = QEMU_ALIGN_DOWN(oprsz, 8);
            expand_cmps_vec(vece, dofs, aofs, some, 8,
                            TCG_TYPE_V64, cond, t_vec);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t0 = tcg_temp_ebb_new_i64();
        uint32_t i;

        for (i = 0; i < oprsz; i += 8) {
            tcg_gen_ld_i64(t0, tcg_env, aofs + i);
            tcg_gen_negsetcond_i64(cond, t0, t0, c);
            tcg_gen_st_i64(t0, tcg_env, dofs + i);
        }
        tcg_temp_free_i64(t0);
    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t0 = tcg_temp_ebb_new_i32();
        TCGv_i32 t1 = tcg_temp_ebb_new_i32();
        uint32_t i;

        tcg_gen_extrl_i64_i32(t1, c);
        for (i = 0; i < oprsz; i += 4) {
            tcg_gen_ld_i32(t0, tcg_env, aofs + i);
            tcg_gen_negsetcond_i32(cond, t0, t0, t1);
            tcg_gen_st_i32(t0, tcg_env, dofs + i);
        }
        tcg_temp_free_i32(t0);
        tcg_temp_free_i32(t1);
    } else {
        gen_helper_gvec_2i * const *fn = fns[cond];
        bool inv = false;

        if (fn == NULL) {
            cond = tcg_invert_cond(cond);
            fn = fns[cond];
            assert(fn != NULL);
            inv = true;
        }
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(tcg_env, dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
                       uint32_t aofs, int64_t c,
                       uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_constant_i64(c);
    tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
}

static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
    TCGv_i64 t = tcg_temp_ebb_new_i64();

    tcg_gen_and_i64(t, b, a);
    tcg_gen_andc_i64(d, c, a);
    tcg_gen_or_i64(d, d, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs,
                         uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen4 g = {
        .fni8 = tcg_gen_bitsel_i64,
        .fniv = tcg_gen_bitsel_vec,
        .fno = gen_helper_gvec_bitsel,
    };

    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
}
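
/*
 * tcg_gen_bitsel_i64 above computes d = (b & a) | (c & ~a): each result
 * bit is taken from b where the corresponding bit of the selector a is
 * set, and from c where it is clear.  The vece argument is not used by
 * the expansion, as the operation is purely bit-wise.  Usage sketch
 * (illustrative only, hypothetical offsets):
 *
 *     tcg_gen_gvec_bitsel(MO_64, d_ofs, sel_ofs, b_ofs, c_ofs, 16, 16);
 */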