1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg/tcg.h" 22 #include "tcg/tcg-op.h" 23 #include "tcg/tcg-op-gvec.h" 24 #include "qemu/main-loop.h" 25 #include "tcg/tcg-gvec-desc.h" 26 27 #define MAX_UNROLL 4 28 29 #ifdef CONFIG_DEBUG_TCG 30 static const TCGOpcode vecop_list_empty[1] = { 0 }; 31 #else 32 #define vecop_list_empty NULL 33 #endif 34 35 36 /* Verify vector size and alignment rules. OFS should be the OR of all 37 of the operand offsets so that we can check them all at once. */ 38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 39 { 40 uint32_t opr_align = oprsz >= 16 ? 15 : 7; 41 uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; 42 tcg_debug_assert(oprsz > 0); 43 tcg_debug_assert(oprsz <= maxsz); 44 tcg_debug_assert((oprsz & opr_align) == 0); 45 tcg_debug_assert((maxsz & max_align) == 0); 46 tcg_debug_assert((ofs & max_align) == 0); 47 } 48 49 /* Verify vector overlap rules for two operands. */ 50 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 51 { 52 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 53 } 54 55 /* Verify vector overlap rules for three operands. */ 56 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 57 { 58 check_overlap_2(d, a, s); 59 check_overlap_2(d, b, s); 60 check_overlap_2(a, b, s); 61 } 62 63 /* Verify vector overlap rules for four operands. */ 64 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 65 uint32_t c, uint32_t s) 66 { 67 check_overlap_2(d, a, s); 68 check_overlap_2(d, b, s); 69 check_overlap_2(d, c, s); 70 check_overlap_2(a, b, s); 71 check_overlap_2(a, c, s); 72 check_overlap_2(b, c, s); 73 } 74 75 /* Create a descriptor from components. */ 76 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 77 { 78 uint32_t desc = 0; 79 80 assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS)); 81 assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS)); 82 assert(data == sextract32(data, 0, SIMD_DATA_BITS)); 83 84 oprsz = (oprsz / 8) - 1; 85 maxsz = (maxsz / 8) - 1; 86 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 87 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 88 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 89 90 return desc; 91 } 92 93 /* Generate a call to a gvec-style helper with two vector operands. 
*/ 94 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 95 uint32_t oprsz, uint32_t maxsz, int32_t data, 96 gen_helper_gvec_2 *fn) 97 { 98 TCGv_ptr a0, a1; 99 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 100 101 a0 = tcg_temp_new_ptr(); 102 a1 = tcg_temp_new_ptr(); 103 104 tcg_gen_addi_ptr(a0, cpu_env, dofs); 105 tcg_gen_addi_ptr(a1, cpu_env, aofs); 106 107 fn(a0, a1, desc); 108 109 tcg_temp_free_ptr(a0); 110 tcg_temp_free_ptr(a1); 111 tcg_temp_free_i32(desc); 112 } 113 114 /* Generate a call to a gvec-style helper with two vector operands 115 and one scalar operand. */ 116 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 117 uint32_t oprsz, uint32_t maxsz, int32_t data, 118 gen_helper_gvec_2i *fn) 119 { 120 TCGv_ptr a0, a1; 121 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 122 123 a0 = tcg_temp_new_ptr(); 124 a1 = tcg_temp_new_ptr(); 125 126 tcg_gen_addi_ptr(a0, cpu_env, dofs); 127 tcg_gen_addi_ptr(a1, cpu_env, aofs); 128 129 fn(a0, a1, c, desc); 130 131 tcg_temp_free_ptr(a0); 132 tcg_temp_free_ptr(a1); 133 tcg_temp_free_i32(desc); 134 } 135 136 /* Generate a call to a gvec-style helper with three vector operands. */ 137 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 138 uint32_t oprsz, uint32_t maxsz, int32_t data, 139 gen_helper_gvec_3 *fn) 140 { 141 TCGv_ptr a0, a1, a2; 142 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 143 144 a0 = tcg_temp_new_ptr(); 145 a1 = tcg_temp_new_ptr(); 146 a2 = tcg_temp_new_ptr(); 147 148 tcg_gen_addi_ptr(a0, cpu_env, dofs); 149 tcg_gen_addi_ptr(a1, cpu_env, aofs); 150 tcg_gen_addi_ptr(a2, cpu_env, bofs); 151 152 fn(a0, a1, a2, desc); 153 154 tcg_temp_free_ptr(a0); 155 tcg_temp_free_ptr(a1); 156 tcg_temp_free_ptr(a2); 157 tcg_temp_free_i32(desc); 158 } 159 160 /* Generate a call to a gvec-style helper with four vector operands. */ 161 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 162 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 163 int32_t data, gen_helper_gvec_4 *fn) 164 { 165 TCGv_ptr a0, a1, a2, a3; 166 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 167 168 a0 = tcg_temp_new_ptr(); 169 a1 = tcg_temp_new_ptr(); 170 a2 = tcg_temp_new_ptr(); 171 a3 = tcg_temp_new_ptr(); 172 173 tcg_gen_addi_ptr(a0, cpu_env, dofs); 174 tcg_gen_addi_ptr(a1, cpu_env, aofs); 175 tcg_gen_addi_ptr(a2, cpu_env, bofs); 176 tcg_gen_addi_ptr(a3, cpu_env, cofs); 177 178 fn(a0, a1, a2, a3, desc); 179 180 tcg_temp_free_ptr(a0); 181 tcg_temp_free_ptr(a1); 182 tcg_temp_free_ptr(a2); 183 tcg_temp_free_ptr(a3); 184 tcg_temp_free_i32(desc); 185 } 186 187 /* Generate a call to a gvec-style helper with five vector operands. 
*/ 188 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 189 uint32_t cofs, uint32_t xofs, uint32_t oprsz, 190 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) 191 { 192 TCGv_ptr a0, a1, a2, a3, a4; 193 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 194 195 a0 = tcg_temp_new_ptr(); 196 a1 = tcg_temp_new_ptr(); 197 a2 = tcg_temp_new_ptr(); 198 a3 = tcg_temp_new_ptr(); 199 a4 = tcg_temp_new_ptr(); 200 201 tcg_gen_addi_ptr(a0, cpu_env, dofs); 202 tcg_gen_addi_ptr(a1, cpu_env, aofs); 203 tcg_gen_addi_ptr(a2, cpu_env, bofs); 204 tcg_gen_addi_ptr(a3, cpu_env, cofs); 205 tcg_gen_addi_ptr(a4, cpu_env, xofs); 206 207 fn(a0, a1, a2, a3, a4, desc); 208 209 tcg_temp_free_ptr(a0); 210 tcg_temp_free_ptr(a1); 211 tcg_temp_free_ptr(a2); 212 tcg_temp_free_ptr(a3); 213 tcg_temp_free_ptr(a4); 214 tcg_temp_free_i32(desc); 215 } 216 217 /* Generate a call to a gvec-style helper with three vector operands 218 and an extra pointer operand. */ 219 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, 220 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 221 int32_t data, gen_helper_gvec_2_ptr *fn) 222 { 223 TCGv_ptr a0, a1; 224 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 225 226 a0 = tcg_temp_new_ptr(); 227 a1 = tcg_temp_new_ptr(); 228 229 tcg_gen_addi_ptr(a0, cpu_env, dofs); 230 tcg_gen_addi_ptr(a1, cpu_env, aofs); 231 232 fn(a0, a1, ptr, desc); 233 234 tcg_temp_free_ptr(a0); 235 tcg_temp_free_ptr(a1); 236 tcg_temp_free_i32(desc); 237 } 238 239 /* Generate a call to a gvec-style helper with three vector operands 240 and an extra pointer operand. */ 241 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 242 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 243 int32_t data, gen_helper_gvec_3_ptr *fn) 244 { 245 TCGv_ptr a0, a1, a2; 246 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 247 248 a0 = tcg_temp_new_ptr(); 249 a1 = tcg_temp_new_ptr(); 250 a2 = tcg_temp_new_ptr(); 251 252 tcg_gen_addi_ptr(a0, cpu_env, dofs); 253 tcg_gen_addi_ptr(a1, cpu_env, aofs); 254 tcg_gen_addi_ptr(a2, cpu_env, bofs); 255 256 fn(a0, a1, a2, ptr, desc); 257 258 tcg_temp_free_ptr(a0); 259 tcg_temp_free_ptr(a1); 260 tcg_temp_free_ptr(a2); 261 tcg_temp_free_i32(desc); 262 } 263 264 /* Generate a call to a gvec-style helper with four vector operands 265 and an extra pointer operand. */ 266 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 267 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, 268 uint32_t maxsz, int32_t data, 269 gen_helper_gvec_4_ptr *fn) 270 { 271 TCGv_ptr a0, a1, a2, a3; 272 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 273 274 a0 = tcg_temp_new_ptr(); 275 a1 = tcg_temp_new_ptr(); 276 a2 = tcg_temp_new_ptr(); 277 a3 = tcg_temp_new_ptr(); 278 279 tcg_gen_addi_ptr(a0, cpu_env, dofs); 280 tcg_gen_addi_ptr(a1, cpu_env, aofs); 281 tcg_gen_addi_ptr(a2, cpu_env, bofs); 282 tcg_gen_addi_ptr(a3, cpu_env, cofs); 283 284 fn(a0, a1, a2, a3, ptr, desc); 285 286 tcg_temp_free_ptr(a0); 287 tcg_temp_free_ptr(a1); 288 tcg_temp_free_ptr(a2); 289 tcg_temp_free_ptr(a3); 290 tcg_temp_free_i32(desc); 291 } 292 293 /* Generate a call to a gvec-style helper with five vector operands 294 and an extra pointer operand. 
*/ 295 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 296 uint32_t cofs, uint32_t eofs, TCGv_ptr ptr, 297 uint32_t oprsz, uint32_t maxsz, int32_t data, 298 gen_helper_gvec_5_ptr *fn) 299 { 300 TCGv_ptr a0, a1, a2, a3, a4; 301 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 302 303 a0 = tcg_temp_new_ptr(); 304 a1 = tcg_temp_new_ptr(); 305 a2 = tcg_temp_new_ptr(); 306 a3 = tcg_temp_new_ptr(); 307 a4 = tcg_temp_new_ptr(); 308 309 tcg_gen_addi_ptr(a0, cpu_env, dofs); 310 tcg_gen_addi_ptr(a1, cpu_env, aofs); 311 tcg_gen_addi_ptr(a2, cpu_env, bofs); 312 tcg_gen_addi_ptr(a3, cpu_env, cofs); 313 tcg_gen_addi_ptr(a4, cpu_env, eofs); 314 315 fn(a0, a1, a2, a3, a4, ptr, desc); 316 317 tcg_temp_free_ptr(a0); 318 tcg_temp_free_ptr(a1); 319 tcg_temp_free_ptr(a2); 320 tcg_temp_free_ptr(a3); 321 tcg_temp_free_ptr(a4); 322 tcg_temp_free_i32(desc); 323 } 324 325 /* Return true if we want to implement something of OPRSZ bytes 326 in units of LNSZ. This limits the expansion of inline code. */ 327 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) 328 { 329 uint32_t q, r; 330 331 if (oprsz < lnsz) { 332 return false; 333 } 334 335 q = oprsz / lnsz; 336 r = oprsz % lnsz; 337 tcg_debug_assert((r & 7) == 0); 338 339 if (lnsz < 16) { 340 /* For sizes below 16, accept no remainder. */ 341 if (r != 0) { 342 return false; 343 } 344 } else { 345 /* 346 * Recall that ARM SVE allows vector sizes that are not a 347 * power of 2, but always a multiple of 16. The intent is 348 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 349 * In addition, expand_clr needs to handle a multiple of 8. 350 * Thus we can handle the tail with one more operation per 351 * diminishing power of 2. 352 */ 353 q += ctpop32(r); 354 } 355 356 return q <= MAX_UNROLL; 357 } 358 359 static void expand_clr(uint32_t dofs, uint32_t maxsz); 360 361 /* Duplicate C as per VECE. */ 362 uint64_t (dup_const)(unsigned vece, uint64_t c) 363 { 364 switch (vece) { 365 case MO_8: 366 return 0x0101010101010101ull * (uint8_t)c; 367 case MO_16: 368 return 0x0001000100010001ull * (uint16_t)c; 369 case MO_32: 370 return 0x0000000100000001ull * (uint32_t)c; 371 case MO_64: 372 return c; 373 default: 374 g_assert_not_reached(); 375 } 376 } 377 378 /* Duplicate IN into OUT as per VECE. */ 379 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) 380 { 381 switch (vece) { 382 case MO_8: 383 tcg_gen_ext8u_i32(out, in); 384 tcg_gen_muli_i32(out, out, 0x01010101); 385 break; 386 case MO_16: 387 tcg_gen_deposit_i32(out, in, in, 16, 16); 388 break; 389 case MO_32: 390 tcg_gen_mov_i32(out, in); 391 break; 392 default: 393 g_assert_not_reached(); 394 } 395 } 396 397 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) 398 { 399 switch (vece) { 400 case MO_8: 401 tcg_gen_ext8u_i64(out, in); 402 tcg_gen_muli_i64(out, out, 0x0101010101010101ull); 403 break; 404 case MO_16: 405 tcg_gen_ext16u_i64(out, in); 406 tcg_gen_muli_i64(out, out, 0x0001000100010001ull); 407 break; 408 case MO_32: 409 tcg_gen_deposit_i64(out, in, in, 32, 32); 410 break; 411 case MO_64: 412 tcg_gen_mov_i64(out, in); 413 break; 414 default: 415 g_assert_not_reached(); 416 } 417 } 418 419 /* Select a supported vector type for implementing an operation on SIZE 420 * bytes. If OP is 0, assume that the real operation to be performed is 421 * required by all backends. Otherwise, make sure than OP can be performed 422 * on elements of size VECE in the selected type. Do not select V64 if 423 * PREFER_I64 is true. 
Return 0 if no vector type is selected. 424 */ 425 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 426 uint32_t size, bool prefer_i64) 427 { 428 /* 429 * Recall that ARM SVE allows vector sizes that are not a 430 * power of 2, but always a multiple of 16. The intent is 431 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 432 * It is hard to imagine a case in which v256 is supported 433 * but v128 is not, but check anyway. 434 * In addition, expand_clr needs to handle a multiple of 8. 435 */ 436 if (TCG_TARGET_HAS_v256 && 437 check_size_impl(size, 32) && 438 tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) && 439 (!(size & 16) || 440 (TCG_TARGET_HAS_v128 && 441 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) && 442 (!(size & 8) || 443 (TCG_TARGET_HAS_v64 && 444 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 445 return TCG_TYPE_V256; 446 } 447 if (TCG_TARGET_HAS_v128 && 448 check_size_impl(size, 16) && 449 tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) && 450 (!(size & 8) || 451 (TCG_TARGET_HAS_v64 && 452 tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) { 453 return TCG_TYPE_V128; 454 } 455 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 456 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 457 return TCG_TYPE_V64; 458 } 459 return 0; 460 } 461 462 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 463 uint32_t maxsz, TCGv_vec t_vec) 464 { 465 uint32_t i = 0; 466 467 tcg_debug_assert(oprsz >= 8); 468 469 /* 470 * This may be expand_clr for the tail of an operation, e.g. 471 * oprsz == 8 && maxsz == 64. The first 8 bytes of this store 472 * are misaligned wrt the maximum vector size, so do that first. 473 */ 474 if (dofs & 8) { 475 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 476 i += 8; 477 } 478 479 switch (type) { 480 case TCG_TYPE_V256: 481 /* 482 * Recall that ARM SVE allows vector sizes that are not a 483 * power of 2, but always a multiple of 16. The intent is 484 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 485 */ 486 for (; i + 32 <= oprsz; i += 32) { 487 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 488 } 489 /* fallthru */ 490 case TCG_TYPE_V128: 491 for (; i + 16 <= oprsz; i += 16) { 492 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 493 } 494 break; 495 case TCG_TYPE_V64: 496 for (; i < oprsz; i += 8) { 497 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 498 } 499 break; 500 default: 501 g_assert_not_reached(); 502 } 503 504 if (oprsz < maxsz) { 505 expand_clr(dofs + oprsz, maxsz - oprsz); 506 } 507 } 508 509 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 510 * Only one of IN_32 or IN_64 may be set; 511 * IN_C is used if IN_32 and IN_64 are unset. 512 */ 513 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 514 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 515 uint64_t in_c) 516 { 517 TCGType type; 518 TCGv_i64 t_64; 519 TCGv_i32 t_32, t_desc; 520 TCGv_ptr t_ptr; 521 uint32_t i; 522 523 assert(vece <= (in_32 ? MO_32 : MO_64)); 524 assert(in_32 == NULL || in_64 == NULL); 525 526 /* If we're storing 0, expand oprsz to maxsz. */ 527 if (in_32 == NULL && in_64 == NULL) { 528 in_c = dup_const(vece, in_c); 529 if (in_c == 0) { 530 oprsz = maxsz; 531 } 532 } 533 534 /* Implement inline with a vector type, if possible. 535 * Prefer integer when 64-bit host and no variable dup. 
536 */ 537 type = choose_vector_type(NULL, vece, oprsz, 538 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 539 && (in_64 == NULL || vece == MO_64))); 540 if (type != 0) { 541 TCGv_vec t_vec = tcg_temp_new_vec(type); 542 543 if (in_32) { 544 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 545 } else if (in_64) { 546 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 547 } else { 548 tcg_gen_dupi_vec(vece, t_vec, in_c); 549 } 550 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 551 tcg_temp_free_vec(t_vec); 552 return; 553 } 554 555 /* Otherwise, inline with an integer type, unless "large". */ 556 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 557 t_64 = NULL; 558 t_32 = NULL; 559 560 if (in_32) { 561 /* We are given a 32-bit variable input. For a 64-bit host, 562 use a 64-bit operation unless the 32-bit operation would 563 be simple enough. */ 564 if (TCG_TARGET_REG_BITS == 64 565 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 566 t_64 = tcg_temp_new_i64(); 567 tcg_gen_extu_i32_i64(t_64, in_32); 568 gen_dup_i64(vece, t_64, t_64); 569 } else { 570 t_32 = tcg_temp_new_i32(); 571 gen_dup_i32(vece, t_32, in_32); 572 } 573 } else if (in_64) { 574 /* We are given a 64-bit variable input. */ 575 t_64 = tcg_temp_new_i64(); 576 gen_dup_i64(vece, t_64, in_64); 577 } else { 578 /* We are given a constant input. */ 579 /* For 64-bit hosts, use 64-bit constants for "simple" constants 580 or when we'd need too many 32-bit stores, or when a 64-bit 581 constant is really required. */ 582 if (vece == MO_64 583 || (TCG_TARGET_REG_BITS == 64 584 && (in_c == 0 || in_c == -1 585 || !check_size_impl(oprsz, 4)))) { 586 t_64 = tcg_const_i64(in_c); 587 } else { 588 t_32 = tcg_const_i32(in_c); 589 } 590 } 591 592 /* Implement inline if we picked an implementation size above. */ 593 if (t_32) { 594 for (i = 0; i < oprsz; i += 4) { 595 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 596 } 597 tcg_temp_free_i32(t_32); 598 goto done; 599 } 600 if (t_64) { 601 for (i = 0; i < oprsz; i += 8) { 602 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 603 } 604 tcg_temp_free_i64(t_64); 605 goto done; 606 } 607 } 608 609 /* Otherwise implement out of line. */ 610 t_ptr = tcg_temp_new_ptr(); 611 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 612 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 613 614 if (vece == MO_64) { 615 if (in_64) { 616 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 617 } else { 618 t_64 = tcg_const_i64(in_c); 619 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 620 tcg_temp_free_i64(t_64); 621 } 622 } else { 623 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 624 static dup_fn * const fns[3] = { 625 gen_helper_gvec_dup8, 626 gen_helper_gvec_dup16, 627 gen_helper_gvec_dup32 628 }; 629 630 if (in_32) { 631 fns[vece](t_ptr, t_desc, in_32); 632 } else { 633 t_32 = tcg_temp_new_i32(); 634 if (in_64) { 635 tcg_gen_extrl_i64_i32(t_32, in_64); 636 } else if (vece == MO_8) { 637 tcg_gen_movi_i32(t_32, in_c & 0xff); 638 } else if (vece == MO_16) { 639 tcg_gen_movi_i32(t_32, in_c & 0xffff); 640 } else { 641 tcg_gen_movi_i32(t_32, in_c); 642 } 643 fns[vece](t_ptr, t_desc, t_32); 644 tcg_temp_free_i32(t_32); 645 } 646 } 647 648 tcg_temp_free_ptr(t_ptr); 649 tcg_temp_free_i32(t_desc); 650 return; 651 652 done: 653 if (oprsz < maxsz) { 654 expand_clr(dofs + oprsz, maxsz - oprsz); 655 } 656 } 657 658 /* Likewise, but with zero. */ 659 static void expand_clr(uint32_t dofs, uint32_t maxsz) 660 { 661 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 662 } 663 664 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. 
*/ 665 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 666 bool load_dest, void (*fni)(TCGv_i32, TCGv_i32)) 667 { 668 TCGv_i32 t0 = tcg_temp_new_i32(); 669 TCGv_i32 t1 = tcg_temp_new_i32(); 670 uint32_t i; 671 672 for (i = 0; i < oprsz; i += 4) { 673 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 674 if (load_dest) { 675 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 676 } 677 fni(t1, t0); 678 tcg_gen_st_i32(t1, cpu_env, dofs + i); 679 } 680 tcg_temp_free_i32(t0); 681 tcg_temp_free_i32(t1); 682 } 683 684 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 685 int32_t c, bool load_dest, 686 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 687 { 688 TCGv_i32 t0 = tcg_temp_new_i32(); 689 TCGv_i32 t1 = tcg_temp_new_i32(); 690 uint32_t i; 691 692 for (i = 0; i < oprsz; i += 4) { 693 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 694 if (load_dest) { 695 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 696 } 697 fni(t1, t0, c); 698 tcg_gen_st_i32(t1, cpu_env, dofs + i); 699 } 700 tcg_temp_free_i32(t0); 701 tcg_temp_free_i32(t1); 702 } 703 704 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 705 TCGv_i32 c, bool scalar_first, 706 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 707 { 708 TCGv_i32 t0 = tcg_temp_new_i32(); 709 TCGv_i32 t1 = tcg_temp_new_i32(); 710 uint32_t i; 711 712 for (i = 0; i < oprsz; i += 4) { 713 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 714 if (scalar_first) { 715 fni(t1, c, t0); 716 } else { 717 fni(t1, t0, c); 718 } 719 tcg_gen_st_i32(t1, cpu_env, dofs + i); 720 } 721 tcg_temp_free_i32(t0); 722 tcg_temp_free_i32(t1); 723 } 724 725 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 726 static void expand_3_i32(uint32_t dofs, uint32_t aofs, 727 uint32_t bofs, uint32_t oprsz, bool load_dest, 728 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 729 { 730 TCGv_i32 t0 = tcg_temp_new_i32(); 731 TCGv_i32 t1 = tcg_temp_new_i32(); 732 TCGv_i32 t2 = tcg_temp_new_i32(); 733 uint32_t i; 734 735 for (i = 0; i < oprsz; i += 4) { 736 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 737 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 738 if (load_dest) { 739 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 740 } 741 fni(t2, t0, t1); 742 tcg_gen_st_i32(t2, cpu_env, dofs + i); 743 } 744 tcg_temp_free_i32(t2); 745 tcg_temp_free_i32(t1); 746 tcg_temp_free_i32(t0); 747 } 748 749 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 750 uint32_t oprsz, int32_t c, bool load_dest, 751 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t)) 752 { 753 TCGv_i32 t0 = tcg_temp_new_i32(); 754 TCGv_i32 t1 = tcg_temp_new_i32(); 755 TCGv_i32 t2 = tcg_temp_new_i32(); 756 uint32_t i; 757 758 for (i = 0; i < oprsz; i += 4) { 759 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 760 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 761 if (load_dest) { 762 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 763 } 764 fni(t2, t0, t1, c); 765 tcg_gen_st_i32(t2, cpu_env, dofs + i); 766 } 767 tcg_temp_free_i32(t0); 768 tcg_temp_free_i32(t1); 769 tcg_temp_free_i32(t2); 770 } 771 772 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
*/ 773 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 774 uint32_t cofs, uint32_t oprsz, bool write_aofs, 775 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) 776 { 777 TCGv_i32 t0 = tcg_temp_new_i32(); 778 TCGv_i32 t1 = tcg_temp_new_i32(); 779 TCGv_i32 t2 = tcg_temp_new_i32(); 780 TCGv_i32 t3 = tcg_temp_new_i32(); 781 uint32_t i; 782 783 for (i = 0; i < oprsz; i += 4) { 784 tcg_gen_ld_i32(t1, cpu_env, aofs + i); 785 tcg_gen_ld_i32(t2, cpu_env, bofs + i); 786 tcg_gen_ld_i32(t3, cpu_env, cofs + i); 787 fni(t0, t1, t2, t3); 788 tcg_gen_st_i32(t0, cpu_env, dofs + i); 789 if (write_aofs) { 790 tcg_gen_st_i32(t1, cpu_env, aofs + i); 791 } 792 } 793 tcg_temp_free_i32(t3); 794 tcg_temp_free_i32(t2); 795 tcg_temp_free_i32(t1); 796 tcg_temp_free_i32(t0); 797 } 798 799 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ 800 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 801 bool load_dest, void (*fni)(TCGv_i64, TCGv_i64)) 802 { 803 TCGv_i64 t0 = tcg_temp_new_i64(); 804 TCGv_i64 t1 = tcg_temp_new_i64(); 805 uint32_t i; 806 807 for (i = 0; i < oprsz; i += 8) { 808 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 809 if (load_dest) { 810 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 811 } 812 fni(t1, t0); 813 tcg_gen_st_i64(t1, cpu_env, dofs + i); 814 } 815 tcg_temp_free_i64(t0); 816 tcg_temp_free_i64(t1); 817 } 818 819 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 820 int64_t c, bool load_dest, 821 void (*fni)(TCGv_i64, TCGv_i64, int64_t)) 822 { 823 TCGv_i64 t0 = tcg_temp_new_i64(); 824 TCGv_i64 t1 = tcg_temp_new_i64(); 825 uint32_t i; 826 827 for (i = 0; i < oprsz; i += 8) { 828 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 829 if (load_dest) { 830 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 831 } 832 fni(t1, t0, c); 833 tcg_gen_st_i64(t1, cpu_env, dofs + i); 834 } 835 tcg_temp_free_i64(t0); 836 tcg_temp_free_i64(t1); 837 } 838 839 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 840 TCGv_i64 c, bool scalar_first, 841 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 842 { 843 TCGv_i64 t0 = tcg_temp_new_i64(); 844 TCGv_i64 t1 = tcg_temp_new_i64(); 845 uint32_t i; 846 847 for (i = 0; i < oprsz; i += 8) { 848 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 849 if (scalar_first) { 850 fni(t1, c, t0); 851 } else { 852 fni(t1, t0, c); 853 } 854 tcg_gen_st_i64(t1, cpu_env, dofs + i); 855 } 856 tcg_temp_free_i64(t0); 857 tcg_temp_free_i64(t1); 858 } 859 860 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. 
*/ 861 static void expand_3_i64(uint32_t dofs, uint32_t aofs, 862 uint32_t bofs, uint32_t oprsz, bool load_dest, 863 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 864 { 865 TCGv_i64 t0 = tcg_temp_new_i64(); 866 TCGv_i64 t1 = tcg_temp_new_i64(); 867 TCGv_i64 t2 = tcg_temp_new_i64(); 868 uint32_t i; 869 870 for (i = 0; i < oprsz; i += 8) { 871 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 872 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 873 if (load_dest) { 874 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 875 } 876 fni(t2, t0, t1); 877 tcg_gen_st_i64(t2, cpu_env, dofs + i); 878 } 879 tcg_temp_free_i64(t2); 880 tcg_temp_free_i64(t1); 881 tcg_temp_free_i64(t0); 882 } 883 884 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 885 uint32_t oprsz, int64_t c, bool load_dest, 886 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t)) 887 { 888 TCGv_i64 t0 = tcg_temp_new_i64(); 889 TCGv_i64 t1 = tcg_temp_new_i64(); 890 TCGv_i64 t2 = tcg_temp_new_i64(); 891 uint32_t i; 892 893 for (i = 0; i < oprsz; i += 8) { 894 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 895 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 896 if (load_dest) { 897 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 898 } 899 fni(t2, t0, t1, c); 900 tcg_gen_st_i64(t2, cpu_env, dofs + i); 901 } 902 tcg_temp_free_i64(t0); 903 tcg_temp_free_i64(t1); 904 tcg_temp_free_i64(t2); 905 } 906 907 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ 908 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 909 uint32_t cofs, uint32_t oprsz, bool write_aofs, 910 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 911 { 912 TCGv_i64 t0 = tcg_temp_new_i64(); 913 TCGv_i64 t1 = tcg_temp_new_i64(); 914 TCGv_i64 t2 = tcg_temp_new_i64(); 915 TCGv_i64 t3 = tcg_temp_new_i64(); 916 uint32_t i; 917 918 for (i = 0; i < oprsz; i += 8) { 919 tcg_gen_ld_i64(t1, cpu_env, aofs + i); 920 tcg_gen_ld_i64(t2, cpu_env, bofs + i); 921 tcg_gen_ld_i64(t3, cpu_env, cofs + i); 922 fni(t0, t1, t2, t3); 923 tcg_gen_st_i64(t0, cpu_env, dofs + i); 924 if (write_aofs) { 925 tcg_gen_st_i64(t1, cpu_env, aofs + i); 926 } 927 } 928 tcg_temp_free_i64(t3); 929 tcg_temp_free_i64(t2); 930 tcg_temp_free_i64(t1); 931 tcg_temp_free_i64(t0); 932 } 933 934 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */ 935 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 936 uint32_t oprsz, uint32_t tysz, TCGType type, 937 bool load_dest, 938 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 939 { 940 TCGv_vec t0 = tcg_temp_new_vec(type); 941 TCGv_vec t1 = tcg_temp_new_vec(type); 942 uint32_t i; 943 944 for (i = 0; i < oprsz; i += tysz) { 945 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 946 if (load_dest) { 947 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 948 } 949 fni(vece, t1, t0); 950 tcg_gen_st_vec(t1, cpu_env, dofs + i); 951 } 952 tcg_temp_free_vec(t0); 953 tcg_temp_free_vec(t1); 954 } 955 956 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 957 using host vectors. 
*/ 958 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 959 uint32_t oprsz, uint32_t tysz, TCGType type, 960 int64_t c, bool load_dest, 961 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 962 { 963 TCGv_vec t0 = tcg_temp_new_vec(type); 964 TCGv_vec t1 = tcg_temp_new_vec(type); 965 uint32_t i; 966 967 for (i = 0; i < oprsz; i += tysz) { 968 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 969 if (load_dest) { 970 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 971 } 972 fni(vece, t1, t0, c); 973 tcg_gen_st_vec(t1, cpu_env, dofs + i); 974 } 975 tcg_temp_free_vec(t0); 976 tcg_temp_free_vec(t1); 977 } 978 979 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 980 uint32_t oprsz, uint32_t tysz, TCGType type, 981 TCGv_vec c, bool scalar_first, 982 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 983 { 984 TCGv_vec t0 = tcg_temp_new_vec(type); 985 TCGv_vec t1 = tcg_temp_new_vec(type); 986 uint32_t i; 987 988 for (i = 0; i < oprsz; i += tysz) { 989 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 990 if (scalar_first) { 991 fni(vece, t1, c, t0); 992 } else { 993 fni(vece, t1, t0, c); 994 } 995 tcg_gen_st_vec(t1, cpu_env, dofs + i); 996 } 997 tcg_temp_free_vec(t0); 998 tcg_temp_free_vec(t1); 999 } 1000 1001 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 1002 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1003 uint32_t bofs, uint32_t oprsz, 1004 uint32_t tysz, TCGType type, bool load_dest, 1005 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 1006 { 1007 TCGv_vec t0 = tcg_temp_new_vec(type); 1008 TCGv_vec t1 = tcg_temp_new_vec(type); 1009 TCGv_vec t2 = tcg_temp_new_vec(type); 1010 uint32_t i; 1011 1012 for (i = 0; i < oprsz; i += tysz) { 1013 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1014 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1015 if (load_dest) { 1016 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1017 } 1018 fni(vece, t2, t0, t1); 1019 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1020 } 1021 tcg_temp_free_vec(t2); 1022 tcg_temp_free_vec(t1); 1023 tcg_temp_free_vec(t0); 1024 } 1025 1026 /* 1027 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 1028 * using host vectors. 1029 */ 1030 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1031 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 1032 TCGType type, int64_t c, bool load_dest, 1033 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 1034 int64_t)) 1035 { 1036 TCGv_vec t0 = tcg_temp_new_vec(type); 1037 TCGv_vec t1 = tcg_temp_new_vec(type); 1038 TCGv_vec t2 = tcg_temp_new_vec(type); 1039 uint32_t i; 1040 1041 for (i = 0; i < oprsz; i += tysz) { 1042 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 1043 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 1044 if (load_dest) { 1045 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 1046 } 1047 fni(vece, t2, t0, t1, c); 1048 tcg_gen_st_vec(t2, cpu_env, dofs + i); 1049 } 1050 tcg_temp_free_vec(t0); 1051 tcg_temp_free_vec(t1); 1052 tcg_temp_free_vec(t2); 1053 } 1054 1055 /* Expand OPSZ bytes worth of four-operand operations using host vectors. 
*/ 1056 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 1057 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 1058 uint32_t tysz, TCGType type, bool write_aofs, 1059 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1060 TCGv_vec, TCGv_vec)) 1061 { 1062 TCGv_vec t0 = tcg_temp_new_vec(type); 1063 TCGv_vec t1 = tcg_temp_new_vec(type); 1064 TCGv_vec t2 = tcg_temp_new_vec(type); 1065 TCGv_vec t3 = tcg_temp_new_vec(type); 1066 uint32_t i; 1067 1068 for (i = 0; i < oprsz; i += tysz) { 1069 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1070 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1071 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1072 fni(vece, t0, t1, t2, t3); 1073 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1074 if (write_aofs) { 1075 tcg_gen_st_vec(t1, cpu_env, aofs + i); 1076 } 1077 } 1078 tcg_temp_free_vec(t3); 1079 tcg_temp_free_vec(t2); 1080 tcg_temp_free_vec(t1); 1081 tcg_temp_free_vec(t0); 1082 } 1083 1084 /* Expand a vector two-operand operation. */ 1085 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1086 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1087 { 1088 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1089 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1090 TCGType type; 1091 uint32_t some; 1092 1093 check_size_align(oprsz, maxsz, dofs | aofs); 1094 check_overlap_2(dofs, aofs, maxsz); 1095 1096 type = 0; 1097 if (g->fniv) { 1098 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1099 } 1100 switch (type) { 1101 case TCG_TYPE_V256: 1102 /* Recall that ARM SVE allows vector sizes that are not a 1103 * power of 2, but always a multiple of 16. The intent is 1104 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1105 */ 1106 some = QEMU_ALIGN_DOWN(oprsz, 32); 1107 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1108 g->load_dest, g->fniv); 1109 if (some == oprsz) { 1110 break; 1111 } 1112 dofs += some; 1113 aofs += some; 1114 oprsz -= some; 1115 maxsz -= some; 1116 /* fallthru */ 1117 case TCG_TYPE_V128: 1118 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1119 g->load_dest, g->fniv); 1120 break; 1121 case TCG_TYPE_V64: 1122 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1123 g->load_dest, g->fniv); 1124 break; 1125 1126 case 0: 1127 if (g->fni8 && check_size_impl(oprsz, 8)) { 1128 expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8); 1129 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1130 expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4); 1131 } else { 1132 assert(g->fno != NULL); 1133 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1134 oprsz = maxsz; 1135 } 1136 break; 1137 1138 default: 1139 g_assert_not_reached(); 1140 } 1141 tcg_swap_vecop_list(hold_list); 1142 1143 if (oprsz < maxsz) { 1144 expand_clr(dofs + oprsz, maxsz - oprsz); 1145 } 1146 } 1147 1148 /* Expand a vector operation with two vectors and an immediate. */ 1149 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1150 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1151 { 1152 const TCGOpcode *this_list = g->opt_opc ? 
: vecop_list_empty; 1153 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1154 TCGType type; 1155 uint32_t some; 1156 1157 check_size_align(oprsz, maxsz, dofs | aofs); 1158 check_overlap_2(dofs, aofs, maxsz); 1159 1160 type = 0; 1161 if (g->fniv) { 1162 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1163 } 1164 switch (type) { 1165 case TCG_TYPE_V256: 1166 /* Recall that ARM SVE allows vector sizes that are not a 1167 * power of 2, but always a multiple of 16. The intent is 1168 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1169 */ 1170 some = QEMU_ALIGN_DOWN(oprsz, 32); 1171 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1172 c, g->load_dest, g->fniv); 1173 if (some == oprsz) { 1174 break; 1175 } 1176 dofs += some; 1177 aofs += some; 1178 oprsz -= some; 1179 maxsz -= some; 1180 /* fallthru */ 1181 case TCG_TYPE_V128: 1182 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1183 c, g->load_dest, g->fniv); 1184 break; 1185 case TCG_TYPE_V64: 1186 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1187 c, g->load_dest, g->fniv); 1188 break; 1189 1190 case 0: 1191 if (g->fni8 && check_size_impl(oprsz, 8)) { 1192 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1193 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1194 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1195 } else { 1196 if (g->fno) { 1197 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1198 } else { 1199 TCGv_i64 tcg_c = tcg_const_i64(c); 1200 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1201 maxsz, c, g->fnoi); 1202 tcg_temp_free_i64(tcg_c); 1203 } 1204 oprsz = maxsz; 1205 } 1206 break; 1207 1208 default: 1209 g_assert_not_reached(); 1210 } 1211 tcg_swap_vecop_list(hold_list); 1212 1213 if (oprsz < maxsz) { 1214 expand_clr(dofs + oprsz, maxsz - oprsz); 1215 } 1216 } 1217 1218 /* Expand a vector operation with two vectors and a scalar. */ 1219 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1220 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1221 { 1222 TCGType type; 1223 1224 check_size_align(oprsz, maxsz, dofs | aofs); 1225 check_overlap_2(dofs, aofs, maxsz); 1226 1227 type = 0; 1228 if (g->fniv) { 1229 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1230 } 1231 if (type != 0) { 1232 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1233 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1234 TCGv_vec t_vec = tcg_temp_new_vec(type); 1235 uint32_t some; 1236 1237 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1238 1239 switch (type) { 1240 case TCG_TYPE_V256: 1241 /* Recall that ARM SVE allows vector sizes that are not a 1242 * power of 2, but always a multiple of 16. The intent is 1243 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1244 */ 1245 some = QEMU_ALIGN_DOWN(oprsz, 32); 1246 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1247 t_vec, g->scalar_first, g->fniv); 1248 if (some == oprsz) { 1249 break; 1250 } 1251 dofs += some; 1252 aofs += some; 1253 oprsz -= some; 1254 maxsz -= some; 1255 /* fallthru */ 1256 1257 case TCG_TYPE_V128: 1258 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1259 t_vec, g->scalar_first, g->fniv); 1260 break; 1261 1262 case TCG_TYPE_V64: 1263 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1264 t_vec, g->scalar_first, g->fniv); 1265 break; 1266 1267 default: 1268 g_assert_not_reached(); 1269 } 1270 tcg_temp_free_vec(t_vec); 1271 tcg_swap_vecop_list(hold_list); 1272 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1273 TCGv_i64 t64 = tcg_temp_new_i64(); 1274 1275 gen_dup_i64(g->vece, t64, c); 1276 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1277 tcg_temp_free_i64(t64); 1278 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1279 TCGv_i32 t32 = tcg_temp_new_i32(); 1280 1281 tcg_gen_extrl_i64_i32(t32, c); 1282 gen_dup_i32(g->vece, t32, t32); 1283 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1284 tcg_temp_free_i32(t32); 1285 } else { 1286 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1287 return; 1288 } 1289 1290 if (oprsz < maxsz) { 1291 expand_clr(dofs + oprsz, maxsz - oprsz); 1292 } 1293 } 1294 1295 /* Expand a vector three-operand operation. */ 1296 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1297 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1298 { 1299 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1300 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1301 TCGType type; 1302 uint32_t some; 1303 1304 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1305 check_overlap_3(dofs, aofs, bofs, maxsz); 1306 1307 type = 0; 1308 if (g->fniv) { 1309 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1310 } 1311 switch (type) { 1312 case TCG_TYPE_V256: 1313 /* Recall that ARM SVE allows vector sizes that are not a 1314 * power of 2, but always a multiple of 16. The intent is 1315 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1316 */ 1317 some = QEMU_ALIGN_DOWN(oprsz, 32); 1318 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1319 g->load_dest, g->fniv); 1320 if (some == oprsz) { 1321 break; 1322 } 1323 dofs += some; 1324 aofs += some; 1325 bofs += some; 1326 oprsz -= some; 1327 maxsz -= some; 1328 /* fallthru */ 1329 case TCG_TYPE_V128: 1330 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1331 g->load_dest, g->fniv); 1332 break; 1333 case TCG_TYPE_V64: 1334 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1335 g->load_dest, g->fniv); 1336 break; 1337 1338 case 0: 1339 if (g->fni8 && check_size_impl(oprsz, 8)) { 1340 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1341 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1342 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1343 } else { 1344 assert(g->fno != NULL); 1345 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1346 maxsz, g->data, g->fno); 1347 oprsz = maxsz; 1348 } 1349 break; 1350 1351 default: 1352 g_assert_not_reached(); 1353 } 1354 tcg_swap_vecop_list(hold_list); 1355 1356 if (oprsz < maxsz) { 1357 expand_clr(dofs + oprsz, maxsz - oprsz); 1358 } 1359 } 1360 1361 /* Expand a vector operation with three vectors and an immediate. 
*/ 1362 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1363 uint32_t oprsz, uint32_t maxsz, int64_t c, 1364 const GVecGen3i *g) 1365 { 1366 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1367 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1368 TCGType type; 1369 uint32_t some; 1370 1371 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1372 check_overlap_3(dofs, aofs, bofs, maxsz); 1373 1374 type = 0; 1375 if (g->fniv) { 1376 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1377 } 1378 switch (type) { 1379 case TCG_TYPE_V256: 1380 /* 1381 * Recall that ARM SVE allows vector sizes that are not a 1382 * power of 2, but always a multiple of 16. The intent is 1383 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1384 */ 1385 some = QEMU_ALIGN_DOWN(oprsz, 32); 1386 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1387 c, g->load_dest, g->fniv); 1388 if (some == oprsz) { 1389 break; 1390 } 1391 dofs += some; 1392 aofs += some; 1393 bofs += some; 1394 oprsz -= some; 1395 maxsz -= some; 1396 /* fallthru */ 1397 case TCG_TYPE_V128: 1398 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1399 c, g->load_dest, g->fniv); 1400 break; 1401 case TCG_TYPE_V64: 1402 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1403 c, g->load_dest, g->fniv); 1404 break; 1405 1406 case 0: 1407 if (g->fni8 && check_size_impl(oprsz, 8)) { 1408 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1409 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1410 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1411 } else { 1412 assert(g->fno != NULL); 1413 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1414 oprsz = maxsz; 1415 } 1416 break; 1417 1418 default: 1419 g_assert_not_reached(); 1420 } 1421 tcg_swap_vecop_list(hold_list); 1422 1423 if (oprsz < maxsz) { 1424 expand_clr(dofs + oprsz, maxsz - oprsz); 1425 } 1426 } 1427 1428 /* Expand a vector four-operand operation. */ 1429 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1430 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1431 { 1432 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1433 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1434 TCGType type; 1435 uint32_t some; 1436 1437 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1438 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1439 1440 type = 0; 1441 if (g->fniv) { 1442 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1443 } 1444 switch (type) { 1445 case TCG_TYPE_V256: 1446 /* Recall that ARM SVE allows vector sizes that are not a 1447 * power of 2, but always a multiple of 16. The intent is 1448 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1449 */ 1450 some = QEMU_ALIGN_DOWN(oprsz, 32); 1451 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1452 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1453 if (some == oprsz) { 1454 break; 1455 } 1456 dofs += some; 1457 aofs += some; 1458 bofs += some; 1459 cofs += some; 1460 oprsz -= some; 1461 maxsz -= some; 1462 /* fallthru */ 1463 case TCG_TYPE_V128: 1464 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1465 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1466 break; 1467 case TCG_TYPE_V64: 1468 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1469 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1470 break; 1471 1472 case 0: 1473 if (g->fni8 && check_size_impl(oprsz, 8)) { 1474 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1475 g->write_aofs, g->fni8); 1476 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1477 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1478 g->write_aofs, g->fni4); 1479 } else { 1480 assert(g->fno != NULL); 1481 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1482 oprsz, maxsz, g->data, g->fno); 1483 oprsz = maxsz; 1484 } 1485 break; 1486 1487 default: 1488 g_assert_not_reached(); 1489 } 1490 tcg_swap_vecop_list(hold_list); 1491 1492 if (oprsz < maxsz) { 1493 expand_clr(dofs + oprsz, maxsz - oprsz); 1494 } 1495 } 1496 1497 /* 1498 * Expand specific vector operations. 1499 */ 1500 1501 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1502 { 1503 tcg_gen_mov_vec(a, b); 1504 } 1505 1506 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1507 uint32_t oprsz, uint32_t maxsz) 1508 { 1509 static const GVecGen2 g = { 1510 .fni8 = tcg_gen_mov_i64, 1511 .fniv = vec_mov2, 1512 .fno = gen_helper_gvec_mov, 1513 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1514 }; 1515 if (dofs != aofs) { 1516 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1517 } else { 1518 check_size_align(oprsz, maxsz, dofs); 1519 if (oprsz < maxsz) { 1520 expand_clr(dofs + oprsz, maxsz - oprsz); 1521 } 1522 } 1523 } 1524 1525 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1526 uint32_t maxsz, TCGv_i32 in) 1527 { 1528 check_size_align(oprsz, maxsz, dofs); 1529 tcg_debug_assert(vece <= MO_32); 1530 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1531 } 1532 1533 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1534 uint32_t maxsz, TCGv_i64 in) 1535 { 1536 check_size_align(oprsz, maxsz, dofs); 1537 tcg_debug_assert(vece <= MO_64); 1538 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1539 } 1540 1541 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1542 uint32_t oprsz, uint32_t maxsz) 1543 { 1544 check_size_align(oprsz, maxsz, dofs); 1545 if (vece <= MO_64) { 1546 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1547 if (type != 0) { 1548 TCGv_vec t_vec = tcg_temp_new_vec(type); 1549 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1550 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1551 tcg_temp_free_vec(t_vec); 1552 } else if (vece <= MO_32) { 1553 TCGv_i32 in = tcg_temp_new_i32(); 1554 switch (vece) { 1555 case MO_8: 1556 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1557 break; 1558 case MO_16: 1559 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1560 break; 1561 default: 1562 tcg_gen_ld_i32(in, cpu_env, aofs); 1563 break; 1564 } 1565 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1566 tcg_temp_free_i32(in); 1567 } else { 1568 TCGv_i64 in = tcg_temp_new_i64(); 1569 tcg_gen_ld_i64(in, cpu_env, aofs); 1570 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1571 tcg_temp_free_i64(in); 1572 } 1573 } else { 1574 /* 128-bit duplicate. 
*/ 1575 /* ??? Dup to 256-bit vector. */ 1576 int i; 1577 1578 tcg_debug_assert(vece == 4); 1579 tcg_debug_assert(oprsz >= 16); 1580 if (TCG_TARGET_HAS_v128) { 1581 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1582 1583 tcg_gen_ld_vec(in, cpu_env, aofs); 1584 for (i = 0; i < oprsz; i += 16) { 1585 tcg_gen_st_vec(in, cpu_env, dofs + i); 1586 } 1587 tcg_temp_free_vec(in); 1588 } else { 1589 TCGv_i64 in0 = tcg_temp_new_i64(); 1590 TCGv_i64 in1 = tcg_temp_new_i64(); 1591 1592 tcg_gen_ld_i64(in0, cpu_env, aofs); 1593 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1594 for (i = 0; i < oprsz; i += 16) { 1595 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1596 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1597 } 1598 tcg_temp_free_i64(in0); 1599 tcg_temp_free_i64(in1); 1600 } 1601 if (oprsz < maxsz) { 1602 expand_clr(dofs + oprsz, maxsz - oprsz); 1603 } 1604 } 1605 } 1606 1607 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz, 1608 uint32_t maxsz, uint64_t x) 1609 { 1610 check_size_align(oprsz, maxsz, dofs); 1611 do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x); 1612 } 1613 1614 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1615 uint32_t oprsz, uint32_t maxsz) 1616 { 1617 static const GVecGen2 g = { 1618 .fni8 = tcg_gen_not_i64, 1619 .fniv = tcg_gen_not_vec, 1620 .fno = gen_helper_gvec_not, 1621 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1622 }; 1623 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1624 } 1625 1626 /* Perform a vector addition using normal addition and a mask. The mask 1627 should be the sign bit of each lane. This 6-operation form is more 1628 efficient than separate additions when there are 4 or more lanes in 1629 the 64-bit operation. */ 1630 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1631 { 1632 TCGv_i64 t1 = tcg_temp_new_i64(); 1633 TCGv_i64 t2 = tcg_temp_new_i64(); 1634 TCGv_i64 t3 = tcg_temp_new_i64(); 1635 1636 tcg_gen_andc_i64(t1, a, m); 1637 tcg_gen_andc_i64(t2, b, m); 1638 tcg_gen_xor_i64(t3, a, b); 1639 tcg_gen_add_i64(d, t1, t2); 1640 tcg_gen_and_i64(t3, t3, m); 1641 tcg_gen_xor_i64(d, d, t3); 1642 1643 tcg_temp_free_i64(t1); 1644 tcg_temp_free_i64(t2); 1645 tcg_temp_free_i64(t3); 1646 } 1647 1648 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1649 { 1650 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1651 gen_addv_mask(d, a, b, m); 1652 tcg_temp_free_i64(m); 1653 } 1654 1655 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1656 { 1657 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1658 gen_addv_mask(d, a, b, m); 1659 tcg_temp_free_i64(m); 1660 } 1661 1662 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1663 { 1664 TCGv_i64 t1 = tcg_temp_new_i64(); 1665 TCGv_i64 t2 = tcg_temp_new_i64(); 1666 1667 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1668 tcg_gen_add_i64(t2, a, b); 1669 tcg_gen_add_i64(t1, t1, b); 1670 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1671 1672 tcg_temp_free_i64(t1); 1673 tcg_temp_free_i64(t2); 1674 } 1675 1676 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1677 1678 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1679 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1680 { 1681 static const GVecGen3 g[4] = { 1682 { .fni8 = tcg_gen_vec_add8_i64, 1683 .fniv = tcg_gen_add_vec, 1684 .fno = gen_helper_gvec_add8, 1685 .opt_opc = vecop_list_add, 1686 .vece = MO_8 }, 1687 { .fni8 = tcg_gen_vec_add16_i64, 1688 .fniv = tcg_gen_add_vec, 1689 .fno = gen_helper_gvec_add16, 1690 .opt_opc = vecop_list_add, 1691 .vece = MO_16 }, 
1692 { .fni4 = tcg_gen_add_i32, 1693 .fniv = tcg_gen_add_vec, 1694 .fno = gen_helper_gvec_add32, 1695 .opt_opc = vecop_list_add, 1696 .vece = MO_32 }, 1697 { .fni8 = tcg_gen_add_i64, 1698 .fniv = tcg_gen_add_vec, 1699 .fno = gen_helper_gvec_add64, 1700 .opt_opc = vecop_list_add, 1701 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1702 .vece = MO_64 }, 1703 }; 1704 1705 tcg_debug_assert(vece <= MO_64); 1706 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1707 } 1708 1709 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1710 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1711 { 1712 static const GVecGen2s g[4] = { 1713 { .fni8 = tcg_gen_vec_add8_i64, 1714 .fniv = tcg_gen_add_vec, 1715 .fno = gen_helper_gvec_adds8, 1716 .opt_opc = vecop_list_add, 1717 .vece = MO_8 }, 1718 { .fni8 = tcg_gen_vec_add16_i64, 1719 .fniv = tcg_gen_add_vec, 1720 .fno = gen_helper_gvec_adds16, 1721 .opt_opc = vecop_list_add, 1722 .vece = MO_16 }, 1723 { .fni4 = tcg_gen_add_i32, 1724 .fniv = tcg_gen_add_vec, 1725 .fno = gen_helper_gvec_adds32, 1726 .opt_opc = vecop_list_add, 1727 .vece = MO_32 }, 1728 { .fni8 = tcg_gen_add_i64, 1729 .fniv = tcg_gen_add_vec, 1730 .fno = gen_helper_gvec_adds64, 1731 .opt_opc = vecop_list_add, 1732 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1733 .vece = MO_64 }, 1734 }; 1735 1736 tcg_debug_assert(vece <= MO_64); 1737 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1738 } 1739 1740 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1741 int64_t c, uint32_t oprsz, uint32_t maxsz) 1742 { 1743 TCGv_i64 tmp = tcg_const_i64(c); 1744 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1745 tcg_temp_free_i64(tmp); 1746 } 1747 1748 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 1749 1750 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1751 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1752 { 1753 static const GVecGen2s g[4] = { 1754 { .fni8 = tcg_gen_vec_sub8_i64, 1755 .fniv = tcg_gen_sub_vec, 1756 .fno = gen_helper_gvec_subs8, 1757 .opt_opc = vecop_list_sub, 1758 .vece = MO_8 }, 1759 { .fni8 = tcg_gen_vec_sub16_i64, 1760 .fniv = tcg_gen_sub_vec, 1761 .fno = gen_helper_gvec_subs16, 1762 .opt_opc = vecop_list_sub, 1763 .vece = MO_16 }, 1764 { .fni4 = tcg_gen_sub_i32, 1765 .fniv = tcg_gen_sub_vec, 1766 .fno = gen_helper_gvec_subs32, 1767 .opt_opc = vecop_list_sub, 1768 .vece = MO_32 }, 1769 { .fni8 = tcg_gen_sub_i64, 1770 .fniv = tcg_gen_sub_vec, 1771 .fno = gen_helper_gvec_subs64, 1772 .opt_opc = vecop_list_sub, 1773 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1774 .vece = MO_64 }, 1775 }; 1776 1777 tcg_debug_assert(vece <= MO_64); 1778 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1779 } 1780 1781 /* Perform a vector subtraction using normal subtraction and a mask. 1782 Compare gen_addv_mask above. 
*/ 1783 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1784 { 1785 TCGv_i64 t1 = tcg_temp_new_i64(); 1786 TCGv_i64 t2 = tcg_temp_new_i64(); 1787 TCGv_i64 t3 = tcg_temp_new_i64(); 1788 1789 tcg_gen_or_i64(t1, a, m); 1790 tcg_gen_andc_i64(t2, b, m); 1791 tcg_gen_eqv_i64(t3, a, b); 1792 tcg_gen_sub_i64(d, t1, t2); 1793 tcg_gen_and_i64(t3, t3, m); 1794 tcg_gen_xor_i64(d, d, t3); 1795 1796 tcg_temp_free_i64(t1); 1797 tcg_temp_free_i64(t2); 1798 tcg_temp_free_i64(t3); 1799 } 1800 1801 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1802 { 1803 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1804 gen_subv_mask(d, a, b, m); 1805 tcg_temp_free_i64(m); 1806 } 1807 1808 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1809 { 1810 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1811 gen_subv_mask(d, a, b, m); 1812 tcg_temp_free_i64(m); 1813 } 1814 1815 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1816 { 1817 TCGv_i64 t1 = tcg_temp_new_i64(); 1818 TCGv_i64 t2 = tcg_temp_new_i64(); 1819 1820 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1821 tcg_gen_sub_i64(t2, a, b); 1822 tcg_gen_sub_i64(t1, a, t1); 1823 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1824 1825 tcg_temp_free_i64(t1); 1826 tcg_temp_free_i64(t2); 1827 } 1828 1829 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1830 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1831 { 1832 static const GVecGen3 g[4] = { 1833 { .fni8 = tcg_gen_vec_sub8_i64, 1834 .fniv = tcg_gen_sub_vec, 1835 .fno = gen_helper_gvec_sub8, 1836 .opt_opc = vecop_list_sub, 1837 .vece = MO_8 }, 1838 { .fni8 = tcg_gen_vec_sub16_i64, 1839 .fniv = tcg_gen_sub_vec, 1840 .fno = gen_helper_gvec_sub16, 1841 .opt_opc = vecop_list_sub, 1842 .vece = MO_16 }, 1843 { .fni4 = tcg_gen_sub_i32, 1844 .fniv = tcg_gen_sub_vec, 1845 .fno = gen_helper_gvec_sub32, 1846 .opt_opc = vecop_list_sub, 1847 .vece = MO_32 }, 1848 { .fni8 = tcg_gen_sub_i64, 1849 .fniv = tcg_gen_sub_vec, 1850 .fno = gen_helper_gvec_sub64, 1851 .opt_opc = vecop_list_sub, 1852 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1853 .vece = MO_64 }, 1854 }; 1855 1856 tcg_debug_assert(vece <= MO_64); 1857 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1858 } 1859 1860 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 1861 1862 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1863 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1864 { 1865 static const GVecGen3 g[4] = { 1866 { .fniv = tcg_gen_mul_vec, 1867 .fno = gen_helper_gvec_mul8, 1868 .opt_opc = vecop_list_mul, 1869 .vece = MO_8 }, 1870 { .fniv = tcg_gen_mul_vec, 1871 .fno = gen_helper_gvec_mul16, 1872 .opt_opc = vecop_list_mul, 1873 .vece = MO_16 }, 1874 { .fni4 = tcg_gen_mul_i32, 1875 .fniv = tcg_gen_mul_vec, 1876 .fno = gen_helper_gvec_mul32, 1877 .opt_opc = vecop_list_mul, 1878 .vece = MO_32 }, 1879 { .fni8 = tcg_gen_mul_i64, 1880 .fniv = tcg_gen_mul_vec, 1881 .fno = gen_helper_gvec_mul64, 1882 .opt_opc = vecop_list_mul, 1883 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1884 .vece = MO_64 }, 1885 }; 1886 1887 tcg_debug_assert(vece <= MO_64); 1888 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1889 } 1890 1891 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1892 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1893 { 1894 static const GVecGen2s g[4] = { 1895 { .fniv = tcg_gen_mul_vec, 1896 .fno = gen_helper_gvec_muls8, 1897 .opt_opc = vecop_list_mul, 1898 .vece = MO_8 }, 1899 { .fniv = tcg_gen_mul_vec, 1900 .fno = 
gen_helper_gvec_muls16, 1901 .opt_opc = vecop_list_mul, 1902 .vece = MO_16 }, 1903 { .fni4 = tcg_gen_mul_i32, 1904 .fniv = tcg_gen_mul_vec, 1905 .fno = gen_helper_gvec_muls32, 1906 .opt_opc = vecop_list_mul, 1907 .vece = MO_32 }, 1908 { .fni8 = tcg_gen_mul_i64, 1909 .fniv = tcg_gen_mul_vec, 1910 .fno = gen_helper_gvec_muls64, 1911 .opt_opc = vecop_list_mul, 1912 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1913 .vece = MO_64 }, 1914 }; 1915 1916 tcg_debug_assert(vece <= MO_64); 1917 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1918 } 1919 1920 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1921 int64_t c, uint32_t oprsz, uint32_t maxsz) 1922 { 1923 TCGv_i64 tmp = tcg_const_i64(c); 1924 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1925 tcg_temp_free_i64(tmp); 1926 } 1927 1928 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1929 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1930 { 1931 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 1932 static const GVecGen3 g[4] = { 1933 { .fniv = tcg_gen_ssadd_vec, 1934 .fno = gen_helper_gvec_ssadd8, 1935 .opt_opc = vecop_list, 1936 .vece = MO_8 }, 1937 { .fniv = tcg_gen_ssadd_vec, 1938 .fno = gen_helper_gvec_ssadd16, 1939 .opt_opc = vecop_list, 1940 .vece = MO_16 }, 1941 { .fniv = tcg_gen_ssadd_vec, 1942 .fno = gen_helper_gvec_ssadd32, 1943 .opt_opc = vecop_list, 1944 .vece = MO_32 }, 1945 { .fniv = tcg_gen_ssadd_vec, 1946 .fno = gen_helper_gvec_ssadd64, 1947 .opt_opc = vecop_list, 1948 .vece = MO_64 }, 1949 }; 1950 tcg_debug_assert(vece <= MO_64); 1951 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1952 } 1953 1954 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 1955 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1956 { 1957 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 1958 static const GVecGen3 g[4] = { 1959 { .fniv = tcg_gen_sssub_vec, 1960 .fno = gen_helper_gvec_sssub8, 1961 .opt_opc = vecop_list, 1962 .vece = MO_8 }, 1963 { .fniv = tcg_gen_sssub_vec, 1964 .fno = gen_helper_gvec_sssub16, 1965 .opt_opc = vecop_list, 1966 .vece = MO_16 }, 1967 { .fniv = tcg_gen_sssub_vec, 1968 .fno = gen_helper_gvec_sssub32, 1969 .opt_opc = vecop_list, 1970 .vece = MO_32 }, 1971 { .fniv = tcg_gen_sssub_vec, 1972 .fno = gen_helper_gvec_sssub64, 1973 .opt_opc = vecop_list, 1974 .vece = MO_64 }, 1975 }; 1976 tcg_debug_assert(vece <= MO_64); 1977 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1978 } 1979 1980 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1981 { 1982 TCGv_i32 max = tcg_const_i32(-1); 1983 tcg_gen_add_i32(d, a, b); 1984 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 1985 tcg_temp_free_i32(max); 1986 } 1987 1988 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1989 { 1990 TCGv_i64 max = tcg_const_i64(-1); 1991 tcg_gen_add_i64(d, a, b); 1992 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 1993 tcg_temp_free_i64(max); 1994 } 1995 1996 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1997 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1998 { 1999 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2000 static const GVecGen3 g[4] = { 2001 { .fniv = tcg_gen_usadd_vec, 2002 .fno = gen_helper_gvec_usadd8, 2003 .opt_opc = vecop_list, 2004 .vece = MO_8 }, 2005 { .fniv = tcg_gen_usadd_vec, 2006 .fno = gen_helper_gvec_usadd16, 2007 .opt_opc = vecop_list, 2008 .vece = MO_16 }, 2009 { .fni4 = tcg_gen_usadd_i32, 2010 .fniv = tcg_gen_usadd_vec, 2011 .fno = 
gen_helper_gvec_usadd32, 2012 .opt_opc = vecop_list, 2013 .vece = MO_32 }, 2014 { .fni8 = tcg_gen_usadd_i64, 2015 .fniv = tcg_gen_usadd_vec, 2016 .fno = gen_helper_gvec_usadd64, 2017 .opt_opc = vecop_list, 2018 .vece = MO_64 } 2019 }; 2020 tcg_debug_assert(vece <= MO_64); 2021 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2022 } 2023 2024 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2025 { 2026 TCGv_i32 min = tcg_const_i32(0); 2027 tcg_gen_sub_i32(d, a, b); 2028 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2029 tcg_temp_free_i32(min); 2030 } 2031 2032 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2033 { 2034 TCGv_i64 min = tcg_const_i64(0); 2035 tcg_gen_sub_i64(d, a, b); 2036 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2037 tcg_temp_free_i64(min); 2038 } 2039 2040 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2041 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2042 { 2043 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2044 static const GVecGen3 g[4] = { 2045 { .fniv = tcg_gen_ussub_vec, 2046 .fno = gen_helper_gvec_ussub8, 2047 .opt_opc = vecop_list, 2048 .vece = MO_8 }, 2049 { .fniv = tcg_gen_ussub_vec, 2050 .fno = gen_helper_gvec_ussub16, 2051 .opt_opc = vecop_list, 2052 .vece = MO_16 }, 2053 { .fni4 = tcg_gen_ussub_i32, 2054 .fniv = tcg_gen_ussub_vec, 2055 .fno = gen_helper_gvec_ussub32, 2056 .opt_opc = vecop_list, 2057 .vece = MO_32 }, 2058 { .fni8 = tcg_gen_ussub_i64, 2059 .fniv = tcg_gen_ussub_vec, 2060 .fno = gen_helper_gvec_ussub64, 2061 .opt_opc = vecop_list, 2062 .vece = MO_64 } 2063 }; 2064 tcg_debug_assert(vece <= MO_64); 2065 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2066 } 2067 2068 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2069 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2070 { 2071 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2072 static const GVecGen3 g[4] = { 2073 { .fniv = tcg_gen_smin_vec, 2074 .fno = gen_helper_gvec_smin8, 2075 .opt_opc = vecop_list, 2076 .vece = MO_8 }, 2077 { .fniv = tcg_gen_smin_vec, 2078 .fno = gen_helper_gvec_smin16, 2079 .opt_opc = vecop_list, 2080 .vece = MO_16 }, 2081 { .fni4 = tcg_gen_smin_i32, 2082 .fniv = tcg_gen_smin_vec, 2083 .fno = gen_helper_gvec_smin32, 2084 .opt_opc = vecop_list, 2085 .vece = MO_32 }, 2086 { .fni8 = tcg_gen_smin_i64, 2087 .fniv = tcg_gen_smin_vec, 2088 .fno = gen_helper_gvec_smin64, 2089 .opt_opc = vecop_list, 2090 .vece = MO_64 } 2091 }; 2092 tcg_debug_assert(vece <= MO_64); 2093 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2094 } 2095 2096 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2097 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2098 { 2099 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2100 static const GVecGen3 g[4] = { 2101 { .fniv = tcg_gen_umin_vec, 2102 .fno = gen_helper_gvec_umin8, 2103 .opt_opc = vecop_list, 2104 .vece = MO_8 }, 2105 { .fniv = tcg_gen_umin_vec, 2106 .fno = gen_helper_gvec_umin16, 2107 .opt_opc = vecop_list, 2108 .vece = MO_16 }, 2109 { .fni4 = tcg_gen_umin_i32, 2110 .fniv = tcg_gen_umin_vec, 2111 .fno = gen_helper_gvec_umin32, 2112 .opt_opc = vecop_list, 2113 .vece = MO_32 }, 2114 { .fni8 = tcg_gen_umin_i64, 2115 .fniv = tcg_gen_umin_vec, 2116 .fno = gen_helper_gvec_umin64, 2117 .opt_opc = vecop_list, 2118 .vece = MO_64 } 2119 }; 2120 tcg_debug_assert(vece <= MO_64); 2121 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2122 } 2123 2124 void 
tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2125 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2126 { 2127 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2128 static const GVecGen3 g[4] = { 2129 { .fniv = tcg_gen_smax_vec, 2130 .fno = gen_helper_gvec_smax8, 2131 .opt_opc = vecop_list, 2132 .vece = MO_8 }, 2133 { .fniv = tcg_gen_smax_vec, 2134 .fno = gen_helper_gvec_smax16, 2135 .opt_opc = vecop_list, 2136 .vece = MO_16 }, 2137 { .fni4 = tcg_gen_smax_i32, 2138 .fniv = tcg_gen_smax_vec, 2139 .fno = gen_helper_gvec_smax32, 2140 .opt_opc = vecop_list, 2141 .vece = MO_32 }, 2142 { .fni8 = tcg_gen_smax_i64, 2143 .fniv = tcg_gen_smax_vec, 2144 .fno = gen_helper_gvec_smax64, 2145 .opt_opc = vecop_list, 2146 .vece = MO_64 } 2147 }; 2148 tcg_debug_assert(vece <= MO_64); 2149 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2150 } 2151 2152 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2153 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2154 { 2155 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2156 static const GVecGen3 g[4] = { 2157 { .fniv = tcg_gen_umax_vec, 2158 .fno = gen_helper_gvec_umax8, 2159 .opt_opc = vecop_list, 2160 .vece = MO_8 }, 2161 { .fniv = tcg_gen_umax_vec, 2162 .fno = gen_helper_gvec_umax16, 2163 .opt_opc = vecop_list, 2164 .vece = MO_16 }, 2165 { .fni4 = tcg_gen_umax_i32, 2166 .fniv = tcg_gen_umax_vec, 2167 .fno = gen_helper_gvec_umax32, 2168 .opt_opc = vecop_list, 2169 .vece = MO_32 }, 2170 { .fni8 = tcg_gen_umax_i64, 2171 .fniv = tcg_gen_umax_vec, 2172 .fno = gen_helper_gvec_umax64, 2173 .opt_opc = vecop_list, 2174 .vece = MO_64 } 2175 }; 2176 tcg_debug_assert(vece <= MO_64); 2177 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2178 } 2179 2180 /* Perform a vector negation using normal negation and a mask. 2181 Compare gen_subv_mask above. 
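   Again M has only the msb of each element set.  Clearing the msb of each
   B element first (the andc into t2) keeps every per-element value of
   M - t2 within [1, 0x80], so the subtraction cannot borrow across an
   element boundary; the xor with M & ~B afterwards restores the correct
   msb.  For example, with MO_8 an element of 0x01 gives
   0x80 - 0x01 = 0x7f and then 0x7f ^ 0x80 = 0xff, i.e. -1 as required.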
*/ 2182 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2183 { 2184 TCGv_i64 t2 = tcg_temp_new_i64(); 2185 TCGv_i64 t3 = tcg_temp_new_i64(); 2186 2187 tcg_gen_andc_i64(t3, m, b); 2188 tcg_gen_andc_i64(t2, b, m); 2189 tcg_gen_sub_i64(d, m, t2); 2190 tcg_gen_xor_i64(d, d, t3); 2191 2192 tcg_temp_free_i64(t2); 2193 tcg_temp_free_i64(t3); 2194 } 2195 2196 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2197 { 2198 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2199 gen_negv_mask(d, b, m); 2200 tcg_temp_free_i64(m); 2201 } 2202 2203 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2204 { 2205 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2206 gen_negv_mask(d, b, m); 2207 tcg_temp_free_i64(m); 2208 } 2209 2210 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2211 { 2212 TCGv_i64 t1 = tcg_temp_new_i64(); 2213 TCGv_i64 t2 = tcg_temp_new_i64(); 2214 2215 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2216 tcg_gen_neg_i64(t2, b); 2217 tcg_gen_neg_i64(t1, t1); 2218 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2219 2220 tcg_temp_free_i64(t1); 2221 tcg_temp_free_i64(t2); 2222 } 2223 2224 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2225 uint32_t oprsz, uint32_t maxsz) 2226 { 2227 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2228 static const GVecGen2 g[4] = { 2229 { .fni8 = tcg_gen_vec_neg8_i64, 2230 .fniv = tcg_gen_neg_vec, 2231 .fno = gen_helper_gvec_neg8, 2232 .opt_opc = vecop_list, 2233 .vece = MO_8 }, 2234 { .fni8 = tcg_gen_vec_neg16_i64, 2235 .fniv = tcg_gen_neg_vec, 2236 .fno = gen_helper_gvec_neg16, 2237 .opt_opc = vecop_list, 2238 .vece = MO_16 }, 2239 { .fni4 = tcg_gen_neg_i32, 2240 .fniv = tcg_gen_neg_vec, 2241 .fno = gen_helper_gvec_neg32, 2242 .opt_opc = vecop_list, 2243 .vece = MO_32 }, 2244 { .fni8 = tcg_gen_neg_i64, 2245 .fniv = tcg_gen_neg_vec, 2246 .fno = gen_helper_gvec_neg64, 2247 .opt_opc = vecop_list, 2248 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2249 .vece = MO_64 }, 2250 }; 2251 2252 tcg_debug_assert(vece <= MO_64); 2253 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2254 } 2255 2256 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2257 { 2258 TCGv_i64 t = tcg_temp_new_i64(); 2259 int nbit = 8 << vece; 2260 2261 /* Create -1 for each negative element. */ 2262 tcg_gen_shri_i64(t, b, nbit - 1); 2263 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2264 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2265 2266 /* 2267 * Invert (via xor -1) and add one (via sub -1). 2268 * Because of the ordering the msb is cleared, 2269 * so we never have carry into the next element. 
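 * (After the multiply, T holds -1 in every negative element and 0 in
 * every non-negative element, so the xor inverts exactly the negative
 * elements and leaves the rest untouched.)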
2270 */ 2271 tcg_gen_xor_i64(d, b, t); 2272 tcg_gen_sub_i64(d, d, t); 2273 2274 tcg_temp_free_i64(t); 2275 } 2276 2277 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2278 { 2279 gen_absv_mask(d, b, MO_8); 2280 } 2281 2282 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2283 { 2284 gen_absv_mask(d, b, MO_16); 2285 } 2286 2287 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2288 uint32_t oprsz, uint32_t maxsz) 2289 { 2290 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2291 static const GVecGen2 g[4] = { 2292 { .fni8 = tcg_gen_vec_abs8_i64, 2293 .fniv = tcg_gen_abs_vec, 2294 .fno = gen_helper_gvec_abs8, 2295 .opt_opc = vecop_list, 2296 .vece = MO_8 }, 2297 { .fni8 = tcg_gen_vec_abs16_i64, 2298 .fniv = tcg_gen_abs_vec, 2299 .fno = gen_helper_gvec_abs16, 2300 .opt_opc = vecop_list, 2301 .vece = MO_16 }, 2302 { .fni4 = tcg_gen_abs_i32, 2303 .fniv = tcg_gen_abs_vec, 2304 .fno = gen_helper_gvec_abs32, 2305 .opt_opc = vecop_list, 2306 .vece = MO_32 }, 2307 { .fni8 = tcg_gen_abs_i64, 2308 .fniv = tcg_gen_abs_vec, 2309 .fno = gen_helper_gvec_abs64, 2310 .opt_opc = vecop_list, 2311 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2312 .vece = MO_64 }, 2313 }; 2314 2315 tcg_debug_assert(vece <= MO_64); 2316 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2317 } 2318 2319 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2320 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2321 { 2322 static const GVecGen3 g = { 2323 .fni8 = tcg_gen_and_i64, 2324 .fniv = tcg_gen_and_vec, 2325 .fno = gen_helper_gvec_and, 2326 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2327 }; 2328 2329 if (aofs == bofs) { 2330 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2331 } else { 2332 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2333 } 2334 } 2335 2336 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2337 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2338 { 2339 static const GVecGen3 g = { 2340 .fni8 = tcg_gen_or_i64, 2341 .fniv = tcg_gen_or_vec, 2342 .fno = gen_helper_gvec_or, 2343 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2344 }; 2345 2346 if (aofs == bofs) { 2347 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2348 } else { 2349 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2350 } 2351 } 2352 2353 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2354 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2355 { 2356 static const GVecGen3 g = { 2357 .fni8 = tcg_gen_xor_i64, 2358 .fniv = tcg_gen_xor_vec, 2359 .fno = gen_helper_gvec_xor, 2360 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2361 }; 2362 2363 if (aofs == bofs) { 2364 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2365 } else { 2366 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2367 } 2368 } 2369 2370 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2371 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2372 { 2373 static const GVecGen3 g = { 2374 .fni8 = tcg_gen_andc_i64, 2375 .fniv = tcg_gen_andc_vec, 2376 .fno = gen_helper_gvec_andc, 2377 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2378 }; 2379 2380 if (aofs == bofs) { 2381 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2382 } else { 2383 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2384 } 2385 } 2386 2387 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2388 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2389 { 2390 static const GVecGen3 g = { 2391 .fni8 = tcg_gen_orc_i64, 2392 .fniv = tcg_gen_orc_vec, 2393 .fno = gen_helper_gvec_orc, 2394 .prefer_i64 = 
TCG_TARGET_REG_BITS == 64, 2395 }; 2396 2397 if (aofs == bofs) { 2398 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2399 } else { 2400 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2401 } 2402 } 2403 2404 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2405 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2406 { 2407 static const GVecGen3 g = { 2408 .fni8 = tcg_gen_nand_i64, 2409 .fniv = tcg_gen_nand_vec, 2410 .fno = gen_helper_gvec_nand, 2411 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2412 }; 2413 2414 if (aofs == bofs) { 2415 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2416 } else { 2417 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2418 } 2419 } 2420 2421 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2422 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2423 { 2424 static const GVecGen3 g = { 2425 .fni8 = tcg_gen_nor_i64, 2426 .fniv = tcg_gen_nor_vec, 2427 .fno = gen_helper_gvec_nor, 2428 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2429 }; 2430 2431 if (aofs == bofs) { 2432 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2433 } else { 2434 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2435 } 2436 } 2437 2438 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2439 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2440 { 2441 static const GVecGen3 g = { 2442 .fni8 = tcg_gen_eqv_i64, 2443 .fniv = tcg_gen_eqv_vec, 2444 .fno = gen_helper_gvec_eqv, 2445 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2446 }; 2447 2448 if (aofs == bofs) { 2449 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2450 } else { 2451 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2452 } 2453 } 2454 2455 static const GVecGen2s gop_ands = { 2456 .fni8 = tcg_gen_and_i64, 2457 .fniv = tcg_gen_and_vec, 2458 .fno = gen_helper_gvec_ands, 2459 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2460 .vece = MO_64 2461 }; 2462 2463 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2464 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2465 { 2466 TCGv_i64 tmp = tcg_temp_new_i64(); 2467 gen_dup_i64(vece, tmp, c); 2468 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2469 tcg_temp_free_i64(tmp); 2470 } 2471 2472 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2473 int64_t c, uint32_t oprsz, uint32_t maxsz) 2474 { 2475 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2476 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2477 tcg_temp_free_i64(tmp); 2478 } 2479 2480 static const GVecGen2s gop_xors = { 2481 .fni8 = tcg_gen_xor_i64, 2482 .fniv = tcg_gen_xor_vec, 2483 .fno = gen_helper_gvec_xors, 2484 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2485 .vece = MO_64 2486 }; 2487 2488 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2489 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2490 { 2491 TCGv_i64 tmp = tcg_temp_new_i64(); 2492 gen_dup_i64(vece, tmp, c); 2493 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2494 tcg_temp_free_i64(tmp); 2495 } 2496 2497 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2498 int64_t c, uint32_t oprsz, uint32_t maxsz) 2499 { 2500 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2501 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2502 tcg_temp_free_i64(tmp); 2503 } 2504 2505 static const GVecGen2s gop_ors = { 2506 .fni8 = tcg_gen_or_i64, 2507 .fniv = tcg_gen_or_vec, 2508 .fno = gen_helper_gvec_ors, 2509 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2510 .vece = MO_64 2511 }; 2512 2513 void tcg_gen_gvec_ors(unsigned vece, uint32_t 
dofs, uint32_t aofs, 2514 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2515 { 2516 TCGv_i64 tmp = tcg_temp_new_i64(); 2517 gen_dup_i64(vece, tmp, c); 2518 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2519 tcg_temp_free_i64(tmp); 2520 } 2521 2522 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2523 int64_t c, uint32_t oprsz, uint32_t maxsz) 2524 { 2525 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2526 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2527 tcg_temp_free_i64(tmp); 2528 } 2529 2530 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2531 { 2532 uint64_t mask = dup_const(MO_8, 0xff << c); 2533 tcg_gen_shli_i64(d, a, c); 2534 tcg_gen_andi_i64(d, d, mask); 2535 } 2536 2537 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2538 { 2539 uint64_t mask = dup_const(MO_16, 0xffff << c); 2540 tcg_gen_shli_i64(d, a, c); 2541 tcg_gen_andi_i64(d, d, mask); 2542 } 2543 2544 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2545 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2546 { 2547 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2548 static const GVecGen2i g[4] = { 2549 { .fni8 = tcg_gen_vec_shl8i_i64, 2550 .fniv = tcg_gen_shli_vec, 2551 .fno = gen_helper_gvec_shl8i, 2552 .opt_opc = vecop_list, 2553 .vece = MO_8 }, 2554 { .fni8 = tcg_gen_vec_shl16i_i64, 2555 .fniv = tcg_gen_shli_vec, 2556 .fno = gen_helper_gvec_shl16i, 2557 .opt_opc = vecop_list, 2558 .vece = MO_16 }, 2559 { .fni4 = tcg_gen_shli_i32, 2560 .fniv = tcg_gen_shli_vec, 2561 .fno = gen_helper_gvec_shl32i, 2562 .opt_opc = vecop_list, 2563 .vece = MO_32 }, 2564 { .fni8 = tcg_gen_shli_i64, 2565 .fniv = tcg_gen_shli_vec, 2566 .fno = gen_helper_gvec_shl64i, 2567 .opt_opc = vecop_list, 2568 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2569 .vece = MO_64 }, 2570 }; 2571 2572 tcg_debug_assert(vece <= MO_64); 2573 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2574 if (shift == 0) { 2575 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2576 } else { 2577 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2578 } 2579 } 2580 2581 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2582 { 2583 uint64_t mask = dup_const(MO_8, 0xff >> c); 2584 tcg_gen_shri_i64(d, a, c); 2585 tcg_gen_andi_i64(d, d, mask); 2586 } 2587 2588 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2589 { 2590 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2591 tcg_gen_shri_i64(d, a, c); 2592 tcg_gen_andi_i64(d, d, mask); 2593 } 2594 2595 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2596 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2597 { 2598 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2599 static const GVecGen2i g[4] = { 2600 { .fni8 = tcg_gen_vec_shr8i_i64, 2601 .fniv = tcg_gen_shri_vec, 2602 .fno = gen_helper_gvec_shr8i, 2603 .opt_opc = vecop_list, 2604 .vece = MO_8 }, 2605 { .fni8 = tcg_gen_vec_shr16i_i64, 2606 .fniv = tcg_gen_shri_vec, 2607 .fno = gen_helper_gvec_shr16i, 2608 .opt_opc = vecop_list, 2609 .vece = MO_16 }, 2610 { .fni4 = tcg_gen_shri_i32, 2611 .fniv = tcg_gen_shri_vec, 2612 .fno = gen_helper_gvec_shr32i, 2613 .opt_opc = vecop_list, 2614 .vece = MO_32 }, 2615 { .fni8 = tcg_gen_shri_i64, 2616 .fniv = tcg_gen_shri_vec, 2617 .fno = gen_helper_gvec_shr64i, 2618 .opt_opc = vecop_list, 2619 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2620 .vece = MO_64 }, 2621 }; 2622 2623 tcg_debug_assert(vece <= MO_64); 2624 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2625 if (shift == 
0) { 2626 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2627 } else { 2628 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2629 } 2630 } 2631 2632 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2633 { 2634 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2635 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2636 TCGv_i64 s = tcg_temp_new_i64(); 2637 2638 tcg_gen_shri_i64(d, a, c); 2639 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2640 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2641 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2642 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2643 tcg_temp_free_i64(s); 2644 } 2645 2646 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2647 { 2648 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2649 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2650 TCGv_i64 s = tcg_temp_new_i64(); 2651 2652 tcg_gen_shri_i64(d, a, c); 2653 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2654 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2655 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2656 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2657 tcg_temp_free_i64(s); 2658 } 2659 2660 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2661 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2662 { 2663 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2664 static const GVecGen2i g[4] = { 2665 { .fni8 = tcg_gen_vec_sar8i_i64, 2666 .fniv = tcg_gen_sari_vec, 2667 .fno = gen_helper_gvec_sar8i, 2668 .opt_opc = vecop_list, 2669 .vece = MO_8 }, 2670 { .fni8 = tcg_gen_vec_sar16i_i64, 2671 .fniv = tcg_gen_sari_vec, 2672 .fno = gen_helper_gvec_sar16i, 2673 .opt_opc = vecop_list, 2674 .vece = MO_16 }, 2675 { .fni4 = tcg_gen_sari_i32, 2676 .fniv = tcg_gen_sari_vec, 2677 .fno = gen_helper_gvec_sar32i, 2678 .opt_opc = vecop_list, 2679 .vece = MO_32 }, 2680 { .fni8 = tcg_gen_sari_i64, 2681 .fniv = tcg_gen_sari_vec, 2682 .fno = gen_helper_gvec_sar64i, 2683 .opt_opc = vecop_list, 2684 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2685 .vece = MO_64 }, 2686 }; 2687 2688 tcg_debug_assert(vece <= MO_64); 2689 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2690 if (shift == 0) { 2691 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2692 } else { 2693 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2694 } 2695 } 2696 2697 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2698 { 2699 uint64_t mask = dup_const(MO_8, 0xff << c); 2700 2701 tcg_gen_shli_i64(d, a, c); 2702 tcg_gen_shri_i64(a, a, 8 - c); 2703 tcg_gen_andi_i64(d, d, mask); 2704 tcg_gen_andi_i64(a, a, ~mask); 2705 tcg_gen_or_i64(d, d, a); 2706 } 2707 2708 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2709 { 2710 uint64_t mask = dup_const(MO_16, 0xffff << c); 2711 2712 tcg_gen_shli_i64(d, a, c); 2713 tcg_gen_shri_i64(a, a, 16 - c); 2714 tcg_gen_andi_i64(d, d, mask); 2715 tcg_gen_andi_i64(a, a, ~mask); 2716 tcg_gen_or_i64(d, d, a); 2717 } 2718 2719 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 2720 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2721 { 2722 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 2723 static const GVecGen2i g[4] = { 2724 { .fni8 = tcg_gen_vec_rotl8i_i64, 2725 .fniv = tcg_gen_rotli_vec, 2726 .fno = gen_helper_gvec_rotl8i, 2727 .opt_opc = vecop_list, 2728 .vece = MO_8 }, 2729 { .fni8 = tcg_gen_vec_rotl16i_i64, 2730 .fniv = 
tcg_gen_rotli_vec, 2731 .fno = gen_helper_gvec_rotl16i, 2732 .opt_opc = vecop_list, 2733 .vece = MO_16 }, 2734 { .fni4 = tcg_gen_rotli_i32, 2735 .fniv = tcg_gen_rotli_vec, 2736 .fno = gen_helper_gvec_rotl32i, 2737 .opt_opc = vecop_list, 2738 .vece = MO_32 }, 2739 { .fni8 = tcg_gen_rotli_i64, 2740 .fniv = tcg_gen_rotli_vec, 2741 .fno = gen_helper_gvec_rotl64i, 2742 .opt_opc = vecop_list, 2743 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2744 .vece = MO_64 }, 2745 }; 2746 2747 tcg_debug_assert(vece <= MO_64); 2748 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2749 if (shift == 0) { 2750 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2751 } else { 2752 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2753 } 2754 } 2755 2756 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 2757 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2758 { 2759 tcg_debug_assert(vece <= MO_64); 2760 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2761 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 2762 oprsz, maxsz); 2763 } 2764 2765 /* 2766 * Specialized generation vector shifts by a non-constant scalar. 2767 */ 2768 2769 typedef struct { 2770 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2771 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2772 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2773 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2774 gen_helper_gvec_2 *fno[4]; 2775 TCGOpcode s_list[2]; 2776 TCGOpcode v_list[2]; 2777 } GVecGen2sh; 2778 2779 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2780 uint32_t oprsz, uint32_t tysz, TCGType type, 2781 TCGv_i32 shift, 2782 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2783 { 2784 TCGv_vec t0 = tcg_temp_new_vec(type); 2785 uint32_t i; 2786 2787 for (i = 0; i < oprsz; i += tysz) { 2788 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2789 fni(vece, t0, t0, shift); 2790 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2791 } 2792 tcg_temp_free_vec(t0); 2793 } 2794 2795 static void 2796 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2797 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2798 { 2799 TCGType type; 2800 uint32_t some; 2801 2802 check_size_align(oprsz, maxsz, dofs | aofs); 2803 check_overlap_2(dofs, aofs, maxsz); 2804 2805 /* If the backend has a scalar expansion, great. */ 2806 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2807 if (type) { 2808 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2809 switch (type) { 2810 case TCG_TYPE_V256: 2811 some = QEMU_ALIGN_DOWN(oprsz, 32); 2812 expand_2sh_vec(vece, dofs, aofs, some, 32, 2813 TCG_TYPE_V256, shift, g->fniv_s); 2814 if (some == oprsz) { 2815 break; 2816 } 2817 dofs += some; 2818 aofs += some; 2819 oprsz -= some; 2820 maxsz -= some; 2821 /* fallthru */ 2822 case TCG_TYPE_V128: 2823 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2824 TCG_TYPE_V128, shift, g->fniv_s); 2825 break; 2826 case TCG_TYPE_V64: 2827 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2828 TCG_TYPE_V64, shift, g->fniv_s); 2829 break; 2830 default: 2831 g_assert_not_reached(); 2832 } 2833 tcg_swap_vecop_list(hold_list); 2834 goto clear_tail; 2835 } 2836 2837 /* If the backend supports variable vector shifts, also cool. 
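   In that case the scalar shift count is first broadcast into a vector
   (zero-extended to 64 bits for MO_64 elements), after which the
   expansion proceeds exactly as for a two-operand operation with a
   vector scalar.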
*/ 2838 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2839 if (type) { 2840 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2841 TCGv_vec v_shift = tcg_temp_new_vec(type); 2842 2843 if (vece == MO_64) { 2844 TCGv_i64 sh64 = tcg_temp_new_i64(); 2845 tcg_gen_extu_i32_i64(sh64, shift); 2846 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2847 tcg_temp_free_i64(sh64); 2848 } else { 2849 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2850 } 2851 2852 switch (type) { 2853 case TCG_TYPE_V256: 2854 some = QEMU_ALIGN_DOWN(oprsz, 32); 2855 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2856 v_shift, false, g->fniv_v); 2857 if (some == oprsz) { 2858 break; 2859 } 2860 dofs += some; 2861 aofs += some; 2862 oprsz -= some; 2863 maxsz -= some; 2864 /* fallthru */ 2865 case TCG_TYPE_V128: 2866 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2867 v_shift, false, g->fniv_v); 2868 break; 2869 case TCG_TYPE_V64: 2870 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2871 v_shift, false, g->fniv_v); 2872 break; 2873 default: 2874 g_assert_not_reached(); 2875 } 2876 tcg_temp_free_vec(v_shift); 2877 tcg_swap_vecop_list(hold_list); 2878 goto clear_tail; 2879 } 2880 2881 /* Otherwise fall back to integral... */ 2882 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2883 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2884 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2885 TCGv_i64 sh64 = tcg_temp_new_i64(); 2886 tcg_gen_extu_i32_i64(sh64, shift); 2887 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2888 tcg_temp_free_i64(sh64); 2889 } else { 2890 TCGv_ptr a0 = tcg_temp_new_ptr(); 2891 TCGv_ptr a1 = tcg_temp_new_ptr(); 2892 TCGv_i32 desc = tcg_temp_new_i32(); 2893 2894 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2895 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2896 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2897 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2898 2899 g->fno[vece](a0, a1, desc); 2900 2901 tcg_temp_free_ptr(a0); 2902 tcg_temp_free_ptr(a1); 2903 tcg_temp_free_i32(desc); 2904 return; 2905 } 2906 2907 clear_tail: 2908 if (oprsz < maxsz) { 2909 expand_clr(dofs + oprsz, maxsz - oprsz); 2910 } 2911 } 2912 2913 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 2914 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2915 { 2916 static const GVecGen2sh g = { 2917 .fni4 = tcg_gen_shl_i32, 2918 .fni8 = tcg_gen_shl_i64, 2919 .fniv_s = tcg_gen_shls_vec, 2920 .fniv_v = tcg_gen_shlv_vec, 2921 .fno = { 2922 gen_helper_gvec_shl8i, 2923 gen_helper_gvec_shl16i, 2924 gen_helper_gvec_shl32i, 2925 gen_helper_gvec_shl64i, 2926 }, 2927 .s_list = { INDEX_op_shls_vec, 0 }, 2928 .v_list = { INDEX_op_shlv_vec, 0 }, 2929 }; 2930 2931 tcg_debug_assert(vece <= MO_64); 2932 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2933 } 2934 2935 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 2936 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2937 { 2938 static const GVecGen2sh g = { 2939 .fni4 = tcg_gen_shr_i32, 2940 .fni8 = tcg_gen_shr_i64, 2941 .fniv_s = tcg_gen_shrs_vec, 2942 .fniv_v = tcg_gen_shrv_vec, 2943 .fno = { 2944 gen_helper_gvec_shr8i, 2945 gen_helper_gvec_shr16i, 2946 gen_helper_gvec_shr32i, 2947 gen_helper_gvec_shr64i, 2948 }, 2949 .s_list = { INDEX_op_shrs_vec, 0 }, 2950 .v_list = { INDEX_op_shrv_vec, 0 }, 2951 }; 2952 2953 tcg_debug_assert(vece <= MO_64); 2954 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2955 } 2956 2957 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_rotl_i32,
        .fni8 = tcg_gen_rotl_i64,
        .fniv_s = tcg_gen_rotls_vec,
        .fniv_v = tcg_gen_rotlv_vec,
        .fno = {
            gen_helper_gvec_rotl8i,
            gen_helper_gvec_rotl16i,
            gen_helper_gvec_rotl32i,
            gen_helper_gvec_rotl64i,
        },
        .s_list = { INDEX_op_rotls_vec, 0 },
        .v_list = { INDEX_op_rotlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, for vector shifts
 * the modulo is applied as part of this generic expansion.  If the
 * target naturally includes the modulo as part of the operation,
 * great!  If the target has some other behaviour for out-of-range
 * shifts, then it could not use this function anyway, and would need
 * to do its own expansion with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
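 * Only the low log2(element bits) bits of each B element participate,
 * and the vacated high bits of each result element are filled with zeros.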
3072 */ 3073 3074 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3075 TCGv_vec a, TCGv_vec b) 3076 { 3077 TCGv_vec t = tcg_temp_new_vec_matching(d); 3078 3079 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3080 tcg_gen_and_vec(vece, t, t, b); 3081 tcg_gen_shrv_vec(vece, d, a, t); 3082 tcg_temp_free_vec(t); 3083 } 3084 3085 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3086 { 3087 TCGv_i32 t = tcg_temp_new_i32(); 3088 3089 tcg_gen_andi_i32(t, b, 31); 3090 tcg_gen_shr_i32(d, a, t); 3091 tcg_temp_free_i32(t); 3092 } 3093 3094 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3095 { 3096 TCGv_i64 t = tcg_temp_new_i64(); 3097 3098 tcg_gen_andi_i64(t, b, 63); 3099 tcg_gen_shr_i64(d, a, t); 3100 tcg_temp_free_i64(t); 3101 } 3102 3103 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3104 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3105 { 3106 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3107 static const GVecGen3 g[4] = { 3108 { .fniv = tcg_gen_shrv_mod_vec, 3109 .fno = gen_helper_gvec_shr8v, 3110 .opt_opc = vecop_list, 3111 .vece = MO_8 }, 3112 { .fniv = tcg_gen_shrv_mod_vec, 3113 .fno = gen_helper_gvec_shr16v, 3114 .opt_opc = vecop_list, 3115 .vece = MO_16 }, 3116 { .fni4 = tcg_gen_shr_mod_i32, 3117 .fniv = tcg_gen_shrv_mod_vec, 3118 .fno = gen_helper_gvec_shr32v, 3119 .opt_opc = vecop_list, 3120 .vece = MO_32 }, 3121 { .fni8 = tcg_gen_shr_mod_i64, 3122 .fniv = tcg_gen_shrv_mod_vec, 3123 .fno = gen_helper_gvec_shr64v, 3124 .opt_opc = vecop_list, 3125 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3126 .vece = MO_64 }, 3127 }; 3128 3129 tcg_debug_assert(vece <= MO_64); 3130 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3131 } 3132 3133 /* 3134 * Similarly for arithmetic right shifts. 
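 * Here the vacated bits of each result element are instead filled with
 * copies of that element's sign bit.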
3135 */ 3136 3137 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3138 TCGv_vec a, TCGv_vec b) 3139 { 3140 TCGv_vec t = tcg_temp_new_vec_matching(d); 3141 3142 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3143 tcg_gen_and_vec(vece, t, t, b); 3144 tcg_gen_sarv_vec(vece, d, a, t); 3145 tcg_temp_free_vec(t); 3146 } 3147 3148 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3149 { 3150 TCGv_i32 t = tcg_temp_new_i32(); 3151 3152 tcg_gen_andi_i32(t, b, 31); 3153 tcg_gen_sar_i32(d, a, t); 3154 tcg_temp_free_i32(t); 3155 } 3156 3157 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3158 { 3159 TCGv_i64 t = tcg_temp_new_i64(); 3160 3161 tcg_gen_andi_i64(t, b, 63); 3162 tcg_gen_sar_i64(d, a, t); 3163 tcg_temp_free_i64(t); 3164 } 3165 3166 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3167 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3168 { 3169 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3170 static const GVecGen3 g[4] = { 3171 { .fniv = tcg_gen_sarv_mod_vec, 3172 .fno = gen_helper_gvec_sar8v, 3173 .opt_opc = vecop_list, 3174 .vece = MO_8 }, 3175 { .fniv = tcg_gen_sarv_mod_vec, 3176 .fno = gen_helper_gvec_sar16v, 3177 .opt_opc = vecop_list, 3178 .vece = MO_16 }, 3179 { .fni4 = tcg_gen_sar_mod_i32, 3180 .fniv = tcg_gen_sarv_mod_vec, 3181 .fno = gen_helper_gvec_sar32v, 3182 .opt_opc = vecop_list, 3183 .vece = MO_32 }, 3184 { .fni8 = tcg_gen_sar_mod_i64, 3185 .fniv = tcg_gen_sarv_mod_vec, 3186 .fno = gen_helper_gvec_sar64v, 3187 .opt_opc = vecop_list, 3188 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3189 .vece = MO_64 }, 3190 }; 3191 3192 tcg_debug_assert(vece <= MO_64); 3193 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3194 } 3195 3196 /* 3197 * Similarly for rotates. 
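 * Reducing B modulo the element width loses nothing here, since rotating
 * an N-bit element by N is the identity.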
3198 */ 3199 3200 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3201 TCGv_vec a, TCGv_vec b) 3202 { 3203 TCGv_vec t = tcg_temp_new_vec_matching(d); 3204 3205 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3206 tcg_gen_and_vec(vece, t, t, b); 3207 tcg_gen_rotlv_vec(vece, d, a, t); 3208 tcg_temp_free_vec(t); 3209 } 3210 3211 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3212 { 3213 TCGv_i32 t = tcg_temp_new_i32(); 3214 3215 tcg_gen_andi_i32(t, b, 31); 3216 tcg_gen_rotl_i32(d, a, t); 3217 tcg_temp_free_i32(t); 3218 } 3219 3220 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3221 { 3222 TCGv_i64 t = tcg_temp_new_i64(); 3223 3224 tcg_gen_andi_i64(t, b, 63); 3225 tcg_gen_rotl_i64(d, a, t); 3226 tcg_temp_free_i64(t); 3227 } 3228 3229 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3230 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3231 { 3232 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3233 static const GVecGen3 g[4] = { 3234 { .fniv = tcg_gen_rotlv_mod_vec, 3235 .fno = gen_helper_gvec_rotl8v, 3236 .opt_opc = vecop_list, 3237 .vece = MO_8 }, 3238 { .fniv = tcg_gen_rotlv_mod_vec, 3239 .fno = gen_helper_gvec_rotl16v, 3240 .opt_opc = vecop_list, 3241 .vece = MO_16 }, 3242 { .fni4 = tcg_gen_rotl_mod_i32, 3243 .fniv = tcg_gen_rotlv_mod_vec, 3244 .fno = gen_helper_gvec_rotl32v, 3245 .opt_opc = vecop_list, 3246 .vece = MO_32 }, 3247 { .fni8 = tcg_gen_rotl_mod_i64, 3248 .fniv = tcg_gen_rotlv_mod_vec, 3249 .fno = gen_helper_gvec_rotl64v, 3250 .opt_opc = vecop_list, 3251 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3252 .vece = MO_64 }, 3253 }; 3254 3255 tcg_debug_assert(vece <= MO_64); 3256 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3257 } 3258 3259 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3260 TCGv_vec a, TCGv_vec b) 3261 { 3262 TCGv_vec t = tcg_temp_new_vec_matching(d); 3263 3264 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3265 tcg_gen_and_vec(vece, t, t, b); 3266 tcg_gen_rotrv_vec(vece, d, a, t); 3267 tcg_temp_free_vec(t); 3268 } 3269 3270 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3271 { 3272 TCGv_i32 t = tcg_temp_new_i32(); 3273 3274 tcg_gen_andi_i32(t, b, 31); 3275 tcg_gen_rotr_i32(d, a, t); 3276 tcg_temp_free_i32(t); 3277 } 3278 3279 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3280 { 3281 TCGv_i64 t = tcg_temp_new_i64(); 3282 3283 tcg_gen_andi_i64(t, b, 63); 3284 tcg_gen_rotr_i64(d, a, t); 3285 tcg_temp_free_i64(t); 3286 } 3287 3288 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3289 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3290 { 3291 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3292 static const GVecGen3 g[4] = { 3293 { .fniv = tcg_gen_rotrv_mod_vec, 3294 .fno = gen_helper_gvec_rotr8v, 3295 .opt_opc = vecop_list, 3296 .vece = MO_8 }, 3297 { .fniv = tcg_gen_rotrv_mod_vec, 3298 .fno = gen_helper_gvec_rotr16v, 3299 .opt_opc = vecop_list, 3300 .vece = MO_16 }, 3301 { .fni4 = tcg_gen_rotr_mod_i32, 3302 .fniv = tcg_gen_rotrv_mod_vec, 3303 .fno = gen_helper_gvec_rotr32v, 3304 .opt_opc = vecop_list, 3305 .vece = MO_32 }, 3306 { .fni8 = tcg_gen_rotr_mod_i64, 3307 .fniv = tcg_gen_rotrv_mod_vec, 3308 .fno = gen_helper_gvec_rotr64v, 3309 .opt_opc = vecop_list, 3310 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3311 .vece = MO_64 }, 3312 }; 3313 3314 tcg_debug_assert(vece <= MO_64); 3315 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3316 } 3317 3318 /* Expand OPSZ bytes worth 
of three-operand operations using i32 elements. */ 3319 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3320 uint32_t oprsz, TCGCond cond) 3321 { 3322 TCGv_i32 t0 = tcg_temp_new_i32(); 3323 TCGv_i32 t1 = tcg_temp_new_i32(); 3324 uint32_t i; 3325 3326 for (i = 0; i < oprsz; i += 4) { 3327 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3328 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3329 tcg_gen_setcond_i32(cond, t0, t0, t1); 3330 tcg_gen_neg_i32(t0, t0); 3331 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3332 } 3333 tcg_temp_free_i32(t1); 3334 tcg_temp_free_i32(t0); 3335 } 3336 3337 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3338 uint32_t oprsz, TCGCond cond) 3339 { 3340 TCGv_i64 t0 = tcg_temp_new_i64(); 3341 TCGv_i64 t1 = tcg_temp_new_i64(); 3342 uint32_t i; 3343 3344 for (i = 0; i < oprsz; i += 8) { 3345 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3346 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3347 tcg_gen_setcond_i64(cond, t0, t0, t1); 3348 tcg_gen_neg_i64(t0, t0); 3349 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3350 } 3351 tcg_temp_free_i64(t1); 3352 tcg_temp_free_i64(t0); 3353 } 3354 3355 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3356 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3357 TCGType type, TCGCond cond) 3358 { 3359 TCGv_vec t0 = tcg_temp_new_vec(type); 3360 TCGv_vec t1 = tcg_temp_new_vec(type); 3361 uint32_t i; 3362 3363 for (i = 0; i < oprsz; i += tysz) { 3364 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3365 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3366 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3367 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3368 } 3369 tcg_temp_free_vec(t1); 3370 tcg_temp_free_vec(t0); 3371 } 3372 3373 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3374 uint32_t aofs, uint32_t bofs, 3375 uint32_t oprsz, uint32_t maxsz) 3376 { 3377 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3378 static gen_helper_gvec_3 * const eq_fn[4] = { 3379 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3380 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3381 }; 3382 static gen_helper_gvec_3 * const ne_fn[4] = { 3383 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3384 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3385 }; 3386 static gen_helper_gvec_3 * const lt_fn[4] = { 3387 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3388 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3389 }; 3390 static gen_helper_gvec_3 * const le_fn[4] = { 3391 gen_helper_gvec_le8, gen_helper_gvec_le16, 3392 gen_helper_gvec_le32, gen_helper_gvec_le64 3393 }; 3394 static gen_helper_gvec_3 * const ltu_fn[4] = { 3395 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3396 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3397 }; 3398 static gen_helper_gvec_3 * const leu_fn[4] = { 3399 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3400 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3401 }; 3402 static gen_helper_gvec_3 * const * const fns[16] = { 3403 [TCG_COND_EQ] = eq_fn, 3404 [TCG_COND_NE] = ne_fn, 3405 [TCG_COND_LT] = lt_fn, 3406 [TCG_COND_LE] = le_fn, 3407 [TCG_COND_LTU] = ltu_fn, 3408 [TCG_COND_LEU] = leu_fn, 3409 }; 3410 3411 const TCGOpcode *hold_list; 3412 TCGType type; 3413 uint32_t some; 3414 3415 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3416 check_overlap_3(dofs, aofs, bofs, maxsz); 3417 3418 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3419 do_dup(MO_8, dofs, oprsz, maxsz, 3420 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3421 return; 3422 } 3423 3424 /* 3425 * Implement inline with a vector type, if possible. 
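 * (In every case each element of the result becomes all ones when the
 * comparison is true and all zeros when it is false.)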
3426 * Prefer integer when 64-bit host and 64-bit comparison. 3427 */ 3428 hold_list = tcg_swap_vecop_list(cmp_list); 3429 type = choose_vector_type(cmp_list, vece, oprsz, 3430 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3431 switch (type) { 3432 case TCG_TYPE_V256: 3433 /* Recall that ARM SVE allows vector sizes that are not a 3434 * power of 2, but always a multiple of 16. The intent is 3435 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3436 */ 3437 some = QEMU_ALIGN_DOWN(oprsz, 32); 3438 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3439 if (some == oprsz) { 3440 break; 3441 } 3442 dofs += some; 3443 aofs += some; 3444 bofs += some; 3445 oprsz -= some; 3446 maxsz -= some; 3447 /* fallthru */ 3448 case TCG_TYPE_V128: 3449 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3450 break; 3451 case TCG_TYPE_V64: 3452 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3453 break; 3454 3455 case 0: 3456 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3457 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3458 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3459 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3460 } else { 3461 gen_helper_gvec_3 * const *fn = fns[cond]; 3462 3463 if (fn == NULL) { 3464 uint32_t tmp; 3465 tmp = aofs, aofs = bofs, bofs = tmp; 3466 cond = tcg_swap_cond(cond); 3467 fn = fns[cond]; 3468 assert(fn != NULL); 3469 } 3470 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3471 oprsz = maxsz; 3472 } 3473 break; 3474 3475 default: 3476 g_assert_not_reached(); 3477 } 3478 tcg_swap_vecop_list(hold_list); 3479 3480 if (oprsz < maxsz) { 3481 expand_clr(dofs + oprsz, maxsz - oprsz); 3482 } 3483 } 3484 3485 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3486 { 3487 TCGv_i64 t = tcg_temp_new_i64(); 3488 3489 tcg_gen_and_i64(t, b, a); 3490 tcg_gen_andc_i64(d, c, a); 3491 tcg_gen_or_i64(d, d, t); 3492 tcg_temp_free_i64(t); 3493 } 3494 3495 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3496 uint32_t bofs, uint32_t cofs, 3497 uint32_t oprsz, uint32_t maxsz) 3498 { 3499 static const GVecGen4 g = { 3500 .fni8 = tcg_gen_bitsel_i64, 3501 .fniv = tcg_gen_bitsel_vec, 3502 .fno = gen_helper_gvec_bitsel, 3503 }; 3504 3505 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3506 } 3507
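/*
 * Usage sketch (illustrative only; the CPU state layout below is made up):
 * a front end whose state contains an array of 128-bit vector registers,
 * say "uint64_t vreg[32][2]" inside a hypothetical CPUFooState, could
 * expand vreg[2] = umin32(vreg[0], vreg[1]) with
 *
 *     uint32_t dofs = offsetof(CPUFooState, vreg[2]);
 *     uint32_t aofs = offsetof(CPUFooState, vreg[0]);
 *     uint32_t bofs = offsetof(CPUFooState, vreg[1]);
 *     tcg_gen_gvec_umin(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * where oprsz == maxsz == 16 bytes and the offsets satisfy the alignment
 * asserted by check_size_align.  The expanders above then choose a vector,
 * host-integer, or out-of-line helper implementation automatically.
 */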