/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
#include "tcg/tcg-gvec-desc.h"

#define MAX_UNROLL 4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands. */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands. */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands. */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components. */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
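/*
 * Editor's note: an illustrative sketch, not one of the real helpers, of
 * how an out-of-line helper consumes the descriptor built by simd_desc()
 * above.  simd_oprsz(), simd_maxsz() and simd_data() are the accessors
 * from tcg-gvec-desc.h; the helper name and its operation are made up.
 *
 *     void HELPER(gvec_example)(void *d, void *a, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);  -- bytes to operate on
 *         intptr_t maxsz = simd_maxsz(desc);  -- bytes to clear through
 *         int32_t data = simd_data(desc);     -- operation-specific field
 *         intptr_t i;
 *
 *         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = *(uint64_t *)(a + i) + data;
 *         }
 *         for (; i < maxsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = 0;       -- zero the tail
 *         }
 *     }
 */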
/* Generate a call to a gvec-style helper with two vector operands. */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand. */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands. */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands. */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
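/*
 * Editor's note: a hedged usage sketch.  A target front end invokes these
 * expanders with byte offsets of its vector registers within CPUArchState;
 * the offset field and gvec helper names below are hypothetical.
 *
 *     uint32_t dofs = offsetof(CPUArchState, vreg[rd]);
 *     uint32_t aofs = offsetof(CPUArchState, vreg[rn]);
 *     uint32_t bofs = offsetof(CPUArchState, vreg[rm]);
 *     tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0,
 *                        gen_helper_example_op);
 */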
/* Generate a call to a gvec-style helper with five vector operands. */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
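/*
 * Editor's note: a hedged example of the _ptr variants.  The extra pointer
 * is commonly a TCGv_ptr into CPUArchState, e.g. the float_status used by
 * a target's FP helpers; the field and helper names here are hypothetical.
 *
 *     TCGv_ptr fpst = tcg_temp_new_ptr();
 *     tcg_gen_addi_ptr(fpst, cpu_env, offsetof(CPUArchState, fp_status));
 *     tcg_gen_gvec_3_ptr(dofs, aofs, bofs, fpst, oprsz, maxsz, 0,
 *                        gen_helper_example_fadd);
 *     tcg_temp_free_ptr(fpst);
 */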
/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t q, r;

    if (oprsz < lnsz) {
        return false;
    }

    q = oprsz / lnsz;
    r = oprsz % lnsz;
    tcg_debug_assert((r & 7) == 0);

    if (lnsz < 16) {
        /* For sizes below 16, accept no remainder. */
        if (r != 0) {
            return false;
        }
    } else {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * In addition, expand_clr needs to handle a multiple of 8.
         * Thus we can handle the tail with one more operation per
         * diminishing power of 2.
         */
        q += ctpop32(r);
    }

    return q <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16.  The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    tcg_debug_assert(oprsz >= 8);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
     * are misaligned wrt the maximum vector size, so do that first.
     */
    if (dofs & 8) {
        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        i += 8;
    }

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz. */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}
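/*
 * Editor's note: worked examples of the replication performed by
 * dup_const() above, to make the pattern concrete (values arbitrary):
 *
 *     dup_const(MO_8,  0xab)       == 0xababababababababull
 *     dup_const(MO_16, 0x1234)     == 0x1234123412341234ull
 *     dup_const(MO_32, 0xdeadbeef) == 0xdeadbeefdeadbeefull
 *     dup_const(MO_64, x)          == x
 */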
/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}
/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}
/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
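/*
 * Editor's note: a worked example of the expansion above.  With oprsz == 80
 * and TCG_TYPE_V256 selected, some = QEMU_ALIGN_DOWN(80, 32) == 64, so
 * expand_2_vec emits two 32-byte operations and the remaining 16 bytes fall
 * through to the TCG_TYPE_V128 case.  When no vector type is usable
 * (case 0), the order of preference is the 64-bit integer expansion (fni8),
 * then 32-bit (fni4), then the out-of-line helper (fno); in the fno case
 * oprsz is set to maxsz so the final expand_clr is skipped, because the
 * helper clears the tail itself based on the descriptor.
 */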
/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else if (vece == 4) {
        /* 128-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else if (vece == 5) {
        /* 256-bit duplicate.  */
        int i;

        tcg_debug_assert(oprsz >= 32);
        tcg_debug_assert(oprsz % 32 == 0);
        if (TCG_TARGET_HAS_v256) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else if (TCG_TARGET_HAS_v128) {
            TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
            TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in0, cpu_env, aofs);
            tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                tcg_gen_st_vec(in0, cpu_env, dofs + i);
                tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
            }
            tcg_temp_free_vec(in0);
            tcg_temp_free_vec(in1);
        } else {
            TCGv_i64 in[4];
            int j;

            for (j = 0; j < 4; ++j) {
                in[j] = tcg_temp_new_i64();
                tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
            }
            for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
                for (j = 0; j < 4; ++j) {
                    tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
                }
            }
            for (j = 0; j < 4; ++j) {
                tcg_temp_free_i64(in[j]);
            }
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    } else {
        g_assert_not_reached();
    }
}

void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}
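/*
 * Editor's note: a worked example of the masked-add trick used below.
 * With m the per-lane sign-bit mask, the code below computes
 * d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m).  Clearing each lane's top bit
 * before the full-width add keeps any carry from crossing a lane boundary;
 * the xor then rebuilds the true top bit of each lane (a ^ b ^ carry-in).
 * E.g. for 8-bit lanes with a = 0xff, b = 0x01: (0x7f + 0x01) = 0x80,
 * (a ^ b) & 0x80 = 0x80, so d = 0x00 and no carry leaks into the next lane.
 */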
/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz) 1910 { 1911 static const GVecGen3 g[4] = { 1912 { .fniv = tcg_gen_mul_vec, 1913 .fno = gen_helper_gvec_mul8, 1914 .opt_opc = vecop_list_mul, 1915 .vece = MO_8 }, 1916 { .fniv = tcg_gen_mul_vec, 1917 .fno = gen_helper_gvec_mul16, 1918 .opt_opc = vecop_list_mul, 1919 .vece = MO_16 }, 1920 { .fni4 = tcg_gen_mul_i32, 1921 .fniv = tcg_gen_mul_vec, 1922 .fno = gen_helper_gvec_mul32, 1923 .opt_opc = vecop_list_mul, 1924 .vece = MO_32 }, 1925 { .fni8 = tcg_gen_mul_i64, 1926 .fniv = tcg_gen_mul_vec, 1927 .fno = gen_helper_gvec_mul64, 1928 .opt_opc = vecop_list_mul, 1929 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1930 .vece = MO_64 }, 1931 }; 1932 1933 tcg_debug_assert(vece <= MO_64); 1934 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1935 } 1936 1937 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1938 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1939 { 1940 static const GVecGen2s g[4] = { 1941 { .fniv = tcg_gen_mul_vec, 1942 .fno = gen_helper_gvec_muls8, 1943 .opt_opc = vecop_list_mul, 1944 .vece = MO_8 }, 1945 { .fniv = tcg_gen_mul_vec, 1946 .fno = gen_helper_gvec_muls16, 1947 .opt_opc = vecop_list_mul, 1948 .vece = MO_16 }, 1949 { .fni4 = tcg_gen_mul_i32, 1950 .fniv = tcg_gen_mul_vec, 1951 .fno = gen_helper_gvec_muls32, 1952 .opt_opc = vecop_list_mul, 1953 .vece = MO_32 }, 1954 { .fni8 = tcg_gen_mul_i64, 1955 .fniv = tcg_gen_mul_vec, 1956 .fno = gen_helper_gvec_muls64, 1957 .opt_opc = vecop_list_mul, 1958 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1959 .vece = MO_64 }, 1960 }; 1961 1962 tcg_debug_assert(vece <= MO_64); 1963 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1964 } 1965 1966 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1967 int64_t c, uint32_t oprsz, uint32_t maxsz) 1968 { 1969 TCGv_i64 tmp = tcg_const_i64(c); 1970 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1971 tcg_temp_free_i64(tmp); 1972 } 1973 1974 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1975 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1976 { 1977 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 1978 static const GVecGen3 g[4] = { 1979 { .fniv = tcg_gen_ssadd_vec, 1980 .fno = gen_helper_gvec_ssadd8, 1981 .opt_opc = vecop_list, 1982 .vece = MO_8 }, 1983 { .fniv = tcg_gen_ssadd_vec, 1984 .fno = gen_helper_gvec_ssadd16, 1985 .opt_opc = vecop_list, 1986 .vece = MO_16 }, 1987 { .fniv = tcg_gen_ssadd_vec, 1988 .fno = gen_helper_gvec_ssadd32, 1989 .opt_opc = vecop_list, 1990 .vece = MO_32 }, 1991 { .fniv = tcg_gen_ssadd_vec, 1992 .fno = gen_helper_gvec_ssadd64, 1993 .opt_opc = vecop_list, 1994 .vece = MO_64 }, 1995 }; 1996 tcg_debug_assert(vece <= MO_64); 1997 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1998 } 1999 2000 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 2001 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2002 { 2003 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 2004 static const GVecGen3 g[4] = { 2005 { .fniv = tcg_gen_sssub_vec, 2006 .fno = gen_helper_gvec_sssub8, 2007 .opt_opc = vecop_list, 2008 .vece = MO_8 }, 2009 { .fniv = tcg_gen_sssub_vec, 2010 .fno = gen_helper_gvec_sssub16, 2011 .opt_opc = vecop_list, 2012 .vece = MO_16 }, 2013 { .fniv = tcg_gen_sssub_vec, 2014 .fno = gen_helper_gvec_sssub32, 2015 .opt_opc = vecop_list, 2016 .vece = MO_32 }, 2017 { .fniv = tcg_gen_sssub_vec, 2018 .fno = gen_helper_gvec_sssub64, 2019 .opt_opc = vecop_list, 2020 .vece = MO_64 }, 2021 }; 2022 tcg_debug_assert(vece <= MO_64); 2023 
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2024 } 2025 2026 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2027 { 2028 TCGv_i32 max = tcg_const_i32(-1); 2029 tcg_gen_add_i32(d, a, b); 2030 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 2031 tcg_temp_free_i32(max); 2032 } 2033 2034 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2035 { 2036 TCGv_i64 max = tcg_const_i64(-1); 2037 tcg_gen_add_i64(d, a, b); 2038 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 2039 tcg_temp_free_i64(max); 2040 } 2041 2042 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 2043 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2044 { 2045 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 2046 static const GVecGen3 g[4] = { 2047 { .fniv = tcg_gen_usadd_vec, 2048 .fno = gen_helper_gvec_usadd8, 2049 .opt_opc = vecop_list, 2050 .vece = MO_8 }, 2051 { .fniv = tcg_gen_usadd_vec, 2052 .fno = gen_helper_gvec_usadd16, 2053 .opt_opc = vecop_list, 2054 .vece = MO_16 }, 2055 { .fni4 = tcg_gen_usadd_i32, 2056 .fniv = tcg_gen_usadd_vec, 2057 .fno = gen_helper_gvec_usadd32, 2058 .opt_opc = vecop_list, 2059 .vece = MO_32 }, 2060 { .fni8 = tcg_gen_usadd_i64, 2061 .fniv = tcg_gen_usadd_vec, 2062 .fno = gen_helper_gvec_usadd64, 2063 .opt_opc = vecop_list, 2064 .vece = MO_64 } 2065 }; 2066 tcg_debug_assert(vece <= MO_64); 2067 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2068 } 2069 2070 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2071 { 2072 TCGv_i32 min = tcg_const_i32(0); 2073 tcg_gen_sub_i32(d, a, b); 2074 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 2075 tcg_temp_free_i32(min); 2076 } 2077 2078 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2079 { 2080 TCGv_i64 min = tcg_const_i64(0); 2081 tcg_gen_sub_i64(d, a, b); 2082 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 2083 tcg_temp_free_i64(min); 2084 } 2085 2086 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 2087 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2088 { 2089 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 2090 static const GVecGen3 g[4] = { 2091 { .fniv = tcg_gen_ussub_vec, 2092 .fno = gen_helper_gvec_ussub8, 2093 .opt_opc = vecop_list, 2094 .vece = MO_8 }, 2095 { .fniv = tcg_gen_ussub_vec, 2096 .fno = gen_helper_gvec_ussub16, 2097 .opt_opc = vecop_list, 2098 .vece = MO_16 }, 2099 { .fni4 = tcg_gen_ussub_i32, 2100 .fniv = tcg_gen_ussub_vec, 2101 .fno = gen_helper_gvec_ussub32, 2102 .opt_opc = vecop_list, 2103 .vece = MO_32 }, 2104 { .fni8 = tcg_gen_ussub_i64, 2105 .fniv = tcg_gen_ussub_vec, 2106 .fno = gen_helper_gvec_ussub64, 2107 .opt_opc = vecop_list, 2108 .vece = MO_64 } 2109 }; 2110 tcg_debug_assert(vece <= MO_64); 2111 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2112 } 2113 2114 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 2115 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2116 { 2117 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 2118 static const GVecGen3 g[4] = { 2119 { .fniv = tcg_gen_smin_vec, 2120 .fno = gen_helper_gvec_smin8, 2121 .opt_opc = vecop_list, 2122 .vece = MO_8 }, 2123 { .fniv = tcg_gen_smin_vec, 2124 .fno = gen_helper_gvec_smin16, 2125 .opt_opc = vecop_list, 2126 .vece = MO_16 }, 2127 { .fni4 = tcg_gen_smin_i32, 2128 .fniv = tcg_gen_smin_vec, 2129 .fno = gen_helper_gvec_smin32, 2130 .opt_opc = vecop_list, 2131 .vece = MO_32 }, 2132 { .fni8 = tcg_gen_smin_i64, 2133 .fniv = tcg_gen_smin_vec, 2134 
.fno = gen_helper_gvec_smin64, 2135 .opt_opc = vecop_list, 2136 .vece = MO_64 } 2137 }; 2138 tcg_debug_assert(vece <= MO_64); 2139 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2140 } 2141 2142 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2143 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2144 { 2145 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2146 static const GVecGen3 g[4] = { 2147 { .fniv = tcg_gen_umin_vec, 2148 .fno = gen_helper_gvec_umin8, 2149 .opt_opc = vecop_list, 2150 .vece = MO_8 }, 2151 { .fniv = tcg_gen_umin_vec, 2152 .fno = gen_helper_gvec_umin16, 2153 .opt_opc = vecop_list, 2154 .vece = MO_16 }, 2155 { .fni4 = tcg_gen_umin_i32, 2156 .fniv = tcg_gen_umin_vec, 2157 .fno = gen_helper_gvec_umin32, 2158 .opt_opc = vecop_list, 2159 .vece = MO_32 }, 2160 { .fni8 = tcg_gen_umin_i64, 2161 .fniv = tcg_gen_umin_vec, 2162 .fno = gen_helper_gvec_umin64, 2163 .opt_opc = vecop_list, 2164 .vece = MO_64 } 2165 }; 2166 tcg_debug_assert(vece <= MO_64); 2167 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2168 } 2169 2170 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2171 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2172 { 2173 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2174 static const GVecGen3 g[4] = { 2175 { .fniv = tcg_gen_smax_vec, 2176 .fno = gen_helper_gvec_smax8, 2177 .opt_opc = vecop_list, 2178 .vece = MO_8 }, 2179 { .fniv = tcg_gen_smax_vec, 2180 .fno = gen_helper_gvec_smax16, 2181 .opt_opc = vecop_list, 2182 .vece = MO_16 }, 2183 { .fni4 = tcg_gen_smax_i32, 2184 .fniv = tcg_gen_smax_vec, 2185 .fno = gen_helper_gvec_smax32, 2186 .opt_opc = vecop_list, 2187 .vece = MO_32 }, 2188 { .fni8 = tcg_gen_smax_i64, 2189 .fniv = tcg_gen_smax_vec, 2190 .fno = gen_helper_gvec_smax64, 2191 .opt_opc = vecop_list, 2192 .vece = MO_64 } 2193 }; 2194 tcg_debug_assert(vece <= MO_64); 2195 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2196 } 2197 2198 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2199 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2200 { 2201 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2202 static const GVecGen3 g[4] = { 2203 { .fniv = tcg_gen_umax_vec, 2204 .fno = gen_helper_gvec_umax8, 2205 .opt_opc = vecop_list, 2206 .vece = MO_8 }, 2207 { .fniv = tcg_gen_umax_vec, 2208 .fno = gen_helper_gvec_umax16, 2209 .opt_opc = vecop_list, 2210 .vece = MO_16 }, 2211 { .fni4 = tcg_gen_umax_i32, 2212 .fniv = tcg_gen_umax_vec, 2213 .fno = gen_helper_gvec_umax32, 2214 .opt_opc = vecop_list, 2215 .vece = MO_32 }, 2216 { .fni8 = tcg_gen_umax_i64, 2217 .fniv = tcg_gen_umax_vec, 2218 .fno = gen_helper_gvec_umax64, 2219 .opt_opc = vecop_list, 2220 .vece = MO_64 } 2221 }; 2222 tcg_debug_assert(vece <= MO_64); 2223 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2224 } 2225 2226 /* Perform a vector negation using normal negation and a mask. 2227 Compare gen_subv_mask above. 
*/ 2228 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2229 { 2230 TCGv_i64 t2 = tcg_temp_new_i64(); 2231 TCGv_i64 t3 = tcg_temp_new_i64(); 2232 2233 tcg_gen_andc_i64(t3, m, b); 2234 tcg_gen_andc_i64(t2, b, m); 2235 tcg_gen_sub_i64(d, m, t2); 2236 tcg_gen_xor_i64(d, d, t3); 2237 2238 tcg_temp_free_i64(t2); 2239 tcg_temp_free_i64(t3); 2240 } 2241 2242 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2243 { 2244 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2245 gen_negv_mask(d, b, m); 2246 tcg_temp_free_i64(m); 2247 } 2248 2249 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2250 { 2251 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2252 gen_negv_mask(d, b, m); 2253 tcg_temp_free_i64(m); 2254 } 2255 2256 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2257 { 2258 TCGv_i64 t1 = tcg_temp_new_i64(); 2259 TCGv_i64 t2 = tcg_temp_new_i64(); 2260 2261 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2262 tcg_gen_neg_i64(t2, b); 2263 tcg_gen_neg_i64(t1, t1); 2264 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2265 2266 tcg_temp_free_i64(t1); 2267 tcg_temp_free_i64(t2); 2268 } 2269 2270 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2271 uint32_t oprsz, uint32_t maxsz) 2272 { 2273 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2274 static const GVecGen2 g[4] = { 2275 { .fni8 = tcg_gen_vec_neg8_i64, 2276 .fniv = tcg_gen_neg_vec, 2277 .fno = gen_helper_gvec_neg8, 2278 .opt_opc = vecop_list, 2279 .vece = MO_8 }, 2280 { .fni8 = tcg_gen_vec_neg16_i64, 2281 .fniv = tcg_gen_neg_vec, 2282 .fno = gen_helper_gvec_neg16, 2283 .opt_opc = vecop_list, 2284 .vece = MO_16 }, 2285 { .fni4 = tcg_gen_neg_i32, 2286 .fniv = tcg_gen_neg_vec, 2287 .fno = gen_helper_gvec_neg32, 2288 .opt_opc = vecop_list, 2289 .vece = MO_32 }, 2290 { .fni8 = tcg_gen_neg_i64, 2291 .fniv = tcg_gen_neg_vec, 2292 .fno = gen_helper_gvec_neg64, 2293 .opt_opc = vecop_list, 2294 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2295 .vece = MO_64 }, 2296 }; 2297 2298 tcg_debug_assert(vece <= MO_64); 2299 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2300 } 2301 2302 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2303 { 2304 TCGv_i64 t = tcg_temp_new_i64(); 2305 int nbit = 8 << vece; 2306 2307 /* Create -1 for each negative element. */ 2308 tcg_gen_shri_i64(t, b, nbit - 1); 2309 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2310 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2311 2312 /* 2313 * Invert (via xor -1) and add one. 2314 * Because of the ordering the msb is cleared, 2315 * so we never have carry into the next element. 
2316 */ 2317 tcg_gen_xor_i64(d, b, t); 2318 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2319 tcg_gen_add_i64(d, d, t); 2320 2321 tcg_temp_free_i64(t); 2322 } 2323 2324 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2325 { 2326 gen_absv_mask(d, b, MO_8); 2327 } 2328 2329 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2330 { 2331 gen_absv_mask(d, b, MO_16); 2332 } 2333 2334 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2335 uint32_t oprsz, uint32_t maxsz) 2336 { 2337 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2338 static const GVecGen2 g[4] = { 2339 { .fni8 = tcg_gen_vec_abs8_i64, 2340 .fniv = tcg_gen_abs_vec, 2341 .fno = gen_helper_gvec_abs8, 2342 .opt_opc = vecop_list, 2343 .vece = MO_8 }, 2344 { .fni8 = tcg_gen_vec_abs16_i64, 2345 .fniv = tcg_gen_abs_vec, 2346 .fno = gen_helper_gvec_abs16, 2347 .opt_opc = vecop_list, 2348 .vece = MO_16 }, 2349 { .fni4 = tcg_gen_abs_i32, 2350 .fniv = tcg_gen_abs_vec, 2351 .fno = gen_helper_gvec_abs32, 2352 .opt_opc = vecop_list, 2353 .vece = MO_32 }, 2354 { .fni8 = tcg_gen_abs_i64, 2355 .fniv = tcg_gen_abs_vec, 2356 .fno = gen_helper_gvec_abs64, 2357 .opt_opc = vecop_list, 2358 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2359 .vece = MO_64 }, 2360 }; 2361 2362 tcg_debug_assert(vece <= MO_64); 2363 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2364 } 2365 2366 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2367 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2368 { 2369 static const GVecGen3 g = { 2370 .fni8 = tcg_gen_and_i64, 2371 .fniv = tcg_gen_and_vec, 2372 .fno = gen_helper_gvec_and, 2373 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2374 }; 2375 2376 if (aofs == bofs) { 2377 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2378 } else { 2379 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2380 } 2381 } 2382 2383 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2384 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2385 { 2386 static const GVecGen3 g = { 2387 .fni8 = tcg_gen_or_i64, 2388 .fniv = tcg_gen_or_vec, 2389 .fno = gen_helper_gvec_or, 2390 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2391 }; 2392 2393 if (aofs == bofs) { 2394 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2395 } else { 2396 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2397 } 2398 } 2399 2400 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2401 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2402 { 2403 static const GVecGen3 g = { 2404 .fni8 = tcg_gen_xor_i64, 2405 .fniv = tcg_gen_xor_vec, 2406 .fno = gen_helper_gvec_xor, 2407 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2408 }; 2409 2410 if (aofs == bofs) { 2411 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2412 } else { 2413 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2414 } 2415 } 2416 2417 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2418 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2419 { 2420 static const GVecGen3 g = { 2421 .fni8 = tcg_gen_andc_i64, 2422 .fniv = tcg_gen_andc_vec, 2423 .fno = gen_helper_gvec_andc, 2424 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2425 }; 2426 2427 if (aofs == bofs) { 2428 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0); 2429 } else { 2430 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2431 } 2432 } 2433 2434 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2435 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2436 { 2437 static const GVecGen3 g = { 2438 .fni8 = tcg_gen_orc_i64, 2439 .fniv = tcg_gen_orc_vec, 2440 
.fno = gen_helper_gvec_orc, 2441 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2442 }; 2443 2444 if (aofs == bofs) { 2445 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2446 } else { 2447 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2448 } 2449 } 2450 2451 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2452 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2453 { 2454 static const GVecGen3 g = { 2455 .fni8 = tcg_gen_nand_i64, 2456 .fniv = tcg_gen_nand_vec, 2457 .fno = gen_helper_gvec_nand, 2458 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2459 }; 2460 2461 if (aofs == bofs) { 2462 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2463 } else { 2464 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2465 } 2466 } 2467 2468 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2469 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2470 { 2471 static const GVecGen3 g = { 2472 .fni8 = tcg_gen_nor_i64, 2473 .fniv = tcg_gen_nor_vec, 2474 .fno = gen_helper_gvec_nor, 2475 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2476 }; 2477 2478 if (aofs == bofs) { 2479 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2480 } else { 2481 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2482 } 2483 } 2484 2485 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2486 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2487 { 2488 static const GVecGen3 g = { 2489 .fni8 = tcg_gen_eqv_i64, 2490 .fniv = tcg_gen_eqv_vec, 2491 .fno = gen_helper_gvec_eqv, 2492 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2493 }; 2494 2495 if (aofs == bofs) { 2496 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1); 2497 } else { 2498 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2499 } 2500 } 2501 2502 static const GVecGen2s gop_ands = { 2503 .fni8 = tcg_gen_and_i64, 2504 .fniv = tcg_gen_and_vec, 2505 .fno = gen_helper_gvec_ands, 2506 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2507 .vece = MO_64 2508 }; 2509 2510 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2511 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2512 { 2513 TCGv_i64 tmp = tcg_temp_new_i64(); 2514 gen_dup_i64(vece, tmp, c); 2515 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2516 tcg_temp_free_i64(tmp); 2517 } 2518 2519 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2520 int64_t c, uint32_t oprsz, uint32_t maxsz) 2521 { 2522 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2523 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2524 tcg_temp_free_i64(tmp); 2525 } 2526 2527 static const GVecGen2s gop_xors = { 2528 .fni8 = tcg_gen_xor_i64, 2529 .fniv = tcg_gen_xor_vec, 2530 .fno = gen_helper_gvec_xors, 2531 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2532 .vece = MO_64 2533 }; 2534 2535 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2536 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2537 { 2538 TCGv_i64 tmp = tcg_temp_new_i64(); 2539 gen_dup_i64(vece, tmp, c); 2540 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2541 tcg_temp_free_i64(tmp); 2542 } 2543 2544 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2545 int64_t c, uint32_t oprsz, uint32_t maxsz) 2546 { 2547 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2548 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2549 tcg_temp_free_i64(tmp); 2550 } 2551 2552 static const GVecGen2s gop_ors = { 2553 .fni8 = tcg_gen_or_i64, 2554 .fniv = tcg_gen_or_vec, 2555 .fno = gen_helper_gvec_ors, 2556 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2557 .vece = MO_64 2558 }; 2559 2560 
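/*
 * Illustrative sketch, not part of the original file: the ands/andi,
 * xors/xori and ors/ori expanders rely on gen_dup_i64() and dup_const()
 * replicating the low VECE-sized element across all 64 bits, so that a
 * single i64 AND/OR/XOR applies the same scalar to every element.  A
 * host-side reference of that replication (the name is hypothetical):
 */
static inline uint64_t dup_const_ref(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return (c & 0xff) * 0x0101010101010101ull;
    case MO_16:
        return (c & 0xffff) * 0x0001000100010001ull;
    case MO_32:
        return (c & 0xffffffffu) * 0x0000000100000001ull;
    default: /* MO_64 */
        return c;
    }
}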
void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2561 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2562 { 2563 TCGv_i64 tmp = tcg_temp_new_i64(); 2564 gen_dup_i64(vece, tmp, c); 2565 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2566 tcg_temp_free_i64(tmp); 2567 } 2568 2569 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2570 int64_t c, uint32_t oprsz, uint32_t maxsz) 2571 { 2572 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2573 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2574 tcg_temp_free_i64(tmp); 2575 } 2576 2577 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2578 { 2579 uint64_t mask = dup_const(MO_8, 0xff << c); 2580 tcg_gen_shli_i64(d, a, c); 2581 tcg_gen_andi_i64(d, d, mask); 2582 } 2583 2584 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2585 { 2586 uint64_t mask = dup_const(MO_16, 0xffff << c); 2587 tcg_gen_shli_i64(d, a, c); 2588 tcg_gen_andi_i64(d, d, mask); 2589 } 2590 2591 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2592 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2593 { 2594 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2595 static const GVecGen2i g[4] = { 2596 { .fni8 = tcg_gen_vec_shl8i_i64, 2597 .fniv = tcg_gen_shli_vec, 2598 .fno = gen_helper_gvec_shl8i, 2599 .opt_opc = vecop_list, 2600 .vece = MO_8 }, 2601 { .fni8 = tcg_gen_vec_shl16i_i64, 2602 .fniv = tcg_gen_shli_vec, 2603 .fno = gen_helper_gvec_shl16i, 2604 .opt_opc = vecop_list, 2605 .vece = MO_16 }, 2606 { .fni4 = tcg_gen_shli_i32, 2607 .fniv = tcg_gen_shli_vec, 2608 .fno = gen_helper_gvec_shl32i, 2609 .opt_opc = vecop_list, 2610 .vece = MO_32 }, 2611 { .fni8 = tcg_gen_shli_i64, 2612 .fniv = tcg_gen_shli_vec, 2613 .fno = gen_helper_gvec_shl64i, 2614 .opt_opc = vecop_list, 2615 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2616 .vece = MO_64 }, 2617 }; 2618 2619 tcg_debug_assert(vece <= MO_64); 2620 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2621 if (shift == 0) { 2622 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2623 } else { 2624 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2625 } 2626 } 2627 2628 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2629 { 2630 uint64_t mask = dup_const(MO_8, 0xff >> c); 2631 tcg_gen_shri_i64(d, a, c); 2632 tcg_gen_andi_i64(d, d, mask); 2633 } 2634 2635 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2636 { 2637 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2638 tcg_gen_shri_i64(d, a, c); 2639 tcg_gen_andi_i64(d, d, mask); 2640 } 2641 2642 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2643 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2644 { 2645 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2646 static const GVecGen2i g[4] = { 2647 { .fni8 = tcg_gen_vec_shr8i_i64, 2648 .fniv = tcg_gen_shri_vec, 2649 .fno = gen_helper_gvec_shr8i, 2650 .opt_opc = vecop_list, 2651 .vece = MO_8 }, 2652 { .fni8 = tcg_gen_vec_shr16i_i64, 2653 .fniv = tcg_gen_shri_vec, 2654 .fno = gen_helper_gvec_shr16i, 2655 .opt_opc = vecop_list, 2656 .vece = MO_16 }, 2657 { .fni4 = tcg_gen_shri_i32, 2658 .fniv = tcg_gen_shri_vec, 2659 .fno = gen_helper_gvec_shr32i, 2660 .opt_opc = vecop_list, 2661 .vece = MO_32 }, 2662 { .fni8 = tcg_gen_shri_i64, 2663 .fniv = tcg_gen_shri_vec, 2664 .fno = gen_helper_gvec_shr64i, 2665 .opt_opc = vecop_list, 2666 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2667 .vece = MO_64 }, 2668 }; 2669 2670 tcg_debug_assert(vece <= MO_64); 2671 tcg_debug_assert(shift >= 
0 && shift < (8 << vece)); 2672 if (shift == 0) { 2673 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2674 } else { 2675 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2676 } 2677 } 2678 2679 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2680 { 2681 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2682 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2683 TCGv_i64 s = tcg_temp_new_i64(); 2684 2685 tcg_gen_shri_i64(d, a, c); 2686 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2687 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2688 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2689 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2690 tcg_temp_free_i64(s); 2691 } 2692 2693 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2694 { 2695 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2696 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2697 TCGv_i64 s = tcg_temp_new_i64(); 2698 2699 tcg_gen_shri_i64(d, a, c); 2700 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2701 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2702 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2703 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2704 tcg_temp_free_i64(s); 2705 } 2706 2707 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2708 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2709 { 2710 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2711 static const GVecGen2i g[4] = { 2712 { .fni8 = tcg_gen_vec_sar8i_i64, 2713 .fniv = tcg_gen_sari_vec, 2714 .fno = gen_helper_gvec_sar8i, 2715 .opt_opc = vecop_list, 2716 .vece = MO_8 }, 2717 { .fni8 = tcg_gen_vec_sar16i_i64, 2718 .fniv = tcg_gen_sari_vec, 2719 .fno = gen_helper_gvec_sar16i, 2720 .opt_opc = vecop_list, 2721 .vece = MO_16 }, 2722 { .fni4 = tcg_gen_sari_i32, 2723 .fniv = tcg_gen_sari_vec, 2724 .fno = gen_helper_gvec_sar32i, 2725 .opt_opc = vecop_list, 2726 .vece = MO_32 }, 2727 { .fni8 = tcg_gen_sari_i64, 2728 .fniv = tcg_gen_sari_vec, 2729 .fno = gen_helper_gvec_sar64i, 2730 .opt_opc = vecop_list, 2731 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2732 .vece = MO_64 }, 2733 }; 2734 2735 tcg_debug_assert(vece <= MO_64); 2736 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2737 if (shift == 0) { 2738 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2739 } else { 2740 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2741 } 2742 } 2743 2744 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2745 { 2746 uint64_t mask = dup_const(MO_8, 0xff << c); 2747 2748 tcg_gen_shli_i64(d, a, c); 2749 tcg_gen_shri_i64(a, a, 8 - c); 2750 tcg_gen_andi_i64(d, d, mask); 2751 tcg_gen_andi_i64(a, a, ~mask); 2752 tcg_gen_or_i64(d, d, a); 2753 } 2754 2755 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2756 { 2757 uint64_t mask = dup_const(MO_16, 0xffff << c); 2758 2759 tcg_gen_shli_i64(d, a, c); 2760 tcg_gen_shri_i64(a, a, 16 - c); 2761 tcg_gen_andi_i64(d, d, mask); 2762 tcg_gen_andi_i64(a, a, ~mask); 2763 tcg_gen_or_i64(d, d, a); 2764 } 2765 2766 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs, 2767 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2768 { 2769 static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 }; 2770 static const GVecGen2i g[4] = { 2771 { .fni8 = tcg_gen_vec_rotl8i_i64, 2772 .fniv = tcg_gen_rotli_vec, 2773 .fno = gen_helper_gvec_rotl8i, 2774 .opt_opc = vecop_list, 2775 .vece = MO_8 }, 2776 { .fni8 = 
tcg_gen_vec_rotl16i_i64, 2777 .fniv = tcg_gen_rotli_vec, 2778 .fno = gen_helper_gvec_rotl16i, 2779 .opt_opc = vecop_list, 2780 .vece = MO_16 }, 2781 { .fni4 = tcg_gen_rotli_i32, 2782 .fniv = tcg_gen_rotli_vec, 2783 .fno = gen_helper_gvec_rotl32i, 2784 .opt_opc = vecop_list, 2785 .vece = MO_32 }, 2786 { .fni8 = tcg_gen_rotli_i64, 2787 .fniv = tcg_gen_rotli_vec, 2788 .fno = gen_helper_gvec_rotl64i, 2789 .opt_opc = vecop_list, 2790 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2791 .vece = MO_64 }, 2792 }; 2793 2794 tcg_debug_assert(vece <= MO_64); 2795 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2796 if (shift == 0) { 2797 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2798 } else { 2799 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2800 } 2801 } 2802 2803 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs, 2804 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2805 { 2806 tcg_debug_assert(vece <= MO_64); 2807 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2808 tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1), 2809 oprsz, maxsz); 2810 } 2811 2812 /* 2813 * Specialized generation vector shifts by a non-constant scalar. 2814 */ 2815 2816 typedef struct { 2817 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2818 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2819 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2820 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2821 gen_helper_gvec_2 *fno[4]; 2822 TCGOpcode s_list[2]; 2823 TCGOpcode v_list[2]; 2824 } GVecGen2sh; 2825 2826 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2827 uint32_t oprsz, uint32_t tysz, TCGType type, 2828 TCGv_i32 shift, 2829 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2830 { 2831 TCGv_vec t0 = tcg_temp_new_vec(type); 2832 uint32_t i; 2833 2834 for (i = 0; i < oprsz; i += tysz) { 2835 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2836 fni(vece, t0, t0, shift); 2837 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2838 } 2839 tcg_temp_free_vec(t0); 2840 } 2841 2842 static void 2843 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2844 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2845 { 2846 TCGType type; 2847 uint32_t some; 2848 2849 check_size_align(oprsz, maxsz, dofs | aofs); 2850 check_overlap_2(dofs, aofs, maxsz); 2851 2852 /* If the backend has a scalar expansion, great. */ 2853 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2854 if (type) { 2855 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2856 switch (type) { 2857 case TCG_TYPE_V256: 2858 some = QEMU_ALIGN_DOWN(oprsz, 32); 2859 expand_2sh_vec(vece, dofs, aofs, some, 32, 2860 TCG_TYPE_V256, shift, g->fniv_s); 2861 if (some == oprsz) { 2862 break; 2863 } 2864 dofs += some; 2865 aofs += some; 2866 oprsz -= some; 2867 maxsz -= some; 2868 /* fallthru */ 2869 case TCG_TYPE_V128: 2870 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2871 TCG_TYPE_V128, shift, g->fniv_s); 2872 break; 2873 case TCG_TYPE_V64: 2874 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2875 TCG_TYPE_V64, shift, g->fniv_s); 2876 break; 2877 default: 2878 g_assert_not_reached(); 2879 } 2880 tcg_swap_vecop_list(hold_list); 2881 goto clear_tail; 2882 } 2883 2884 /* If the backend supports variable vector shifts, also cool. 
*/ 2885 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2886 if (type) { 2887 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2888 TCGv_vec v_shift = tcg_temp_new_vec(type); 2889 2890 if (vece == MO_64) { 2891 TCGv_i64 sh64 = tcg_temp_new_i64(); 2892 tcg_gen_extu_i32_i64(sh64, shift); 2893 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2894 tcg_temp_free_i64(sh64); 2895 } else { 2896 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2897 } 2898 2899 switch (type) { 2900 case TCG_TYPE_V256: 2901 some = QEMU_ALIGN_DOWN(oprsz, 32); 2902 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2903 v_shift, false, g->fniv_v); 2904 if (some == oprsz) { 2905 break; 2906 } 2907 dofs += some; 2908 aofs += some; 2909 oprsz -= some; 2910 maxsz -= some; 2911 /* fallthru */ 2912 case TCG_TYPE_V128: 2913 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2914 v_shift, false, g->fniv_v); 2915 break; 2916 case TCG_TYPE_V64: 2917 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2918 v_shift, false, g->fniv_v); 2919 break; 2920 default: 2921 g_assert_not_reached(); 2922 } 2923 tcg_temp_free_vec(v_shift); 2924 tcg_swap_vecop_list(hold_list); 2925 goto clear_tail; 2926 } 2927 2928 /* Otherwise fall back to integral... */ 2929 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2930 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2931 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2932 TCGv_i64 sh64 = tcg_temp_new_i64(); 2933 tcg_gen_extu_i32_i64(sh64, shift); 2934 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2935 tcg_temp_free_i64(sh64); 2936 } else { 2937 TCGv_ptr a0 = tcg_temp_new_ptr(); 2938 TCGv_ptr a1 = tcg_temp_new_ptr(); 2939 TCGv_i32 desc = tcg_temp_new_i32(); 2940 2941 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2942 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2943 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2944 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2945 2946 g->fno[vece](a0, a1, desc); 2947 2948 tcg_temp_free_ptr(a0); 2949 tcg_temp_free_ptr(a1); 2950 tcg_temp_free_i32(desc); 2951 return; 2952 } 2953 2954 clear_tail: 2955 if (oprsz < maxsz) { 2956 expand_clr(dofs + oprsz, maxsz - oprsz); 2957 } 2958 } 2959 2960 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 2961 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2962 { 2963 static const GVecGen2sh g = { 2964 .fni4 = tcg_gen_shl_i32, 2965 .fni8 = tcg_gen_shl_i64, 2966 .fniv_s = tcg_gen_shls_vec, 2967 .fniv_v = tcg_gen_shlv_vec, 2968 .fno = { 2969 gen_helper_gvec_shl8i, 2970 gen_helper_gvec_shl16i, 2971 gen_helper_gvec_shl32i, 2972 gen_helper_gvec_shl64i, 2973 }, 2974 .s_list = { INDEX_op_shls_vec, 0 }, 2975 .v_list = { INDEX_op_shlv_vec, 0 }, 2976 }; 2977 2978 tcg_debug_assert(vece <= MO_64); 2979 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2980 } 2981 2982 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 2983 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2984 { 2985 static const GVecGen2sh g = { 2986 .fni4 = tcg_gen_shr_i32, 2987 .fni8 = tcg_gen_shr_i64, 2988 .fniv_s = tcg_gen_shrs_vec, 2989 .fniv_v = tcg_gen_shrv_vec, 2990 .fno = { 2991 gen_helper_gvec_shr8i, 2992 gen_helper_gvec_shr16i, 2993 gen_helper_gvec_shr32i, 2994 gen_helper_gvec_shr64i, 2995 }, 2996 .s_list = { INDEX_op_shrs_vec, 0 }, 2997 .v_list = { INDEX_op_shrv_vec, 0 }, 2998 }; 2999 3000 tcg_debug_assert(vece <= MO_64); 3001 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 3002 } 3003 3004 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_rotl_i32,
        .fni8 = tcg_gen_rotl_i64,
        .fniv_s = tcg_gen_rotls_vec,
        .fniv_v = tcg_gen_rotlv_vec,
        .fno = {
            gen_helper_gvec_rotl8i,
            gen_helper_gvec_rotl16i,
            gen_helper_gvec_rotl32i,
            gen_helper_gvec_rotl64i,
        },
        .s_list = { INDEX_op_rotls_vec, 0 },
        .v_list = { INDEX_op_rotlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, it is not easy for the target front end to
 * include the modulo as part of the expansion.  If the target naturally
 * includes the modulo as part of the operation, great!  If the target
 * has some other behaviour for out-of-range shifts, then it could not
 * use this function anyway, and would need to do its own expansion with
 * custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
3119 */ 3120 3121 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 3122 TCGv_vec a, TCGv_vec b) 3123 { 3124 TCGv_vec t = tcg_temp_new_vec_matching(d); 3125 3126 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3127 tcg_gen_and_vec(vece, t, t, b); 3128 tcg_gen_shrv_vec(vece, d, a, t); 3129 tcg_temp_free_vec(t); 3130 } 3131 3132 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3133 { 3134 TCGv_i32 t = tcg_temp_new_i32(); 3135 3136 tcg_gen_andi_i32(t, b, 31); 3137 tcg_gen_shr_i32(d, a, t); 3138 tcg_temp_free_i32(t); 3139 } 3140 3141 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3142 { 3143 TCGv_i64 t = tcg_temp_new_i64(); 3144 3145 tcg_gen_andi_i64(t, b, 63); 3146 tcg_gen_shr_i64(d, a, t); 3147 tcg_temp_free_i64(t); 3148 } 3149 3150 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3151 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3152 { 3153 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 3154 static const GVecGen3 g[4] = { 3155 { .fniv = tcg_gen_shrv_mod_vec, 3156 .fno = gen_helper_gvec_shr8v, 3157 .opt_opc = vecop_list, 3158 .vece = MO_8 }, 3159 { .fniv = tcg_gen_shrv_mod_vec, 3160 .fno = gen_helper_gvec_shr16v, 3161 .opt_opc = vecop_list, 3162 .vece = MO_16 }, 3163 { .fni4 = tcg_gen_shr_mod_i32, 3164 .fniv = tcg_gen_shrv_mod_vec, 3165 .fno = gen_helper_gvec_shr32v, 3166 .opt_opc = vecop_list, 3167 .vece = MO_32 }, 3168 { .fni8 = tcg_gen_shr_mod_i64, 3169 .fniv = tcg_gen_shrv_mod_vec, 3170 .fno = gen_helper_gvec_shr64v, 3171 .opt_opc = vecop_list, 3172 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3173 .vece = MO_64 }, 3174 }; 3175 3176 tcg_debug_assert(vece <= MO_64); 3177 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3178 } 3179 3180 /* 3181 * Similarly for arithmetic right shifts. 
3182 */ 3183 3184 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 3185 TCGv_vec a, TCGv_vec b) 3186 { 3187 TCGv_vec t = tcg_temp_new_vec_matching(d); 3188 3189 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3190 tcg_gen_and_vec(vece, t, t, b); 3191 tcg_gen_sarv_vec(vece, d, a, t); 3192 tcg_temp_free_vec(t); 3193 } 3194 3195 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3196 { 3197 TCGv_i32 t = tcg_temp_new_i32(); 3198 3199 tcg_gen_andi_i32(t, b, 31); 3200 tcg_gen_sar_i32(d, a, t); 3201 tcg_temp_free_i32(t); 3202 } 3203 3204 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3205 { 3206 TCGv_i64 t = tcg_temp_new_i64(); 3207 3208 tcg_gen_andi_i64(t, b, 63); 3209 tcg_gen_sar_i64(d, a, t); 3210 tcg_temp_free_i64(t); 3211 } 3212 3213 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3214 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3215 { 3216 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3217 static const GVecGen3 g[4] = { 3218 { .fniv = tcg_gen_sarv_mod_vec, 3219 .fno = gen_helper_gvec_sar8v, 3220 .opt_opc = vecop_list, 3221 .vece = MO_8 }, 3222 { .fniv = tcg_gen_sarv_mod_vec, 3223 .fno = gen_helper_gvec_sar16v, 3224 .opt_opc = vecop_list, 3225 .vece = MO_16 }, 3226 { .fni4 = tcg_gen_sar_mod_i32, 3227 .fniv = tcg_gen_sarv_mod_vec, 3228 .fno = gen_helper_gvec_sar32v, 3229 .opt_opc = vecop_list, 3230 .vece = MO_32 }, 3231 { .fni8 = tcg_gen_sar_mod_i64, 3232 .fniv = tcg_gen_sarv_mod_vec, 3233 .fno = gen_helper_gvec_sar64v, 3234 .opt_opc = vecop_list, 3235 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3236 .vece = MO_64 }, 3237 }; 3238 3239 tcg_debug_assert(vece <= MO_64); 3240 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3241 } 3242 3243 /* 3244 * Similarly for rotates. 
3245 */ 3246 3247 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d, 3248 TCGv_vec a, TCGv_vec b) 3249 { 3250 TCGv_vec t = tcg_temp_new_vec_matching(d); 3251 3252 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3253 tcg_gen_and_vec(vece, t, t, b); 3254 tcg_gen_rotlv_vec(vece, d, a, t); 3255 tcg_temp_free_vec(t); 3256 } 3257 3258 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3259 { 3260 TCGv_i32 t = tcg_temp_new_i32(); 3261 3262 tcg_gen_andi_i32(t, b, 31); 3263 tcg_gen_rotl_i32(d, a, t); 3264 tcg_temp_free_i32(t); 3265 } 3266 3267 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3268 { 3269 TCGv_i64 t = tcg_temp_new_i64(); 3270 3271 tcg_gen_andi_i64(t, b, 63); 3272 tcg_gen_rotl_i64(d, a, t); 3273 tcg_temp_free_i64(t); 3274 } 3275 3276 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs, 3277 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3278 { 3279 static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 }; 3280 static const GVecGen3 g[4] = { 3281 { .fniv = tcg_gen_rotlv_mod_vec, 3282 .fno = gen_helper_gvec_rotl8v, 3283 .opt_opc = vecop_list, 3284 .vece = MO_8 }, 3285 { .fniv = tcg_gen_rotlv_mod_vec, 3286 .fno = gen_helper_gvec_rotl16v, 3287 .opt_opc = vecop_list, 3288 .vece = MO_16 }, 3289 { .fni4 = tcg_gen_rotl_mod_i32, 3290 .fniv = tcg_gen_rotlv_mod_vec, 3291 .fno = gen_helper_gvec_rotl32v, 3292 .opt_opc = vecop_list, 3293 .vece = MO_32 }, 3294 { .fni8 = tcg_gen_rotl_mod_i64, 3295 .fniv = tcg_gen_rotlv_mod_vec, 3296 .fno = gen_helper_gvec_rotl64v, 3297 .opt_opc = vecop_list, 3298 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3299 .vece = MO_64 }, 3300 }; 3301 3302 tcg_debug_assert(vece <= MO_64); 3303 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3304 } 3305 3306 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d, 3307 TCGv_vec a, TCGv_vec b) 3308 { 3309 TCGv_vec t = tcg_temp_new_vec_matching(d); 3310 3311 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 3312 tcg_gen_and_vec(vece, t, t, b); 3313 tcg_gen_rotrv_vec(vece, d, a, t); 3314 tcg_temp_free_vec(t); 3315 } 3316 3317 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 3318 { 3319 TCGv_i32 t = tcg_temp_new_i32(); 3320 3321 tcg_gen_andi_i32(t, b, 31); 3322 tcg_gen_rotr_i32(d, a, t); 3323 tcg_temp_free_i32(t); 3324 } 3325 3326 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 3327 { 3328 TCGv_i64 t = tcg_temp_new_i64(); 3329 3330 tcg_gen_andi_i64(t, b, 63); 3331 tcg_gen_rotr_i64(d, a, t); 3332 tcg_temp_free_i64(t); 3333 } 3334 3335 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs, 3336 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3337 { 3338 static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 }; 3339 static const GVecGen3 g[4] = { 3340 { .fniv = tcg_gen_rotrv_mod_vec, 3341 .fno = gen_helper_gvec_rotr8v, 3342 .opt_opc = vecop_list, 3343 .vece = MO_8 }, 3344 { .fniv = tcg_gen_rotrv_mod_vec, 3345 .fno = gen_helper_gvec_rotr16v, 3346 .opt_opc = vecop_list, 3347 .vece = MO_16 }, 3348 { .fni4 = tcg_gen_rotr_mod_i32, 3349 .fniv = tcg_gen_rotrv_mod_vec, 3350 .fno = gen_helper_gvec_rotr32v, 3351 .opt_opc = vecop_list, 3352 .vece = MO_32 }, 3353 { .fni8 = tcg_gen_rotr_mod_i64, 3354 .fniv = tcg_gen_rotrv_mod_vec, 3355 .fno = gen_helper_gvec_rotr64v, 3356 .opt_opc = vecop_list, 3357 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3358 .vece = MO_64 }, 3359 }; 3360 3361 tcg_debug_assert(vece <= MO_64); 3362 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3363 } 3364 3365 /* Expand OPSZ bytes worth 
of three-operand operations using i32 elements. */ 3366 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3367 uint32_t oprsz, TCGCond cond) 3368 { 3369 TCGv_i32 t0 = tcg_temp_new_i32(); 3370 TCGv_i32 t1 = tcg_temp_new_i32(); 3371 uint32_t i; 3372 3373 for (i = 0; i < oprsz; i += 4) { 3374 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3375 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3376 tcg_gen_setcond_i32(cond, t0, t0, t1); 3377 tcg_gen_neg_i32(t0, t0); 3378 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3379 } 3380 tcg_temp_free_i32(t1); 3381 tcg_temp_free_i32(t0); 3382 } 3383 3384 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3385 uint32_t oprsz, TCGCond cond) 3386 { 3387 TCGv_i64 t0 = tcg_temp_new_i64(); 3388 TCGv_i64 t1 = tcg_temp_new_i64(); 3389 uint32_t i; 3390 3391 for (i = 0; i < oprsz; i += 8) { 3392 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3393 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3394 tcg_gen_setcond_i64(cond, t0, t0, t1); 3395 tcg_gen_neg_i64(t0, t0); 3396 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3397 } 3398 tcg_temp_free_i64(t1); 3399 tcg_temp_free_i64(t0); 3400 } 3401 3402 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3403 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3404 TCGType type, TCGCond cond) 3405 { 3406 TCGv_vec t0 = tcg_temp_new_vec(type); 3407 TCGv_vec t1 = tcg_temp_new_vec(type); 3408 uint32_t i; 3409 3410 for (i = 0; i < oprsz; i += tysz) { 3411 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3412 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3413 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3414 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3415 } 3416 tcg_temp_free_vec(t1); 3417 tcg_temp_free_vec(t0); 3418 } 3419 3420 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3421 uint32_t aofs, uint32_t bofs, 3422 uint32_t oprsz, uint32_t maxsz) 3423 { 3424 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3425 static gen_helper_gvec_3 * const eq_fn[4] = { 3426 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3427 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3428 }; 3429 static gen_helper_gvec_3 * const ne_fn[4] = { 3430 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3431 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3432 }; 3433 static gen_helper_gvec_3 * const lt_fn[4] = { 3434 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3435 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3436 }; 3437 static gen_helper_gvec_3 * const le_fn[4] = { 3438 gen_helper_gvec_le8, gen_helper_gvec_le16, 3439 gen_helper_gvec_le32, gen_helper_gvec_le64 3440 }; 3441 static gen_helper_gvec_3 * const ltu_fn[4] = { 3442 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3443 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3444 }; 3445 static gen_helper_gvec_3 * const leu_fn[4] = { 3446 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3447 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3448 }; 3449 static gen_helper_gvec_3 * const * const fns[16] = { 3450 [TCG_COND_EQ] = eq_fn, 3451 [TCG_COND_NE] = ne_fn, 3452 [TCG_COND_LT] = lt_fn, 3453 [TCG_COND_LE] = le_fn, 3454 [TCG_COND_LTU] = ltu_fn, 3455 [TCG_COND_LEU] = leu_fn, 3456 }; 3457 3458 const TCGOpcode *hold_list; 3459 TCGType type; 3460 uint32_t some; 3461 3462 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3463 check_overlap_3(dofs, aofs, bofs, maxsz); 3464 3465 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3466 do_dup(MO_8, dofs, oprsz, maxsz, 3467 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3468 return; 3469 } 3470 3471 /* 3472 * Implement inline with a vector type, if possible. 
3473 * Prefer integer when 64-bit host and 64-bit comparison. 3474 */ 3475 hold_list = tcg_swap_vecop_list(cmp_list); 3476 type = choose_vector_type(cmp_list, vece, oprsz, 3477 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3478 switch (type) { 3479 case TCG_TYPE_V256: 3480 /* Recall that ARM SVE allows vector sizes that are not a 3481 * power of 2, but always a multiple of 16. The intent is 3482 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3483 */ 3484 some = QEMU_ALIGN_DOWN(oprsz, 32); 3485 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3486 if (some == oprsz) { 3487 break; 3488 } 3489 dofs += some; 3490 aofs += some; 3491 bofs += some; 3492 oprsz -= some; 3493 maxsz -= some; 3494 /* fallthru */ 3495 case TCG_TYPE_V128: 3496 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3497 break; 3498 case TCG_TYPE_V64: 3499 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3500 break; 3501 3502 case 0: 3503 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3504 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3505 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3506 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3507 } else { 3508 gen_helper_gvec_3 * const *fn = fns[cond]; 3509 3510 if (fn == NULL) { 3511 uint32_t tmp; 3512 tmp = aofs, aofs = bofs, bofs = tmp; 3513 cond = tcg_swap_cond(cond); 3514 fn = fns[cond]; 3515 assert(fn != NULL); 3516 } 3517 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3518 oprsz = maxsz; 3519 } 3520 break; 3521 3522 default: 3523 g_assert_not_reached(); 3524 } 3525 tcg_swap_vecop_list(hold_list); 3526 3527 if (oprsz < maxsz) { 3528 expand_clr(dofs + oprsz, maxsz - oprsz); 3529 } 3530 } 3531 3532 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3533 { 3534 TCGv_i64 t = tcg_temp_new_i64(); 3535 3536 tcg_gen_and_i64(t, b, a); 3537 tcg_gen_andc_i64(d, c, a); 3538 tcg_gen_or_i64(d, d, t); 3539 tcg_temp_free_i64(t); 3540 } 3541 3542 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3543 uint32_t bofs, uint32_t cofs, 3544 uint32_t oprsz, uint32_t maxsz) 3545 { 3546 static const GVecGen4 g = { 3547 .fni8 = tcg_gen_bitsel_i64, 3548 .fniv = tcg_gen_bitsel_vec, 3549 .fno = gen_helper_gvec_bitsel, 3550 }; 3551 3552 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3553 } 3554
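/*
 * Illustrative usage sketch, not part of the original file: a target
 * front end drives these expanders with env offsets of its guest vector
 * registers plus operation/maximum sizes in bytes.  The 16-byte sizes
 * and the wrapper name below are hypothetical placeholders.
 */
static inline void gen_example_gvec_usage(uint32_t dofs, uint32_t aofs,
                                          uint32_t bofs)
{
    /* d[i] = a[i] + b[i] over four 32-bit elements; no tail to clear. */
    tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);

    /* d[i] = (d[i] == b[i]) ? -1 : 0, the usual SIMD comparison result. */
    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32, dofs, dofs, bofs, 16, 16);

    /* Add the same immediate to every element via the dup'd scalar path. */
    tcg_gen_gvec_addi(MO_32, dofs, dofs, 1, 16, 16);
}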