/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "tcg/tcg.h"
#include "tcg/tcg-op.h"
#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
#include "tcg/tcg-gvec-desc.h"

#define MAX_UNROLL 4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules. OFS should be the OR of all
   of the operand offsets so that we can check them all at once. */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands. */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands. */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands. */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components. */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
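
/*
 * Worked example of the encoding above (illustration only, no new
 * functionality): simd_desc(16, 32, 5) stores (16 / 8) - 1 = 1 in the
 * OPRSZ field and (32 / 8) - 1 = 3 in the MAXSZ field, with 5 in the
 * DATA field.  An out-of-line helper recovers the original values with
 * the accessors declared in tcg/tcg-gvec-desc.h:
 *
 *     intptr_t oprsz = simd_oprsz(desc);    yields 16
 *     intptr_t maxsz = simd_maxsz(desc);    yields 32
 *     int32_t  data  = simd_data(desc);     yields 5
 */
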
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
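
/*
 * The helpers passed to the expanders above are plain C functions.
 * A typical gen_helper_gvec_3-style body (a sketch with a made-up
 * helper name, not one that exists in tcg-runtime) unpacks the
 * descriptor, operates on host-sized chunks, and zeros the tail
 * between oprsz and maxsz:
 *
 *     void HELPER(gvec_xyz)(void *d, void *a, void *b, uint32_t desc)
 *     {
 *         intptr_t i, oprsz = simd_oprsz(desc);
 *
 *         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) =
 *                 *(uint64_t *)(a + i) | *(uint64_t *)(b + i);
 *         }
 *         for (; i < simd_maxsz(desc); i += sizeof(uint64_t)) {
 *             *(uint64_t *)(d + i) = 0;
 *         }
 *     }
 */
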
/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
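
/*
 * A minimal sketch of how a target front end uses one of the _ptr
 * expanders, passing its float_status alongside the vector operands.
 * CPUFooState and gen_helper_gvec_fadd_foo are placeholders, not real
 * QEMU symbols:
 *
 *     TCGv_ptr fpst = tcg_temp_new_ptr();
 *
 *     tcg_gen_addi_ptr(fpst, cpu_env, offsetof(CPUFooState, fp_status));
 *     tcg_gen_gvec_3_ptr(dofs, aofs, bofs, fpst, oprsz, maxsz, 0,
 *                        gen_helper_gvec_fadd_foo);
 *     tcg_temp_free_ptr(fpst);
 */
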
/* Generate a call to a gvec-style helper with five vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_5_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, eofs);

    fn(a0, a1, a2, a3, a4, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ. This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE. */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes. If OP is 0, assume that the real operation to be performed is
 * required by all backends. Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type. Do not select V64 if
 * PREFER_I64 is true. Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16. The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
412 */ 413 if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) 414 && (size % 32 == 0 415 || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) { 416 return TCG_TYPE_V256; 417 } 418 } 419 if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16) 420 && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) { 421 return TCG_TYPE_V128; 422 } 423 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 424 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 425 return TCG_TYPE_V64; 426 } 427 return 0; 428 } 429 430 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 431 uint32_t maxsz, TCGv_vec t_vec) 432 { 433 uint32_t i = 0; 434 435 switch (type) { 436 case TCG_TYPE_V256: 437 /* 438 * Recall that ARM SVE allows vector sizes that are not a 439 * power of 2, but always a multiple of 16. The intent is 440 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 441 */ 442 for (; i + 32 <= oprsz; i += 32) { 443 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 444 } 445 /* fallthru */ 446 case TCG_TYPE_V128: 447 for (; i + 16 <= oprsz; i += 16) { 448 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 449 } 450 break; 451 case TCG_TYPE_V64: 452 for (; i < oprsz; i += 8) { 453 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 454 } 455 break; 456 default: 457 g_assert_not_reached(); 458 } 459 460 if (oprsz < maxsz) { 461 expand_clr(dofs + oprsz, maxsz - oprsz); 462 } 463 } 464 465 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 466 * Only one of IN_32 or IN_64 may be set; 467 * IN_C is used if IN_32 and IN_64 are unset. 468 */ 469 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 470 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 471 uint64_t in_c) 472 { 473 TCGType type; 474 TCGv_i64 t_64; 475 TCGv_i32 t_32, t_desc; 476 TCGv_ptr t_ptr; 477 uint32_t i; 478 479 assert(vece <= (in_32 ? MO_32 : MO_64)); 480 assert(in_32 == NULL || in_64 == NULL); 481 482 /* If we're storing 0, expand oprsz to maxsz. */ 483 if (in_32 == NULL && in_64 == NULL) { 484 in_c = dup_const(vece, in_c); 485 if (in_c == 0) { 486 oprsz = maxsz; 487 } 488 } 489 490 /* Implement inline with a vector type, if possible. 491 * Prefer integer when 64-bit host and no variable dup. 492 */ 493 type = choose_vector_type(NULL, vece, oprsz, 494 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 495 && (in_64 == NULL || vece == MO_64))); 496 if (type != 0) { 497 TCGv_vec t_vec = tcg_temp_new_vec(type); 498 499 if (in_32) { 500 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 501 } else if (in_64) { 502 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 503 } else { 504 tcg_gen_dupi_vec(vece, t_vec, in_c); 505 } 506 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 507 tcg_temp_free_vec(t_vec); 508 return; 509 } 510 511 /* Otherwise, inline with an integer type, unless "large". */ 512 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 513 t_64 = NULL; 514 t_32 = NULL; 515 516 if (in_32) { 517 /* We are given a 32-bit variable input. For a 64-bit host, 518 use a 64-bit operation unless the 32-bit operation would 519 be simple enough. */ 520 if (TCG_TARGET_REG_BITS == 64 521 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 522 t_64 = tcg_temp_new_i64(); 523 tcg_gen_extu_i32_i64(t_64, in_32); 524 gen_dup_i64(vece, t_64, t_64); 525 } else { 526 t_32 = tcg_temp_new_i32(); 527 gen_dup_i32(vece, t_32, in_32); 528 } 529 } else if (in_64) { 530 /* We are given a 64-bit variable input. 
*/ 531 t_64 = tcg_temp_new_i64(); 532 gen_dup_i64(vece, t_64, in_64); 533 } else { 534 /* We are given a constant input. */ 535 /* For 64-bit hosts, use 64-bit constants for "simple" constants 536 or when we'd need too many 32-bit stores, or when a 64-bit 537 constant is really required. */ 538 if (vece == MO_64 539 || (TCG_TARGET_REG_BITS == 64 540 && (in_c == 0 || in_c == -1 541 || !check_size_impl(oprsz, 4)))) { 542 t_64 = tcg_const_i64(in_c); 543 } else { 544 t_32 = tcg_const_i32(in_c); 545 } 546 } 547 548 /* Implement inline if we picked an implementation size above. */ 549 if (t_32) { 550 for (i = 0; i < oprsz; i += 4) { 551 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 552 } 553 tcg_temp_free_i32(t_32); 554 goto done; 555 } 556 if (t_64) { 557 for (i = 0; i < oprsz; i += 8) { 558 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 559 } 560 tcg_temp_free_i64(t_64); 561 goto done; 562 } 563 } 564 565 /* Otherwise implement out of line. */ 566 t_ptr = tcg_temp_new_ptr(); 567 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 568 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 569 570 if (vece == MO_64) { 571 if (in_64) { 572 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 573 } else { 574 t_64 = tcg_const_i64(in_c); 575 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 576 tcg_temp_free_i64(t_64); 577 } 578 } else { 579 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 580 static dup_fn * const fns[3] = { 581 gen_helper_gvec_dup8, 582 gen_helper_gvec_dup16, 583 gen_helper_gvec_dup32 584 }; 585 586 if (in_32) { 587 fns[vece](t_ptr, t_desc, in_32); 588 } else { 589 t_32 = tcg_temp_new_i32(); 590 if (in_64) { 591 tcg_gen_extrl_i64_i32(t_32, in_64); 592 } else if (vece == MO_8) { 593 tcg_gen_movi_i32(t_32, in_c & 0xff); 594 } else if (vece == MO_16) { 595 tcg_gen_movi_i32(t_32, in_c & 0xffff); 596 } else { 597 tcg_gen_movi_i32(t_32, in_c); 598 } 599 fns[vece](t_ptr, t_desc, t_32); 600 tcg_temp_free_i32(t_32); 601 } 602 } 603 604 tcg_temp_free_ptr(t_ptr); 605 tcg_temp_free_i32(t_desc); 606 return; 607 608 done: 609 if (oprsz < maxsz) { 610 expand_clr(dofs + oprsz, maxsz - oprsz); 611 } 612 } 613 614 /* Likewise, but with zero. */ 615 static void expand_clr(uint32_t dofs, uint32_t maxsz) 616 { 617 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 618 } 619 620 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. 
*/
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}
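
/*
 * As an illustration, expand_3_i32(dofs, aofs, bofs, 16, false,
 * tcg_gen_xor_i32) emits the equivalent of the following straight-line
 * sequence once per 4-byte lane (i = 0, 4, 8, 12):
 *
 *     tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 *     tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 *     tcg_gen_xor_i32(t2, t0, t1);
 *     tcg_gen_st_i32(t2, cpu_env, dofs + i);
 *
 * check_size_impl() caps the number of such inline iterations at
 * MAX_UNROLL; larger operations use wider units or the out-of-line
 * helper instead.
 */
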
/* Expand OPSZ bytes worth of four-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}
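
/*
 * The four-operand callbacks above receive (dest, a, b, c) as value
 * temporaries and leave their result in dest.  For illustration, a
 * hypothetical .fni4 implementing a 32-bit multiply-accumulate
 * (dest = a + b * c) would be:
 *
 *     static void gen_mla32_example(TCGv_i32 d, TCGv_i32 a,
 *                                   TCGv_i32 b, TCGv_i32 c)
 *     {
 *         tcg_gen_mul_i32(d, b, c);
 *         tcg_gen_add_i32(d, d, a);
 *     }
 *
 * When write_aofs is set, whatever the callback leaves in the A
 * temporary is also stored back to aofs, which lets an expansion
 * update a second output (e.g. an accumulated flag) in the same pass.
 */
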
/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.
*/ 898 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 899 uint32_t oprsz, uint32_t tysz, TCGType type, 900 int64_t c, bool load_dest, 901 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 902 { 903 TCGv_vec t0 = tcg_temp_new_vec(type); 904 TCGv_vec t1 = tcg_temp_new_vec(type); 905 uint32_t i; 906 907 for (i = 0; i < oprsz; i += tysz) { 908 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 909 if (load_dest) { 910 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 911 } 912 fni(vece, t1, t0, c); 913 tcg_gen_st_vec(t1, cpu_env, dofs + i); 914 } 915 tcg_temp_free_vec(t0); 916 tcg_temp_free_vec(t1); 917 } 918 919 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 920 uint32_t oprsz, uint32_t tysz, TCGType type, 921 TCGv_vec c, bool scalar_first, 922 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 923 { 924 TCGv_vec t0 = tcg_temp_new_vec(type); 925 TCGv_vec t1 = tcg_temp_new_vec(type); 926 uint32_t i; 927 928 for (i = 0; i < oprsz; i += tysz) { 929 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 930 if (scalar_first) { 931 fni(vece, t1, c, t0); 932 } else { 933 fni(vece, t1, t0, c); 934 } 935 tcg_gen_st_vec(t1, cpu_env, dofs + i); 936 } 937 tcg_temp_free_vec(t0); 938 tcg_temp_free_vec(t1); 939 } 940 941 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 942 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 943 uint32_t bofs, uint32_t oprsz, 944 uint32_t tysz, TCGType type, bool load_dest, 945 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 946 { 947 TCGv_vec t0 = tcg_temp_new_vec(type); 948 TCGv_vec t1 = tcg_temp_new_vec(type); 949 TCGv_vec t2 = tcg_temp_new_vec(type); 950 uint32_t i; 951 952 for (i = 0; i < oprsz; i += tysz) { 953 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 954 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 955 if (load_dest) { 956 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 957 } 958 fni(vece, t2, t0, t1); 959 tcg_gen_st_vec(t2, cpu_env, dofs + i); 960 } 961 tcg_temp_free_vec(t2); 962 tcg_temp_free_vec(t1); 963 tcg_temp_free_vec(t0); 964 } 965 966 /* 967 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 968 * using host vectors. 969 */ 970 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 971 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 972 TCGType type, int64_t c, bool load_dest, 973 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 974 int64_t)) 975 { 976 TCGv_vec t0 = tcg_temp_new_vec(type); 977 TCGv_vec t1 = tcg_temp_new_vec(type); 978 TCGv_vec t2 = tcg_temp_new_vec(type); 979 uint32_t i; 980 981 for (i = 0; i < oprsz; i += tysz) { 982 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 983 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 984 if (load_dest) { 985 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 986 } 987 fni(vece, t2, t0, t1, c); 988 tcg_gen_st_vec(t2, cpu_env, dofs + i); 989 } 990 tcg_temp_free_vec(t0); 991 tcg_temp_free_vec(t1); 992 tcg_temp_free_vec(t2); 993 } 994 995 /* Expand OPSZ bytes worth of four-operand operations using host vectors. 
*/ 996 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 997 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 998 uint32_t tysz, TCGType type, bool write_aofs, 999 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 1000 TCGv_vec, TCGv_vec)) 1001 { 1002 TCGv_vec t0 = tcg_temp_new_vec(type); 1003 TCGv_vec t1 = tcg_temp_new_vec(type); 1004 TCGv_vec t2 = tcg_temp_new_vec(type); 1005 TCGv_vec t3 = tcg_temp_new_vec(type); 1006 uint32_t i; 1007 1008 for (i = 0; i < oprsz; i += tysz) { 1009 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 1010 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 1011 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 1012 fni(vece, t0, t1, t2, t3); 1013 tcg_gen_st_vec(t0, cpu_env, dofs + i); 1014 if (write_aofs) { 1015 tcg_gen_st_vec(t1, cpu_env, aofs + i); 1016 } 1017 } 1018 tcg_temp_free_vec(t3); 1019 tcg_temp_free_vec(t2); 1020 tcg_temp_free_vec(t1); 1021 tcg_temp_free_vec(t0); 1022 } 1023 1024 /* Expand a vector two-operand operation. */ 1025 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 1026 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 1027 { 1028 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1029 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1030 TCGType type; 1031 uint32_t some; 1032 1033 check_size_align(oprsz, maxsz, dofs | aofs); 1034 check_overlap_2(dofs, aofs, maxsz); 1035 1036 type = 0; 1037 if (g->fniv) { 1038 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1039 } 1040 switch (type) { 1041 case TCG_TYPE_V256: 1042 /* Recall that ARM SVE allows vector sizes that are not a 1043 * power of 2, but always a multiple of 16. The intent is 1044 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1045 */ 1046 some = QEMU_ALIGN_DOWN(oprsz, 32); 1047 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); 1048 if (some == oprsz) { 1049 break; 1050 } 1051 dofs += some; 1052 aofs += some; 1053 oprsz -= some; 1054 maxsz -= some; 1055 /* fallthru */ 1056 case TCG_TYPE_V128: 1057 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); 1058 break; 1059 case TCG_TYPE_V64: 1060 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); 1061 break; 1062 1063 case 0: 1064 if (g->fni8 && check_size_impl(oprsz, 8)) { 1065 expand_2_i64(dofs, aofs, oprsz, g->fni8); 1066 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1067 expand_2_i32(dofs, aofs, oprsz, g->fni4); 1068 } else { 1069 assert(g->fno != NULL); 1070 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1071 oprsz = maxsz; 1072 } 1073 break; 1074 1075 default: 1076 g_assert_not_reached(); 1077 } 1078 tcg_swap_vecop_list(hold_list); 1079 1080 if (oprsz < maxsz) { 1081 expand_clr(dofs + oprsz, maxsz - oprsz); 1082 } 1083 } 1084 1085 /* Expand a vector operation with two vectors and an immediate. */ 1086 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1087 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1088 { 1089 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1090 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1091 TCGType type; 1092 uint32_t some; 1093 1094 check_size_align(oprsz, maxsz, dofs | aofs); 1095 check_overlap_2(dofs, aofs, maxsz); 1096 1097 type = 0; 1098 if (g->fniv) { 1099 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1100 } 1101 switch (type) { 1102 case TCG_TYPE_V256: 1103 /* Recall that ARM SVE allows vector sizes that are not a 1104 * power of 2, but always a multiple of 16. The intent is 1105 * that e.g. 
size == 80 would be expanded with 2x32 + 1x16. 1106 */ 1107 some = QEMU_ALIGN_DOWN(oprsz, 32); 1108 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1109 c, g->load_dest, g->fniv); 1110 if (some == oprsz) { 1111 break; 1112 } 1113 dofs += some; 1114 aofs += some; 1115 oprsz -= some; 1116 maxsz -= some; 1117 /* fallthru */ 1118 case TCG_TYPE_V128: 1119 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1120 c, g->load_dest, g->fniv); 1121 break; 1122 case TCG_TYPE_V64: 1123 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1124 c, g->load_dest, g->fniv); 1125 break; 1126 1127 case 0: 1128 if (g->fni8 && check_size_impl(oprsz, 8)) { 1129 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1130 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1131 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1132 } else { 1133 if (g->fno) { 1134 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1135 } else { 1136 TCGv_i64 tcg_c = tcg_const_i64(c); 1137 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1138 maxsz, c, g->fnoi); 1139 tcg_temp_free_i64(tcg_c); 1140 } 1141 oprsz = maxsz; 1142 } 1143 break; 1144 1145 default: 1146 g_assert_not_reached(); 1147 } 1148 tcg_swap_vecop_list(hold_list); 1149 1150 if (oprsz < maxsz) { 1151 expand_clr(dofs + oprsz, maxsz - oprsz); 1152 } 1153 } 1154 1155 /* Expand a vector operation with two vectors and a scalar. */ 1156 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1157 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1158 { 1159 TCGType type; 1160 1161 check_size_align(oprsz, maxsz, dofs | aofs); 1162 check_overlap_2(dofs, aofs, maxsz); 1163 1164 type = 0; 1165 if (g->fniv) { 1166 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1167 } 1168 if (type != 0) { 1169 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1170 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1171 TCGv_vec t_vec = tcg_temp_new_vec(type); 1172 uint32_t some; 1173 1174 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1175 1176 switch (type) { 1177 case TCG_TYPE_V256: 1178 /* Recall that ARM SVE allows vector sizes that are not a 1179 * power of 2, but always a multiple of 16. The intent is 1180 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1181 */ 1182 some = QEMU_ALIGN_DOWN(oprsz, 32); 1183 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1184 t_vec, g->scalar_first, g->fniv); 1185 if (some == oprsz) { 1186 break; 1187 } 1188 dofs += some; 1189 aofs += some; 1190 oprsz -= some; 1191 maxsz -= some; 1192 /* fallthru */ 1193 1194 case TCG_TYPE_V128: 1195 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1196 t_vec, g->scalar_first, g->fniv); 1197 break; 1198 1199 case TCG_TYPE_V64: 1200 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1201 t_vec, g->scalar_first, g->fniv); 1202 break; 1203 1204 default: 1205 g_assert_not_reached(); 1206 } 1207 tcg_temp_free_vec(t_vec); 1208 tcg_swap_vecop_list(hold_list); 1209 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1210 TCGv_i64 t64 = tcg_temp_new_i64(); 1211 1212 gen_dup_i64(g->vece, t64, c); 1213 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1214 tcg_temp_free_i64(t64); 1215 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1216 TCGv_i32 t32 = tcg_temp_new_i32(); 1217 1218 tcg_gen_extrl_i64_i32(t32, c); 1219 gen_dup_i32(g->vece, t32, t32); 1220 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1221 tcg_temp_free_i32(t32); 1222 } else { 1223 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1224 return; 1225 } 1226 1227 if (oprsz < maxsz) { 1228 expand_clr(dofs + oprsz, maxsz - oprsz); 1229 } 1230 } 1231 1232 /* Expand a vector three-operand operation. */ 1233 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1234 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1235 { 1236 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1237 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1238 TCGType type; 1239 uint32_t some; 1240 1241 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1242 check_overlap_3(dofs, aofs, bofs, maxsz); 1243 1244 type = 0; 1245 if (g->fniv) { 1246 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1247 } 1248 switch (type) { 1249 case TCG_TYPE_V256: 1250 /* Recall that ARM SVE allows vector sizes that are not a 1251 * power of 2, but always a multiple of 16. The intent is 1252 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1253 */ 1254 some = QEMU_ALIGN_DOWN(oprsz, 32); 1255 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1256 g->load_dest, g->fniv); 1257 if (some == oprsz) { 1258 break; 1259 } 1260 dofs += some; 1261 aofs += some; 1262 bofs += some; 1263 oprsz -= some; 1264 maxsz -= some; 1265 /* fallthru */ 1266 case TCG_TYPE_V128: 1267 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1268 g->load_dest, g->fniv); 1269 break; 1270 case TCG_TYPE_V64: 1271 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1272 g->load_dest, g->fniv); 1273 break; 1274 1275 case 0: 1276 if (g->fni8 && check_size_impl(oprsz, 8)) { 1277 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1278 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1279 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1280 } else { 1281 assert(g->fno != NULL); 1282 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1283 maxsz, g->data, g->fno); 1284 oprsz = maxsz; 1285 } 1286 break; 1287 1288 default: 1289 g_assert_not_reached(); 1290 } 1291 tcg_swap_vecop_list(hold_list); 1292 1293 if (oprsz < maxsz) { 1294 expand_clr(dofs + oprsz, maxsz - oprsz); 1295 } 1296 } 1297 1298 /* Expand a vector operation with three vectors and an immediate. 
*/ 1299 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1300 uint32_t oprsz, uint32_t maxsz, int64_t c, 1301 const GVecGen3i *g) 1302 { 1303 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1304 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1305 TCGType type; 1306 uint32_t some; 1307 1308 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1309 check_overlap_3(dofs, aofs, bofs, maxsz); 1310 1311 type = 0; 1312 if (g->fniv) { 1313 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1314 } 1315 switch (type) { 1316 case TCG_TYPE_V256: 1317 /* 1318 * Recall that ARM SVE allows vector sizes that are not a 1319 * power of 2, but always a multiple of 16. The intent is 1320 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1321 */ 1322 some = QEMU_ALIGN_DOWN(oprsz, 32); 1323 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1324 c, g->load_dest, g->fniv); 1325 if (some == oprsz) { 1326 break; 1327 } 1328 dofs += some; 1329 aofs += some; 1330 bofs += some; 1331 oprsz -= some; 1332 maxsz -= some; 1333 /* fallthru */ 1334 case TCG_TYPE_V128: 1335 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1336 c, g->load_dest, g->fniv); 1337 break; 1338 case TCG_TYPE_V64: 1339 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1340 c, g->load_dest, g->fniv); 1341 break; 1342 1343 case 0: 1344 if (g->fni8 && check_size_impl(oprsz, 8)) { 1345 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1346 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1347 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1348 } else { 1349 assert(g->fno != NULL); 1350 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1351 oprsz = maxsz; 1352 } 1353 break; 1354 1355 default: 1356 g_assert_not_reached(); 1357 } 1358 tcg_swap_vecop_list(hold_list); 1359 1360 if (oprsz < maxsz) { 1361 expand_clr(dofs + oprsz, maxsz - oprsz); 1362 } 1363 } 1364 1365 /* Expand a vector four-operand operation. */ 1366 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1367 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1368 { 1369 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1370 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1371 TCGType type; 1372 uint32_t some; 1373 1374 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1375 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1376 1377 type = 0; 1378 if (g->fniv) { 1379 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1380 } 1381 switch (type) { 1382 case TCG_TYPE_V256: 1383 /* Recall that ARM SVE allows vector sizes that are not a 1384 * power of 2, but always a multiple of 16. The intent is 1385 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1386 */ 1387 some = QEMU_ALIGN_DOWN(oprsz, 32); 1388 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1389 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1390 if (some == oprsz) { 1391 break; 1392 } 1393 dofs += some; 1394 aofs += some; 1395 bofs += some; 1396 cofs += some; 1397 oprsz -= some; 1398 maxsz -= some; 1399 /* fallthru */ 1400 case TCG_TYPE_V128: 1401 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1402 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1403 break; 1404 case TCG_TYPE_V64: 1405 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1406 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1407 break; 1408 1409 case 0: 1410 if (g->fni8 && check_size_impl(oprsz, 8)) { 1411 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1412 g->write_aofs, g->fni8); 1413 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1414 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1415 g->write_aofs, g->fni4); 1416 } else { 1417 assert(g->fno != NULL); 1418 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1419 oprsz, maxsz, g->data, g->fno); 1420 oprsz = maxsz; 1421 } 1422 break; 1423 1424 default: 1425 g_assert_not_reached(); 1426 } 1427 tcg_swap_vecop_list(hold_list); 1428 1429 if (oprsz < maxsz) { 1430 expand_clr(dofs + oprsz, maxsz - oprsz); 1431 } 1432 } 1433 1434 /* 1435 * Expand specific vector operations. 1436 */ 1437 1438 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1439 { 1440 tcg_gen_mov_vec(a, b); 1441 } 1442 1443 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1444 uint32_t oprsz, uint32_t maxsz) 1445 { 1446 static const GVecGen2 g = { 1447 .fni8 = tcg_gen_mov_i64, 1448 .fniv = vec_mov2, 1449 .fno = gen_helper_gvec_mov, 1450 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1451 }; 1452 if (dofs != aofs) { 1453 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1454 } else { 1455 check_size_align(oprsz, maxsz, dofs); 1456 if (oprsz < maxsz) { 1457 expand_clr(dofs + oprsz, maxsz - oprsz); 1458 } 1459 } 1460 } 1461 1462 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1463 uint32_t maxsz, TCGv_i32 in) 1464 { 1465 check_size_align(oprsz, maxsz, dofs); 1466 tcg_debug_assert(vece <= MO_32); 1467 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1468 } 1469 1470 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1471 uint32_t maxsz, TCGv_i64 in) 1472 { 1473 check_size_align(oprsz, maxsz, dofs); 1474 tcg_debug_assert(vece <= MO_64); 1475 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1476 } 1477 1478 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1479 uint32_t oprsz, uint32_t maxsz) 1480 { 1481 check_size_align(oprsz, maxsz, dofs); 1482 if (vece <= MO_64) { 1483 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1484 if (type != 0) { 1485 TCGv_vec t_vec = tcg_temp_new_vec(type); 1486 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1487 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1488 tcg_temp_free_vec(t_vec); 1489 } else if (vece <= MO_32) { 1490 TCGv_i32 in = tcg_temp_new_i32(); 1491 switch (vece) { 1492 case MO_8: 1493 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1494 break; 1495 case MO_16: 1496 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1497 break; 1498 default: 1499 tcg_gen_ld_i32(in, cpu_env, aofs); 1500 break; 1501 } 1502 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1503 tcg_temp_free_i32(in); 1504 } else { 1505 TCGv_i64 in = tcg_temp_new_i64(); 1506 tcg_gen_ld_i64(in, cpu_env, aofs); 1507 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1508 tcg_temp_free_i64(in); 1509 } 1510 } else { 1511 /* 128-bit duplicate. 
*/ 1512 /* ??? Dup to 256-bit vector. */ 1513 int i; 1514 1515 tcg_debug_assert(vece == 4); 1516 tcg_debug_assert(oprsz >= 16); 1517 if (TCG_TARGET_HAS_v128) { 1518 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1519 1520 tcg_gen_ld_vec(in, cpu_env, aofs); 1521 for (i = 0; i < oprsz; i += 16) { 1522 tcg_gen_st_vec(in, cpu_env, dofs + i); 1523 } 1524 tcg_temp_free_vec(in); 1525 } else { 1526 TCGv_i64 in0 = tcg_temp_new_i64(); 1527 TCGv_i64 in1 = tcg_temp_new_i64(); 1528 1529 tcg_gen_ld_i64(in0, cpu_env, aofs); 1530 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1531 for (i = 0; i < oprsz; i += 16) { 1532 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1533 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1534 } 1535 tcg_temp_free_i64(in0); 1536 tcg_temp_free_i64(in1); 1537 } 1538 if (oprsz < maxsz) { 1539 expand_clr(dofs + oprsz, maxsz - oprsz); 1540 } 1541 } 1542 } 1543 1544 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz, 1545 uint32_t maxsz, uint64_t x) 1546 { 1547 check_size_align(oprsz, maxsz, dofs); 1548 do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x); 1549 } 1550 1551 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz, 1552 uint32_t maxsz, uint32_t x) 1553 { 1554 check_size_align(oprsz, maxsz, dofs); 1555 do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x); 1556 } 1557 1558 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz, 1559 uint32_t maxsz, uint16_t x) 1560 { 1561 check_size_align(oprsz, maxsz, dofs); 1562 do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x); 1563 } 1564 1565 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz, 1566 uint32_t maxsz, uint8_t x) 1567 { 1568 check_size_align(oprsz, maxsz, dofs); 1569 do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x); 1570 } 1571 1572 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1573 uint32_t oprsz, uint32_t maxsz) 1574 { 1575 static const GVecGen2 g = { 1576 .fni8 = tcg_gen_not_i64, 1577 .fniv = tcg_gen_not_vec, 1578 .fno = gen_helper_gvec_not, 1579 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1580 }; 1581 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1582 } 1583 1584 /* Perform a vector addition using normal addition and a mask. The mask 1585 should be the sign bit of each lane. This 6-operation form is more 1586 efficient than separate additions when there are 4 or more lanes in 1587 the 64-bit operation. 
*/ 1588 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1589 { 1590 TCGv_i64 t1 = tcg_temp_new_i64(); 1591 TCGv_i64 t2 = tcg_temp_new_i64(); 1592 TCGv_i64 t3 = tcg_temp_new_i64(); 1593 1594 tcg_gen_andc_i64(t1, a, m); 1595 tcg_gen_andc_i64(t2, b, m); 1596 tcg_gen_xor_i64(t3, a, b); 1597 tcg_gen_add_i64(d, t1, t2); 1598 tcg_gen_and_i64(t3, t3, m); 1599 tcg_gen_xor_i64(d, d, t3); 1600 1601 tcg_temp_free_i64(t1); 1602 tcg_temp_free_i64(t2); 1603 tcg_temp_free_i64(t3); 1604 } 1605 1606 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1607 { 1608 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1609 gen_addv_mask(d, a, b, m); 1610 tcg_temp_free_i64(m); 1611 } 1612 1613 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1614 { 1615 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1616 gen_addv_mask(d, a, b, m); 1617 tcg_temp_free_i64(m); 1618 } 1619 1620 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1621 { 1622 TCGv_i64 t1 = tcg_temp_new_i64(); 1623 TCGv_i64 t2 = tcg_temp_new_i64(); 1624 1625 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1626 tcg_gen_add_i64(t2, a, b); 1627 tcg_gen_add_i64(t1, t1, b); 1628 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1629 1630 tcg_temp_free_i64(t1); 1631 tcg_temp_free_i64(t2); 1632 } 1633 1634 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1635 1636 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1637 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1638 { 1639 static const GVecGen3 g[4] = { 1640 { .fni8 = tcg_gen_vec_add8_i64, 1641 .fniv = tcg_gen_add_vec, 1642 .fno = gen_helper_gvec_add8, 1643 .opt_opc = vecop_list_add, 1644 .vece = MO_8 }, 1645 { .fni8 = tcg_gen_vec_add16_i64, 1646 .fniv = tcg_gen_add_vec, 1647 .fno = gen_helper_gvec_add16, 1648 .opt_opc = vecop_list_add, 1649 .vece = MO_16 }, 1650 { .fni4 = tcg_gen_add_i32, 1651 .fniv = tcg_gen_add_vec, 1652 .fno = gen_helper_gvec_add32, 1653 .opt_opc = vecop_list_add, 1654 .vece = MO_32 }, 1655 { .fni8 = tcg_gen_add_i64, 1656 .fniv = tcg_gen_add_vec, 1657 .fno = gen_helper_gvec_add64, 1658 .opt_opc = vecop_list_add, 1659 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1660 .vece = MO_64 }, 1661 }; 1662 1663 tcg_debug_assert(vece <= MO_64); 1664 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1665 } 1666 1667 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1668 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1669 { 1670 static const GVecGen2s g[4] = { 1671 { .fni8 = tcg_gen_vec_add8_i64, 1672 .fniv = tcg_gen_add_vec, 1673 .fno = gen_helper_gvec_adds8, 1674 .opt_opc = vecop_list_add, 1675 .vece = MO_8 }, 1676 { .fni8 = tcg_gen_vec_add16_i64, 1677 .fniv = tcg_gen_add_vec, 1678 .fno = gen_helper_gvec_adds16, 1679 .opt_opc = vecop_list_add, 1680 .vece = MO_16 }, 1681 { .fni4 = tcg_gen_add_i32, 1682 .fniv = tcg_gen_add_vec, 1683 .fno = gen_helper_gvec_adds32, 1684 .opt_opc = vecop_list_add, 1685 .vece = MO_32 }, 1686 { .fni8 = tcg_gen_add_i64, 1687 .fniv = tcg_gen_add_vec, 1688 .fno = gen_helper_gvec_adds64, 1689 .opt_opc = vecop_list_add, 1690 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1691 .vece = MO_64 }, 1692 }; 1693 1694 tcg_debug_assert(vece <= MO_64); 1695 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1696 } 1697 1698 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1699 int64_t c, uint32_t oprsz, uint32_t maxsz) 1700 { 1701 TCGv_i64 tmp = tcg_const_i64(c); 1702 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1703 tcg_temp_free_i64(tmp); 1704 } 1705 1706 
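
/*
 * A front end that needs a three-operand operation without a ready-made
 * wrapper can expand it with its own table entry, in the same style as
 * the addition tables above.  A minimal sketch, using XOR as a stand-in
 * for some target-specific operation; gen_helper_gvec_foo is a
 * placeholder for an out-of-line helper the target would provide:
 *
 *     static const TCGOpcode vecop_list_foo[] = { INDEX_op_xor_vec, 0 };
 *
 *     static const GVecGen3 gop_foo = {
 *         .fni8 = tcg_gen_xor_i64,
 *         .fniv = tcg_gen_xor_vec,
 *         .fno = gen_helper_gvec_foo,
 *         .opt_opc = vecop_list_foo,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *         .vece = MO_64,
 *     };
 *
 *     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &gop_foo);
 *
 * tcg_gen_gvec_3 picks .fniv when the backend supports a suitable vector
 * type, falls back to .fni8/.fni4 for inline integer expansion, and
 * otherwise emits a call to .fno.
 */
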
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 1707 1708 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1709 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1710 { 1711 static const GVecGen2s g[4] = { 1712 { .fni8 = tcg_gen_vec_sub8_i64, 1713 .fniv = tcg_gen_sub_vec, 1714 .fno = gen_helper_gvec_subs8, 1715 .opt_opc = vecop_list_sub, 1716 .vece = MO_8 }, 1717 { .fni8 = tcg_gen_vec_sub16_i64, 1718 .fniv = tcg_gen_sub_vec, 1719 .fno = gen_helper_gvec_subs16, 1720 .opt_opc = vecop_list_sub, 1721 .vece = MO_16 }, 1722 { .fni4 = tcg_gen_sub_i32, 1723 .fniv = tcg_gen_sub_vec, 1724 .fno = gen_helper_gvec_subs32, 1725 .opt_opc = vecop_list_sub, 1726 .vece = MO_32 }, 1727 { .fni8 = tcg_gen_sub_i64, 1728 .fniv = tcg_gen_sub_vec, 1729 .fno = gen_helper_gvec_subs64, 1730 .opt_opc = vecop_list_sub, 1731 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1732 .vece = MO_64 }, 1733 }; 1734 1735 tcg_debug_assert(vece <= MO_64); 1736 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1737 } 1738 1739 /* Perform a vector subtraction using normal subtraction and a mask. 1740 Compare gen_addv_mask above. */ 1741 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1742 { 1743 TCGv_i64 t1 = tcg_temp_new_i64(); 1744 TCGv_i64 t2 = tcg_temp_new_i64(); 1745 TCGv_i64 t3 = tcg_temp_new_i64(); 1746 1747 tcg_gen_or_i64(t1, a, m); 1748 tcg_gen_andc_i64(t2, b, m); 1749 tcg_gen_eqv_i64(t3, a, b); 1750 tcg_gen_sub_i64(d, t1, t2); 1751 tcg_gen_and_i64(t3, t3, m); 1752 tcg_gen_xor_i64(d, d, t3); 1753 1754 tcg_temp_free_i64(t1); 1755 tcg_temp_free_i64(t2); 1756 tcg_temp_free_i64(t3); 1757 } 1758 1759 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1760 { 1761 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1762 gen_subv_mask(d, a, b, m); 1763 tcg_temp_free_i64(m); 1764 } 1765 1766 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1767 { 1768 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1769 gen_subv_mask(d, a, b, m); 1770 tcg_temp_free_i64(m); 1771 } 1772 1773 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1774 { 1775 TCGv_i64 t1 = tcg_temp_new_i64(); 1776 TCGv_i64 t2 = tcg_temp_new_i64(); 1777 1778 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1779 tcg_gen_sub_i64(t2, a, b); 1780 tcg_gen_sub_i64(t1, a, t1); 1781 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1782 1783 tcg_temp_free_i64(t1); 1784 tcg_temp_free_i64(t2); 1785 } 1786 1787 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1788 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1789 { 1790 static const GVecGen3 g[4] = { 1791 { .fni8 = tcg_gen_vec_sub8_i64, 1792 .fniv = tcg_gen_sub_vec, 1793 .fno = gen_helper_gvec_sub8, 1794 .opt_opc = vecop_list_sub, 1795 .vece = MO_8 }, 1796 { .fni8 = tcg_gen_vec_sub16_i64, 1797 .fniv = tcg_gen_sub_vec, 1798 .fno = gen_helper_gvec_sub16, 1799 .opt_opc = vecop_list_sub, 1800 .vece = MO_16 }, 1801 { .fni4 = tcg_gen_sub_i32, 1802 .fniv = tcg_gen_sub_vec, 1803 .fno = gen_helper_gvec_sub32, 1804 .opt_opc = vecop_list_sub, 1805 .vece = MO_32 }, 1806 { .fni8 = tcg_gen_sub_i64, 1807 .fniv = tcg_gen_sub_vec, 1808 .fno = gen_helper_gvec_sub64, 1809 .opt_opc = vecop_list_sub, 1810 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1811 .vece = MO_64 }, 1812 }; 1813 1814 tcg_debug_assert(vece <= MO_64); 1815 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1816 } 1817 1818 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 1819 1820 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1821 uint32_t bofs, 
uint32_t oprsz, uint32_t maxsz) 1822 { 1823 static const GVecGen3 g[4] = { 1824 { .fniv = tcg_gen_mul_vec, 1825 .fno = gen_helper_gvec_mul8, 1826 .opt_opc = vecop_list_mul, 1827 .vece = MO_8 }, 1828 { .fniv = tcg_gen_mul_vec, 1829 .fno = gen_helper_gvec_mul16, 1830 .opt_opc = vecop_list_mul, 1831 .vece = MO_16 }, 1832 { .fni4 = tcg_gen_mul_i32, 1833 .fniv = tcg_gen_mul_vec, 1834 .fno = gen_helper_gvec_mul32, 1835 .opt_opc = vecop_list_mul, 1836 .vece = MO_32 }, 1837 { .fni8 = tcg_gen_mul_i64, 1838 .fniv = tcg_gen_mul_vec, 1839 .fno = gen_helper_gvec_mul64, 1840 .opt_opc = vecop_list_mul, 1841 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1842 .vece = MO_64 }, 1843 }; 1844 1845 tcg_debug_assert(vece <= MO_64); 1846 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1847 } 1848 1849 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1850 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1851 { 1852 static const GVecGen2s g[4] = { 1853 { .fniv = tcg_gen_mul_vec, 1854 .fno = gen_helper_gvec_muls8, 1855 .opt_opc = vecop_list_mul, 1856 .vece = MO_8 }, 1857 { .fniv = tcg_gen_mul_vec, 1858 .fno = gen_helper_gvec_muls16, 1859 .opt_opc = vecop_list_mul, 1860 .vece = MO_16 }, 1861 { .fni4 = tcg_gen_mul_i32, 1862 .fniv = tcg_gen_mul_vec, 1863 .fno = gen_helper_gvec_muls32, 1864 .opt_opc = vecop_list_mul, 1865 .vece = MO_32 }, 1866 { .fni8 = tcg_gen_mul_i64, 1867 .fniv = tcg_gen_mul_vec, 1868 .fno = gen_helper_gvec_muls64, 1869 .opt_opc = vecop_list_mul, 1870 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1871 .vece = MO_64 }, 1872 }; 1873 1874 tcg_debug_assert(vece <= MO_64); 1875 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1876 } 1877 1878 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1879 int64_t c, uint32_t oprsz, uint32_t maxsz) 1880 { 1881 TCGv_i64 tmp = tcg_const_i64(c); 1882 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1883 tcg_temp_free_i64(tmp); 1884 } 1885 1886 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1887 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1888 { 1889 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 1890 static const GVecGen3 g[4] = { 1891 { .fniv = tcg_gen_ssadd_vec, 1892 .fno = gen_helper_gvec_ssadd8, 1893 .opt_opc = vecop_list, 1894 .vece = MO_8 }, 1895 { .fniv = tcg_gen_ssadd_vec, 1896 .fno = gen_helper_gvec_ssadd16, 1897 .opt_opc = vecop_list, 1898 .vece = MO_16 }, 1899 { .fniv = tcg_gen_ssadd_vec, 1900 .fno = gen_helper_gvec_ssadd32, 1901 .opt_opc = vecop_list, 1902 .vece = MO_32 }, 1903 { .fniv = tcg_gen_ssadd_vec, 1904 .fno = gen_helper_gvec_ssadd64, 1905 .opt_opc = vecop_list, 1906 .vece = MO_64 }, 1907 }; 1908 tcg_debug_assert(vece <= MO_64); 1909 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1910 } 1911 1912 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 1913 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1914 { 1915 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 1916 static const GVecGen3 g[4] = { 1917 { .fniv = tcg_gen_sssub_vec, 1918 .fno = gen_helper_gvec_sssub8, 1919 .opt_opc = vecop_list, 1920 .vece = MO_8 }, 1921 { .fniv = tcg_gen_sssub_vec, 1922 .fno = gen_helper_gvec_sssub16, 1923 .opt_opc = vecop_list, 1924 .vece = MO_16 }, 1925 { .fniv = tcg_gen_sssub_vec, 1926 .fno = gen_helper_gvec_sssub32, 1927 .opt_opc = vecop_list, 1928 .vece = MO_32 }, 1929 { .fniv = tcg_gen_sssub_vec, 1930 .fno = gen_helper_gvec_sssub64, 1931 .opt_opc = vecop_list, 1932 .vece = MO_64 }, 1933 }; 1934 tcg_debug_assert(vece <= MO_64); 1935 

static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_const_i32(-1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i32(max);
}

static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_const_i64(-1);
    tcg_gen_add_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i64(max);
}

void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_usadd_i32,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_usadd_i64,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_const_i32(0);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i32(min);
}

static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_const_i64(0);
    tcg_gen_sub_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i64(min);
}
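
/*
 * The i32/i64 helpers above implement unsigned saturation with a
 * compare against one operand: for addition, an unsigned overflow has
 * occurred iff the result is less than either addend, in which case
 * the result is forced to all-ones; for subtraction, an underflow has
 * occurred iff A < B, in which case the result is forced to zero.
 */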

void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_ussub_i32,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_ussub_i64,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smin_i32,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smin_i64,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umin_i32,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umin_i64,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smax_i32,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smax_i64,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umax_i32,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umax_i64,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax64,
          .opt_opc = vecop_list,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
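
/*
 * As with the saturating operations, the MO_8 and MO_16 min/max cases
 * above have no integral fallback; only the MO_32 and MO_64 cases can
 * use tcg_gen_{s,u}{min,max}_{i32,i64} when the vector opcode is not
 * available.
 */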

/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above. */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
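
/*
 * This is gen_subv_mask specialized for A == 0: the (A | M) minuend
 * collapses to M, and eqv(A, B) collapses to ~B, giving the andc above.
 */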

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
{
    TCGv_i64 t = tcg_temp_new_i64();
    int nbit = 8 << vece;

    /* Create -1 for each negative element. */
    tcg_gen_shri_i64(t, b, nbit - 1);
    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);

    /*
     * Invert (via xor -1) and add one (via sub -1).
     * Because of the ordering the msb is cleared,
     * so we never have carry into the next element.
     */
    tcg_gen_xor_i64(d, b, t);
    tcg_gen_sub_i64(d, d, t);

    tcg_temp_free_i64(t);
}

static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_8);
}

static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_16);
}

void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_abs8_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_abs16_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_abs_i32,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_abs_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_nand_i64,
        .fniv = tcg_gen_nand_vec,
        .fno = gen_helper_gvec_nand,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_nor_i64,
        .fniv = tcg_gen_nor_vec,
        .fno = gen_helper_gvec_nor,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_eqv_i64,
        .fniv = tcg_gen_eqv_vec,
        .fno = gen_helper_gvec_eqv,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}
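
/*
 * For the logical operations above, aliased inputs (aofs == bofs)
 * reduce to simple identities: A & A and A | A are A itself (a move),
 * A ^ A and A & ~A are 0, A | ~A and ~(A ^ A) are -1, and
 * ~(A & A) and ~(A | A) are ~A (a not).
 */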

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
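
/*
 * The 8- and 16-bit immediate shifts above operate on the full 64-bit
 * value and then mask away the bits that crossed an element boundary;
 * e.g. for a left shift by c the bits kept per byte are
 * dup_const(MO_8, 0xff << c).  Arithmetic right shifts additionally
 * need the sign bit replicated into the vacated positions, as done
 * below.
 */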

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
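
/*
 * In tcg_gen_vec_sar{8,16}i_i64 above, the isolated sign bits sit at
 * bit (element_bits - 1 - c) of each element; multiplying by
 * (2 << c) - 2, i.e. c one bits shifted left by one, smears each sign
 * bit across the c vacated positions above it without spilling into
 * the neighbouring element.
 */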

/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */

typedef struct {
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    gen_helper_gvec_2 *fno[4];
    TCGOpcode s_list[2];
    TCGOpcode v_list[2];
} GVecGen2sh;

static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t oprsz, uint32_t tysz, TCGType type,
                           TCGv_i32 shift,
                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0, shift);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}
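
/*
 * Expand a shift by a scalar held in a TCGv_i32.  Three strategies are
 * tried in order of preference: a vector shift-by-scalar opcode
 * (s_list), a vector shift-by-vector opcode (v_list) with the scalar
 * broadcast into a vector, and finally an integral expansion or an
 * out-of-line helper with the runtime shift count packed into the
 * data field of the descriptor.
 */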

static void
do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
               uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* If the backend has a scalar expansion, great. */
    type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
    if (type) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        switch (type) {
        case TCG_TYPE_V256:
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2sh_vec(vece, dofs, aofs, some, 32,
                           TCG_TYPE_V256, shift, g->fniv_s);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
                           TCG_TYPE_V128, shift, g->fniv_s);
            break;
        case TCG_TYPE_V64:
            expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
                           TCG_TYPE_V64, shift, g->fniv_s);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* If the backend supports variable vector shifts, also cool. */
    type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
    if (type) {
        const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
        TCGv_vec v_shift = tcg_temp_new_vec(type);

        if (vece == MO_64) {
            TCGv_i64 sh64 = tcg_temp_new_i64();
            tcg_gen_extu_i32_i64(sh64, shift);
            tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
            tcg_temp_free_i64(sh64);
        } else {
            tcg_gen_dup_i32_vec(vece, v_shift, shift);
        }

        switch (type) {
        case TCG_TYPE_V256:
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          v_shift, false, g->fniv_v);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */
        case TCG_TYPE_V128:
            expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          v_shift, false, g->fniv_v);
            break;
        case TCG_TYPE_V64:
            expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          v_shift, false, g->fniv_v);
            break;
        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(v_shift);
        tcg_swap_vecop_list(hold_list);
        goto clear_tail;
    }

    /* Otherwise fall back to integral... */
    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        TCGv_i64 sh64 = tcg_temp_new_i64();
        tcg_gen_extu_i32_i64(sh64, shift);
        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
        tcg_temp_free_i64(sh64);
    } else {
        TCGv_ptr a0 = tcg_temp_new_ptr();
        TCGv_ptr a1 = tcg_temp_new_ptr();
        TCGv_i32 desc = tcg_temp_new_i32();

        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
        tcg_gen_addi_ptr(a0, cpu_env, dofs);
        tcg_gen_addi_ptr(a1, cpu_env, aofs);

        g->fno[vece](a0, a1, desc);

        tcg_temp_free_ptr(a0);
        tcg_temp_free_ptr(a1);
        tcg_temp_free_i32(desc);
        return;
    }

 clear_tail:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shl_i32,
        .fni8 = tcg_gen_shl_i64,
        .fniv_s = tcg_gen_shls_vec,
        .fniv_v = tcg_gen_shlv_vec,
        .fno = {
            gen_helper_gvec_shl8i,
            gen_helper_gvec_shl16i,
            gen_helper_gvec_shl32i,
            gen_helper_gvec_shl64i,
        },
        .s_list = { INDEX_op_shls_vec, 0 },
        .v_list = { INDEX_op_shlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shr_i32,
        .fni8 = tcg_gen_shr_i64,
        .fniv_s = tcg_gen_shrs_vec,
        .fniv_v = tcg_gen_shrv_vec,
        .fno = {
            gen_helper_gvec_shr8i,
            gen_helper_gvec_shr16i,
            gen_helper_gvec_shr32i,
            gen_helper_gvec_shr64i,
        },
        .s_list = { INDEX_op_shrs_vec, 0 },
        .v_list = { INDEX_op_shrv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, for vector shifts
 * the modulo must be folded in here.  If the target naturally
 * includes the modulo as part of the operation, great!  If the
 * target has some other behaviour for out-of-range shifts, then it
 * could not use this function anyway, and would need to do its own
 * expansion with custom functions.
 */
static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_shlv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shl_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shl_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shl_mod_i32,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shl_mod_i64,
          .fniv = tcg_gen_shlv_mod_vec,
          .fno = gen_helper_gvec_shl64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for logical right shifts.
 */

static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_shrv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_shr_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_shr_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shr_mod_i32,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shr_mod_i64,
          .fniv = tcg_gen_shrv_mod_vec,
          .fno = gen_helper_gvec_shr64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/*
 * Similarly for arithmetic right shifts.
 */

static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
                                 TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
    tcg_gen_and_vec(vece, t, t, b);
    tcg_gen_sarv_vec(vece, d, a, t);
    tcg_temp_free_vec(t);
}

static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andi_i32(t, b, 31);
    tcg_gen_sar_i32(d, a, t);
    tcg_temp_free_i32(t);
}

static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andi_i64(t, b, 63);
    tcg_gen_sar_i64(d, a, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar8v,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar16v,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sar_mod_i32,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar32v,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sar_mod_i64,
          .fniv = tcg_gen_sarv_mod_vec,
          .fno = gen_helper_gvec_sar64v,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements. */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}
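
/*
 * The integral expansions above widen the 0/1 result of setcond to the
 * all-ones/all-zeros mask expected of a vector comparison by negating
 * it, matching the semantics of tcg_gen_cmp_vec.
 */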

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    const TCGOpcode *hold_list;
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    hold_list = tcg_swap_vecop_list(cmp_list);
    type = choose_vector_type(cmp_list, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, b, a);
    tcg_gen_andc_i64(d, c, a);
    tcg_gen_or_i64(d, d, t);
    tcg_temp_free_i64(t);
}

void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs,
                         uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen4 g = {
        .fni8 = tcg_gen_bitsel_i64,
        .fniv = tcg_gen_bitsel_vec,
        .fno = gen_helper_gvec_bitsel,
    };

    tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
}