1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "tcg.h" 22 #include "tcg-op.h" 23 #include "tcg-op-gvec.h" 24 #include "tcg-gvec-desc.h" 25 26 #define MAX_UNROLL 4 27 28 #ifdef CONFIG_DEBUG_TCG 29 static const TCGOpcode vecop_list_empty[1] = { 0 }; 30 #else 31 #define vecop_list_empty NULL 32 #endif 33 34 35 /* Verify vector size and alignment rules. OFS should be the OR of all 36 of the operand offsets so that we can check them all at once. */ 37 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 38 { 39 uint32_t opr_align = oprsz >= 16 ? 15 : 7; 40 uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; 41 tcg_debug_assert(oprsz > 0); 42 tcg_debug_assert(oprsz <= maxsz); 43 tcg_debug_assert((oprsz & opr_align) == 0); 44 tcg_debug_assert((maxsz & max_align) == 0); 45 tcg_debug_assert((ofs & max_align) == 0); 46 } 47 48 /* Verify vector overlap rules for two operands. */ 49 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 50 { 51 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 52 } 53 54 /* Verify vector overlap rules for three operands. */ 55 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 56 { 57 check_overlap_2(d, a, s); 58 check_overlap_2(d, b, s); 59 check_overlap_2(a, b, s); 60 } 61 62 /* Verify vector overlap rules for four operands. */ 63 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 64 uint32_t c, uint32_t s) 65 { 66 check_overlap_2(d, a, s); 67 check_overlap_2(d, b, s); 68 check_overlap_2(d, c, s); 69 check_overlap_2(a, b, s); 70 check_overlap_2(a, c, s); 71 check_overlap_2(b, c, s); 72 } 73 74 /* Create a descriptor from components. */ 75 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 76 { 77 uint32_t desc = 0; 78 79 assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS)); 80 assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS)); 81 assert(data == sextract32(data, 0, SIMD_DATA_BITS)); 82 83 oprsz = (oprsz / 8) - 1; 84 maxsz = (maxsz / 8) - 1; 85 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 86 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 87 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 88 89 return desc; 90 } 91 92 /* Generate a call to a gvec-style helper with two vector operands. */ 93 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 94 uint32_t oprsz, uint32_t maxsz, int32_t data, 95 gen_helper_gvec_2 *fn) 96 { 97 TCGv_ptr a0, a1; 98 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 99 100 a0 = tcg_temp_new_ptr(); 101 a1 = tcg_temp_new_ptr(); 102 103 tcg_gen_addi_ptr(a0, cpu_env, dofs); 104 tcg_gen_addi_ptr(a1, cpu_env, aofs); 105 106 fn(a0, a1, desc); 107 108 tcg_temp_free_ptr(a0); 109 tcg_temp_free_ptr(a1); 110 tcg_temp_free_i32(desc); 111 } 112 113 /* Generate a call to a gvec-style helper with two vector operands 114 and one scalar operand. */ 115 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, 116 uint32_t oprsz, uint32_t maxsz, int32_t data, 117 gen_helper_gvec_2i *fn) 118 { 119 TCGv_ptr a0, a1; 120 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 121 122 a0 = tcg_temp_new_ptr(); 123 a1 = tcg_temp_new_ptr(); 124 125 tcg_gen_addi_ptr(a0, cpu_env, dofs); 126 tcg_gen_addi_ptr(a1, cpu_env, aofs); 127 128 fn(a0, a1, c, desc); 129 130 tcg_temp_free_ptr(a0); 131 tcg_temp_free_ptr(a1); 132 tcg_temp_free_i32(desc); 133 } 134 135 /* Generate a call to a gvec-style helper with three vector operands. */ 136 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 137 uint32_t oprsz, uint32_t maxsz, int32_t data, 138 gen_helper_gvec_3 *fn) 139 { 140 TCGv_ptr a0, a1, a2; 141 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 142 143 a0 = tcg_temp_new_ptr(); 144 a1 = tcg_temp_new_ptr(); 145 a2 = tcg_temp_new_ptr(); 146 147 tcg_gen_addi_ptr(a0, cpu_env, dofs); 148 tcg_gen_addi_ptr(a1, cpu_env, aofs); 149 tcg_gen_addi_ptr(a2, cpu_env, bofs); 150 151 fn(a0, a1, a2, desc); 152 153 tcg_temp_free_ptr(a0); 154 tcg_temp_free_ptr(a1); 155 tcg_temp_free_ptr(a2); 156 tcg_temp_free_i32(desc); 157 } 158 159 /* Generate a call to a gvec-style helper with four vector operands. */ 160 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 161 uint32_t cofs, uint32_t oprsz, uint32_t maxsz, 162 int32_t data, gen_helper_gvec_4 *fn) 163 { 164 TCGv_ptr a0, a1, a2, a3; 165 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 166 167 a0 = tcg_temp_new_ptr(); 168 a1 = tcg_temp_new_ptr(); 169 a2 = tcg_temp_new_ptr(); 170 a3 = tcg_temp_new_ptr(); 171 172 tcg_gen_addi_ptr(a0, cpu_env, dofs); 173 tcg_gen_addi_ptr(a1, cpu_env, aofs); 174 tcg_gen_addi_ptr(a2, cpu_env, bofs); 175 tcg_gen_addi_ptr(a3, cpu_env, cofs); 176 177 fn(a0, a1, a2, a3, desc); 178 179 tcg_temp_free_ptr(a0); 180 tcg_temp_free_ptr(a1); 181 tcg_temp_free_ptr(a2); 182 tcg_temp_free_ptr(a3); 183 tcg_temp_free_i32(desc); 184 } 185 186 /* Generate a call to a gvec-style helper with five vector operands. */ 187 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, 188 uint32_t cofs, uint32_t xofs, uint32_t oprsz, 189 uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) 190 { 191 TCGv_ptr a0, a1, a2, a3, a4; 192 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 193 194 a0 = tcg_temp_new_ptr(); 195 a1 = tcg_temp_new_ptr(); 196 a2 = tcg_temp_new_ptr(); 197 a3 = tcg_temp_new_ptr(); 198 a4 = tcg_temp_new_ptr(); 199 200 tcg_gen_addi_ptr(a0, cpu_env, dofs); 201 tcg_gen_addi_ptr(a1, cpu_env, aofs); 202 tcg_gen_addi_ptr(a2, cpu_env, bofs); 203 tcg_gen_addi_ptr(a3, cpu_env, cofs); 204 tcg_gen_addi_ptr(a4, cpu_env, xofs); 205 206 fn(a0, a1, a2, a3, a4, desc); 207 208 tcg_temp_free_ptr(a0); 209 tcg_temp_free_ptr(a1); 210 tcg_temp_free_ptr(a2); 211 tcg_temp_free_ptr(a3); 212 tcg_temp_free_ptr(a4); 213 tcg_temp_free_i32(desc); 214 } 215 216 /* Generate a call to a gvec-style helper with three vector operands 217 and an extra pointer operand. */ 218 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, 219 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 220 int32_t data, gen_helper_gvec_2_ptr *fn) 221 { 222 TCGv_ptr a0, a1; 223 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 224 225 a0 = tcg_temp_new_ptr(); 226 a1 = tcg_temp_new_ptr(); 227 228 tcg_gen_addi_ptr(a0, cpu_env, dofs); 229 tcg_gen_addi_ptr(a1, cpu_env, aofs); 230 231 fn(a0, a1, ptr, desc); 232 233 tcg_temp_free_ptr(a0); 234 tcg_temp_free_ptr(a1); 235 tcg_temp_free_i32(desc); 236 } 237 238 /* Generate a call to a gvec-style helper with three vector operands 239 and an extra pointer operand. */ 240 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 241 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 242 int32_t data, gen_helper_gvec_3_ptr *fn) 243 { 244 TCGv_ptr a0, a1, a2; 245 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 246 247 a0 = tcg_temp_new_ptr(); 248 a1 = tcg_temp_new_ptr(); 249 a2 = tcg_temp_new_ptr(); 250 251 tcg_gen_addi_ptr(a0, cpu_env, dofs); 252 tcg_gen_addi_ptr(a1, cpu_env, aofs); 253 tcg_gen_addi_ptr(a2, cpu_env, bofs); 254 255 fn(a0, a1, a2, ptr, desc); 256 257 tcg_temp_free_ptr(a0); 258 tcg_temp_free_ptr(a1); 259 tcg_temp_free_ptr(a2); 260 tcg_temp_free_i32(desc); 261 } 262 263 /* Generate a call to a gvec-style helper with four vector operands 264 and an extra pointer operand. */ 265 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 266 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, 267 uint32_t maxsz, int32_t data, 268 gen_helper_gvec_4_ptr *fn) 269 { 270 TCGv_ptr a0, a1, a2, a3; 271 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 272 273 a0 = tcg_temp_new_ptr(); 274 a1 = tcg_temp_new_ptr(); 275 a2 = tcg_temp_new_ptr(); 276 a3 = tcg_temp_new_ptr(); 277 278 tcg_gen_addi_ptr(a0, cpu_env, dofs); 279 tcg_gen_addi_ptr(a1, cpu_env, aofs); 280 tcg_gen_addi_ptr(a2, cpu_env, bofs); 281 tcg_gen_addi_ptr(a3, cpu_env, cofs); 282 283 fn(a0, a1, a2, a3, ptr, desc); 284 285 tcg_temp_free_ptr(a0); 286 tcg_temp_free_ptr(a1); 287 tcg_temp_free_ptr(a2); 288 tcg_temp_free_ptr(a3); 289 tcg_temp_free_i32(desc); 290 } 291 292 /* Return true if we want to implement something of OPRSZ bytes 293 in units of LNSZ. This limits the expansion of inline code. */ 294 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) 295 { 296 if (oprsz % lnsz == 0) { 297 uint32_t lnct = oprsz / lnsz; 298 return lnct >= 1 && lnct <= MAX_UNROLL; 299 } 300 return false; 301 } 302 303 static void expand_clr(uint32_t dofs, uint32_t maxsz); 304 305 /* Duplicate C as per VECE. */ 306 uint64_t (dup_const)(unsigned vece, uint64_t c) 307 { 308 switch (vece) { 309 case MO_8: 310 return 0x0101010101010101ull * (uint8_t)c; 311 case MO_16: 312 return 0x0001000100010001ull * (uint16_t)c; 313 case MO_32: 314 return 0x0000000100000001ull * (uint32_t)c; 315 case MO_64: 316 return c; 317 default: 318 g_assert_not_reached(); 319 } 320 } 321 322 /* Duplicate IN into OUT as per VECE. */ 323 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) 324 { 325 switch (vece) { 326 case MO_8: 327 tcg_gen_ext8u_i32(out, in); 328 tcg_gen_muli_i32(out, out, 0x01010101); 329 break; 330 case MO_16: 331 tcg_gen_deposit_i32(out, in, in, 16, 16); 332 break; 333 case MO_32: 334 tcg_gen_mov_i32(out, in); 335 break; 336 default: 337 g_assert_not_reached(); 338 } 339 } 340 341 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) 342 { 343 switch (vece) { 344 case MO_8: 345 tcg_gen_ext8u_i64(out, in); 346 tcg_gen_muli_i64(out, out, 0x0101010101010101ull); 347 break; 348 case MO_16: 349 tcg_gen_ext16u_i64(out, in); 350 tcg_gen_muli_i64(out, out, 0x0001000100010001ull); 351 break; 352 case MO_32: 353 tcg_gen_deposit_i64(out, in, in, 32, 32); 354 break; 355 case MO_64: 356 tcg_gen_mov_i64(out, in); 357 break; 358 default: 359 g_assert_not_reached(); 360 } 361 } 362 363 /* Select a supported vector type for implementing an operation on SIZE 364 * bytes. If OP is 0, assume that the real operation to be performed is 365 * required by all backends. Otherwise, make sure than OP can be performed 366 * on elements of size VECE in the selected type. Do not select V64 if 367 * PREFER_I64 is true. Return 0 if no vector type is selected. 368 */ 369 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece, 370 uint32_t size, bool prefer_i64) 371 { 372 if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) { 373 /* 374 * Recall that ARM SVE allows vector sizes that are not a 375 * power of 2, but always a multiple of 16. The intent is 376 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 377 * It is hard to imagine a case in which v256 is supported 378 * but v128 is not, but check anyway. 379 */ 380 if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) 381 && (size % 32 == 0 382 || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) { 383 return TCG_TYPE_V256; 384 } 385 } 386 if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16) 387 && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) { 388 return TCG_TYPE_V128; 389 } 390 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8) 391 && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) { 392 return TCG_TYPE_V64; 393 } 394 return 0; 395 } 396 397 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz, 398 uint32_t maxsz, TCGv_vec t_vec) 399 { 400 uint32_t i = 0; 401 402 switch (type) { 403 case TCG_TYPE_V256: 404 /* 405 * Recall that ARM SVE allows vector sizes that are not a 406 * power of 2, but always a multiple of 16. The intent is 407 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 408 */ 409 for (; i + 32 <= oprsz; i += 32) { 410 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 411 } 412 /* fallthru */ 413 case TCG_TYPE_V128: 414 for (; i + 16 <= oprsz; i += 16) { 415 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 416 } 417 break; 418 case TCG_TYPE_V64: 419 for (; i < oprsz; i += 8) { 420 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 421 } 422 break; 423 default: 424 g_assert_not_reached(); 425 } 426 427 if (oprsz < maxsz) { 428 expand_clr(dofs + oprsz, maxsz - oprsz); 429 } 430 } 431 432 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 433 * Only one of IN_32 or IN_64 may be set; 434 * IN_C is used if IN_32 and IN_64 are unset. 435 */ 436 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 437 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 438 uint64_t in_c) 439 { 440 TCGType type; 441 TCGv_i64 t_64; 442 TCGv_i32 t_32, t_desc; 443 TCGv_ptr t_ptr; 444 uint32_t i; 445 446 assert(vece <= (in_32 ? MO_32 : MO_64)); 447 assert(in_32 == NULL || in_64 == NULL); 448 449 /* If we're storing 0, expand oprsz to maxsz. */ 450 if (in_32 == NULL && in_64 == NULL) { 451 in_c = dup_const(vece, in_c); 452 if (in_c == 0) { 453 oprsz = maxsz; 454 } 455 } 456 457 /* Implement inline with a vector type, if possible. 458 * Prefer integer when 64-bit host and no variable dup. 459 */ 460 type = choose_vector_type(NULL, vece, oprsz, 461 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 462 && (in_64 == NULL || vece == MO_64))); 463 if (type != 0) { 464 TCGv_vec t_vec = tcg_temp_new_vec(type); 465 466 if (in_32) { 467 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 468 } else if (in_64) { 469 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 470 } else { 471 tcg_gen_dupi_vec(vece, t_vec, in_c); 472 } 473 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 474 tcg_temp_free_vec(t_vec); 475 return; 476 } 477 478 /* Otherwise, inline with an integer type, unless "large". */ 479 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 480 t_64 = NULL; 481 t_32 = NULL; 482 483 if (in_32) { 484 /* We are given a 32-bit variable input. For a 64-bit host, 485 use a 64-bit operation unless the 32-bit operation would 486 be simple enough. */ 487 if (TCG_TARGET_REG_BITS == 64 488 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 489 t_64 = tcg_temp_new_i64(); 490 tcg_gen_extu_i32_i64(t_64, in_32); 491 gen_dup_i64(vece, t_64, t_64); 492 } else { 493 t_32 = tcg_temp_new_i32(); 494 gen_dup_i32(vece, t_32, in_32); 495 } 496 } else if (in_64) { 497 /* We are given a 64-bit variable input. */ 498 t_64 = tcg_temp_new_i64(); 499 gen_dup_i64(vece, t_64, in_64); 500 } else { 501 /* We are given a constant input. */ 502 /* For 64-bit hosts, use 64-bit constants for "simple" constants 503 or when we'd need too many 32-bit stores, or when a 64-bit 504 constant is really required. */ 505 if (vece == MO_64 506 || (TCG_TARGET_REG_BITS == 64 507 && (in_c == 0 || in_c == -1 508 || !check_size_impl(oprsz, 4)))) { 509 t_64 = tcg_const_i64(in_c); 510 } else { 511 t_32 = tcg_const_i32(in_c); 512 } 513 } 514 515 /* Implement inline if we picked an implementation size above. */ 516 if (t_32) { 517 for (i = 0; i < oprsz; i += 4) { 518 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 519 } 520 tcg_temp_free_i32(t_32); 521 goto done; 522 } 523 if (t_64) { 524 for (i = 0; i < oprsz; i += 8) { 525 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 526 } 527 tcg_temp_free_i64(t_64); 528 goto done; 529 } 530 } 531 532 /* Otherwise implement out of line. */ 533 t_ptr = tcg_temp_new_ptr(); 534 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 535 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 536 537 if (vece == MO_64) { 538 if (in_64) { 539 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 540 } else { 541 t_64 = tcg_const_i64(in_c); 542 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 543 tcg_temp_free_i64(t_64); 544 } 545 } else { 546 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 547 static dup_fn * const fns[3] = { 548 gen_helper_gvec_dup8, 549 gen_helper_gvec_dup16, 550 gen_helper_gvec_dup32 551 }; 552 553 if (in_32) { 554 fns[vece](t_ptr, t_desc, in_32); 555 } else { 556 t_32 = tcg_temp_new_i32(); 557 if (in_64) { 558 tcg_gen_extrl_i64_i32(t_32, in_64); 559 } else if (vece == MO_8) { 560 tcg_gen_movi_i32(t_32, in_c & 0xff); 561 } else if (vece == MO_16) { 562 tcg_gen_movi_i32(t_32, in_c & 0xffff); 563 } else { 564 tcg_gen_movi_i32(t_32, in_c); 565 } 566 fns[vece](t_ptr, t_desc, t_32); 567 tcg_temp_free_i32(t_32); 568 } 569 } 570 571 tcg_temp_free_ptr(t_ptr); 572 tcg_temp_free_i32(t_desc); 573 return; 574 575 done: 576 if (oprsz < maxsz) { 577 expand_clr(dofs + oprsz, maxsz - oprsz); 578 } 579 } 580 581 /* Likewise, but with zero. */ 582 static void expand_clr(uint32_t dofs, uint32_t maxsz) 583 { 584 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 585 } 586 587 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ 588 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 589 void (*fni)(TCGv_i32, TCGv_i32)) 590 { 591 TCGv_i32 t0 = tcg_temp_new_i32(); 592 uint32_t i; 593 594 for (i = 0; i < oprsz; i += 4) { 595 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 596 fni(t0, t0); 597 tcg_gen_st_i32(t0, cpu_env, dofs + i); 598 } 599 tcg_temp_free_i32(t0); 600 } 601 602 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 603 int32_t c, bool load_dest, 604 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 605 { 606 TCGv_i32 t0 = tcg_temp_new_i32(); 607 TCGv_i32 t1 = tcg_temp_new_i32(); 608 uint32_t i; 609 610 for (i = 0; i < oprsz; i += 4) { 611 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 612 if (load_dest) { 613 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 614 } 615 fni(t1, t0, c); 616 tcg_gen_st_i32(t1, cpu_env, dofs + i); 617 } 618 tcg_temp_free_i32(t0); 619 tcg_temp_free_i32(t1); 620 } 621 622 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 623 TCGv_i32 c, bool scalar_first, 624 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 625 { 626 TCGv_i32 t0 = tcg_temp_new_i32(); 627 TCGv_i32 t1 = tcg_temp_new_i32(); 628 uint32_t i; 629 630 for (i = 0; i < oprsz; i += 4) { 631 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 632 if (scalar_first) { 633 fni(t1, c, t0); 634 } else { 635 fni(t1, t0, c); 636 } 637 tcg_gen_st_i32(t1, cpu_env, dofs + i); 638 } 639 tcg_temp_free_i32(t0); 640 tcg_temp_free_i32(t1); 641 } 642 643 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 644 static void expand_3_i32(uint32_t dofs, uint32_t aofs, 645 uint32_t bofs, uint32_t oprsz, bool load_dest, 646 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 647 { 648 TCGv_i32 t0 = tcg_temp_new_i32(); 649 TCGv_i32 t1 = tcg_temp_new_i32(); 650 TCGv_i32 t2 = tcg_temp_new_i32(); 651 uint32_t i; 652 653 for (i = 0; i < oprsz; i += 4) { 654 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 655 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 656 if (load_dest) { 657 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 658 } 659 fni(t2, t0, t1); 660 tcg_gen_st_i32(t2, cpu_env, dofs + i); 661 } 662 tcg_temp_free_i32(t2); 663 tcg_temp_free_i32(t1); 664 tcg_temp_free_i32(t0); 665 } 666 667 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 668 uint32_t oprsz, int32_t c, bool load_dest, 669 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t)) 670 { 671 TCGv_i32 t0 = tcg_temp_new_i32(); 672 TCGv_i32 t1 = tcg_temp_new_i32(); 673 TCGv_i32 t2 = tcg_temp_new_i32(); 674 uint32_t i; 675 676 for (i = 0; i < oprsz; i += 4) { 677 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 678 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 679 if (load_dest) { 680 tcg_gen_ld_i32(t2, cpu_env, dofs + i); 681 } 682 fni(t2, t0, t1, c); 683 tcg_gen_st_i32(t2, cpu_env, dofs + i); 684 } 685 tcg_temp_free_i32(t0); 686 tcg_temp_free_i32(t1); 687 tcg_temp_free_i32(t2); 688 } 689 690 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 691 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 692 uint32_t cofs, uint32_t oprsz, bool write_aofs, 693 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) 694 { 695 TCGv_i32 t0 = tcg_temp_new_i32(); 696 TCGv_i32 t1 = tcg_temp_new_i32(); 697 TCGv_i32 t2 = tcg_temp_new_i32(); 698 TCGv_i32 t3 = tcg_temp_new_i32(); 699 uint32_t i; 700 701 for (i = 0; i < oprsz; i += 4) { 702 tcg_gen_ld_i32(t1, cpu_env, aofs + i); 703 tcg_gen_ld_i32(t2, cpu_env, bofs + i); 704 tcg_gen_ld_i32(t3, cpu_env, cofs + i); 705 fni(t0, t1, t2, t3); 706 tcg_gen_st_i32(t0, cpu_env, dofs + i); 707 if (write_aofs) { 708 tcg_gen_st_i32(t1, cpu_env, aofs + i); 709 } 710 } 711 tcg_temp_free_i32(t3); 712 tcg_temp_free_i32(t2); 713 tcg_temp_free_i32(t1); 714 tcg_temp_free_i32(t0); 715 } 716 717 /* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ 718 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 719 void (*fni)(TCGv_i64, TCGv_i64)) 720 { 721 TCGv_i64 t0 = tcg_temp_new_i64(); 722 uint32_t i; 723 724 for (i = 0; i < oprsz; i += 8) { 725 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 726 fni(t0, t0); 727 tcg_gen_st_i64(t0, cpu_env, dofs + i); 728 } 729 tcg_temp_free_i64(t0); 730 } 731 732 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 733 int64_t c, bool load_dest, 734 void (*fni)(TCGv_i64, TCGv_i64, int64_t)) 735 { 736 TCGv_i64 t0 = tcg_temp_new_i64(); 737 TCGv_i64 t1 = tcg_temp_new_i64(); 738 uint32_t i; 739 740 for (i = 0; i < oprsz; i += 8) { 741 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 742 if (load_dest) { 743 tcg_gen_ld_i64(t1, cpu_env, dofs + i); 744 } 745 fni(t1, t0, c); 746 tcg_gen_st_i64(t1, cpu_env, dofs + i); 747 } 748 tcg_temp_free_i64(t0); 749 tcg_temp_free_i64(t1); 750 } 751 752 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 753 TCGv_i64 c, bool scalar_first, 754 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 755 { 756 TCGv_i64 t0 = tcg_temp_new_i64(); 757 TCGv_i64 t1 = tcg_temp_new_i64(); 758 uint32_t i; 759 760 for (i = 0; i < oprsz; i += 8) { 761 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 762 if (scalar_first) { 763 fni(t1, c, t0); 764 } else { 765 fni(t1, t0, c); 766 } 767 tcg_gen_st_i64(t1, cpu_env, dofs + i); 768 } 769 tcg_temp_free_i64(t0); 770 tcg_temp_free_i64(t1); 771 } 772 773 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ 774 static void expand_3_i64(uint32_t dofs, uint32_t aofs, 775 uint32_t bofs, uint32_t oprsz, bool load_dest, 776 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) 777 { 778 TCGv_i64 t0 = tcg_temp_new_i64(); 779 TCGv_i64 t1 = tcg_temp_new_i64(); 780 TCGv_i64 t2 = tcg_temp_new_i64(); 781 uint32_t i; 782 783 for (i = 0; i < oprsz; i += 8) { 784 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 785 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 786 if (load_dest) { 787 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 788 } 789 fni(t2, t0, t1); 790 tcg_gen_st_i64(t2, cpu_env, dofs + i); 791 } 792 tcg_temp_free_i64(t2); 793 tcg_temp_free_i64(t1); 794 tcg_temp_free_i64(t0); 795 } 796 797 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 798 uint32_t oprsz, int64_t c, bool load_dest, 799 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t)) 800 { 801 TCGv_i64 t0 = tcg_temp_new_i64(); 802 TCGv_i64 t1 = tcg_temp_new_i64(); 803 TCGv_i64 t2 = tcg_temp_new_i64(); 804 uint32_t i; 805 806 for (i = 0; i < oprsz; i += 8) { 807 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 808 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 809 if (load_dest) { 810 tcg_gen_ld_i64(t2, cpu_env, dofs + i); 811 } 812 fni(t2, t0, t1, c); 813 tcg_gen_st_i64(t2, cpu_env, dofs + i); 814 } 815 tcg_temp_free_i64(t0); 816 tcg_temp_free_i64(t1); 817 tcg_temp_free_i64(t2); 818 } 819 820 /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ 821 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 822 uint32_t cofs, uint32_t oprsz, bool write_aofs, 823 void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) 824 { 825 TCGv_i64 t0 = tcg_temp_new_i64(); 826 TCGv_i64 t1 = tcg_temp_new_i64(); 827 TCGv_i64 t2 = tcg_temp_new_i64(); 828 TCGv_i64 t3 = tcg_temp_new_i64(); 829 uint32_t i; 830 831 for (i = 0; i < oprsz; i += 8) { 832 tcg_gen_ld_i64(t1, cpu_env, aofs + i); 833 tcg_gen_ld_i64(t2, cpu_env, bofs + i); 834 tcg_gen_ld_i64(t3, cpu_env, cofs + i); 835 fni(t0, t1, t2, t3); 836 tcg_gen_st_i64(t0, cpu_env, dofs + i); 837 if (write_aofs) { 838 tcg_gen_st_i64(t1, cpu_env, aofs + i); 839 } 840 } 841 tcg_temp_free_i64(t3); 842 tcg_temp_free_i64(t2); 843 tcg_temp_free_i64(t1); 844 tcg_temp_free_i64(t0); 845 } 846 847 /* Expand OPSZ bytes worth of two-operand operations using host vectors. */ 848 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 849 uint32_t oprsz, uint32_t tysz, TCGType type, 850 void (*fni)(unsigned, TCGv_vec, TCGv_vec)) 851 { 852 TCGv_vec t0 = tcg_temp_new_vec(type); 853 uint32_t i; 854 855 for (i = 0; i < oprsz; i += tysz) { 856 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 857 fni(vece, t0, t0); 858 tcg_gen_st_vec(t0, cpu_env, dofs + i); 859 } 860 tcg_temp_free_vec(t0); 861 } 862 863 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand 864 using host vectors. */ 865 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 866 uint32_t oprsz, uint32_t tysz, TCGType type, 867 int64_t c, bool load_dest, 868 void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) 869 { 870 TCGv_vec t0 = tcg_temp_new_vec(type); 871 TCGv_vec t1 = tcg_temp_new_vec(type); 872 uint32_t i; 873 874 for (i = 0; i < oprsz; i += tysz) { 875 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 876 if (load_dest) { 877 tcg_gen_ld_vec(t1, cpu_env, dofs + i); 878 } 879 fni(vece, t1, t0, c); 880 tcg_gen_st_vec(t1, cpu_env, dofs + i); 881 } 882 tcg_temp_free_vec(t0); 883 tcg_temp_free_vec(t1); 884 } 885 886 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 887 uint32_t oprsz, uint32_t tysz, TCGType type, 888 TCGv_vec c, bool scalar_first, 889 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 890 { 891 TCGv_vec t0 = tcg_temp_new_vec(type); 892 TCGv_vec t1 = tcg_temp_new_vec(type); 893 uint32_t i; 894 895 for (i = 0; i < oprsz; i += tysz) { 896 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 897 if (scalar_first) { 898 fni(vece, t1, c, t0); 899 } else { 900 fni(vece, t1, t0, c); 901 } 902 tcg_gen_st_vec(t1, cpu_env, dofs + i); 903 } 904 tcg_temp_free_vec(t0); 905 tcg_temp_free_vec(t1); 906 } 907 908 /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ 909 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 910 uint32_t bofs, uint32_t oprsz, 911 uint32_t tysz, TCGType type, bool load_dest, 912 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 913 { 914 TCGv_vec t0 = tcg_temp_new_vec(type); 915 TCGv_vec t1 = tcg_temp_new_vec(type); 916 TCGv_vec t2 = tcg_temp_new_vec(type); 917 uint32_t i; 918 919 for (i = 0; i < oprsz; i += tysz) { 920 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 921 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 922 if (load_dest) { 923 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 924 } 925 fni(vece, t2, t0, t1); 926 tcg_gen_st_vec(t2, cpu_env, dofs + i); 927 } 928 tcg_temp_free_vec(t2); 929 tcg_temp_free_vec(t1); 930 tcg_temp_free_vec(t0); 931 } 932 933 /* 934 * Expand OPSZ bytes worth of three-vector operands and an immediate operand 935 * using host vectors. 936 */ 937 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 938 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 939 TCGType type, int64_t c, bool load_dest, 940 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, 941 int64_t)) 942 { 943 TCGv_vec t0 = tcg_temp_new_vec(type); 944 TCGv_vec t1 = tcg_temp_new_vec(type); 945 TCGv_vec t2 = tcg_temp_new_vec(type); 946 uint32_t i; 947 948 for (i = 0; i < oprsz; i += tysz) { 949 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 950 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 951 if (load_dest) { 952 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 953 } 954 fni(vece, t2, t0, t1, c); 955 tcg_gen_st_vec(t2, cpu_env, dofs + i); 956 } 957 tcg_temp_free_vec(t0); 958 tcg_temp_free_vec(t1); 959 tcg_temp_free_vec(t2); 960 } 961 962 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */ 963 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 964 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 965 uint32_t tysz, TCGType type, bool write_aofs, 966 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 967 TCGv_vec, TCGv_vec)) 968 { 969 TCGv_vec t0 = tcg_temp_new_vec(type); 970 TCGv_vec t1 = tcg_temp_new_vec(type); 971 TCGv_vec t2 = tcg_temp_new_vec(type); 972 TCGv_vec t3 = tcg_temp_new_vec(type); 973 uint32_t i; 974 975 for (i = 0; i < oprsz; i += tysz) { 976 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 977 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 978 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 979 fni(vece, t0, t1, t2, t3); 980 tcg_gen_st_vec(t0, cpu_env, dofs + i); 981 if (write_aofs) { 982 tcg_gen_st_vec(t1, cpu_env, aofs + i); 983 } 984 } 985 tcg_temp_free_vec(t3); 986 tcg_temp_free_vec(t2); 987 tcg_temp_free_vec(t1); 988 tcg_temp_free_vec(t0); 989 } 990 991 /* Expand a vector two-operand operation. */ 992 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 993 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 994 { 995 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 996 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 997 TCGType type; 998 uint32_t some; 999 1000 check_size_align(oprsz, maxsz, dofs | aofs); 1001 check_overlap_2(dofs, aofs, maxsz); 1002 1003 type = 0; 1004 if (g->fniv) { 1005 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1006 } 1007 switch (type) { 1008 case TCG_TYPE_V256: 1009 /* Recall that ARM SVE allows vector sizes that are not a 1010 * power of 2, but always a multiple of 16. The intent is 1011 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1012 */ 1013 some = QEMU_ALIGN_DOWN(oprsz, 32); 1014 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); 1015 if (some == oprsz) { 1016 break; 1017 } 1018 dofs += some; 1019 aofs += some; 1020 oprsz -= some; 1021 maxsz -= some; 1022 /* fallthru */ 1023 case TCG_TYPE_V128: 1024 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); 1025 break; 1026 case TCG_TYPE_V64: 1027 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); 1028 break; 1029 1030 case 0: 1031 if (g->fni8 && check_size_impl(oprsz, 8)) { 1032 expand_2_i64(dofs, aofs, oprsz, g->fni8); 1033 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1034 expand_2_i32(dofs, aofs, oprsz, g->fni4); 1035 } else { 1036 assert(g->fno != NULL); 1037 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1038 oprsz = maxsz; 1039 } 1040 break; 1041 1042 default: 1043 g_assert_not_reached(); 1044 } 1045 tcg_swap_vecop_list(hold_list); 1046 1047 if (oprsz < maxsz) { 1048 expand_clr(dofs + oprsz, maxsz - oprsz); 1049 } 1050 } 1051 1052 /* Expand a vector operation with two vectors and an immediate. */ 1053 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1054 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1055 { 1056 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1057 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1058 TCGType type; 1059 uint32_t some; 1060 1061 check_size_align(oprsz, maxsz, dofs | aofs); 1062 check_overlap_2(dofs, aofs, maxsz); 1063 1064 type = 0; 1065 if (g->fniv) { 1066 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1067 } 1068 switch (type) { 1069 case TCG_TYPE_V256: 1070 /* Recall that ARM SVE allows vector sizes that are not a 1071 * power of 2, but always a multiple of 16. The intent is 1072 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1073 */ 1074 some = QEMU_ALIGN_DOWN(oprsz, 32); 1075 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1076 c, g->load_dest, g->fniv); 1077 if (some == oprsz) { 1078 break; 1079 } 1080 dofs += some; 1081 aofs += some; 1082 oprsz -= some; 1083 maxsz -= some; 1084 /* fallthru */ 1085 case TCG_TYPE_V128: 1086 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1087 c, g->load_dest, g->fniv); 1088 break; 1089 case TCG_TYPE_V64: 1090 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1091 c, g->load_dest, g->fniv); 1092 break; 1093 1094 case 0: 1095 if (g->fni8 && check_size_impl(oprsz, 8)) { 1096 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1097 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1098 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1099 } else { 1100 if (g->fno) { 1101 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1102 } else { 1103 TCGv_i64 tcg_c = tcg_const_i64(c); 1104 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1105 maxsz, c, g->fnoi); 1106 tcg_temp_free_i64(tcg_c); 1107 } 1108 oprsz = maxsz; 1109 } 1110 break; 1111 1112 default: 1113 g_assert_not_reached(); 1114 } 1115 tcg_swap_vecop_list(hold_list); 1116 1117 if (oprsz < maxsz) { 1118 expand_clr(dofs + oprsz, maxsz - oprsz); 1119 } 1120 } 1121 1122 /* Expand a vector operation with two vectors and a scalar. */ 1123 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1124 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1125 { 1126 TCGType type; 1127 1128 check_size_align(oprsz, maxsz, dofs | aofs); 1129 check_overlap_2(dofs, aofs, maxsz); 1130 1131 type = 0; 1132 if (g->fniv) { 1133 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1134 } 1135 if (type != 0) { 1136 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1137 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1138 TCGv_vec t_vec = tcg_temp_new_vec(type); 1139 uint32_t some; 1140 1141 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1142 1143 switch (type) { 1144 case TCG_TYPE_V256: 1145 /* Recall that ARM SVE allows vector sizes that are not a 1146 * power of 2, but always a multiple of 16. The intent is 1147 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1148 */ 1149 some = QEMU_ALIGN_DOWN(oprsz, 32); 1150 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1151 t_vec, g->scalar_first, g->fniv); 1152 if (some == oprsz) { 1153 break; 1154 } 1155 dofs += some; 1156 aofs += some; 1157 oprsz -= some; 1158 maxsz -= some; 1159 /* fallthru */ 1160 1161 case TCG_TYPE_V128: 1162 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1163 t_vec, g->scalar_first, g->fniv); 1164 break; 1165 1166 case TCG_TYPE_V64: 1167 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1168 t_vec, g->scalar_first, g->fniv); 1169 break; 1170 1171 default: 1172 g_assert_not_reached(); 1173 } 1174 tcg_temp_free_vec(t_vec); 1175 tcg_swap_vecop_list(hold_list); 1176 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1177 TCGv_i64 t64 = tcg_temp_new_i64(); 1178 1179 gen_dup_i64(g->vece, t64, c); 1180 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1181 tcg_temp_free_i64(t64); 1182 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1183 TCGv_i32 t32 = tcg_temp_new_i32(); 1184 1185 tcg_gen_extrl_i64_i32(t32, c); 1186 gen_dup_i32(g->vece, t32, t32); 1187 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1188 tcg_temp_free_i32(t32); 1189 } else { 1190 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1191 return; 1192 } 1193 1194 if (oprsz < maxsz) { 1195 expand_clr(dofs + oprsz, maxsz - oprsz); 1196 } 1197 } 1198 1199 /* Expand a vector three-operand operation. */ 1200 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1201 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1202 { 1203 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1204 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1205 TCGType type; 1206 uint32_t some; 1207 1208 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1209 check_overlap_3(dofs, aofs, bofs, maxsz); 1210 1211 type = 0; 1212 if (g->fniv) { 1213 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1214 } 1215 switch (type) { 1216 case TCG_TYPE_V256: 1217 /* Recall that ARM SVE allows vector sizes that are not a 1218 * power of 2, but always a multiple of 16. The intent is 1219 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1220 */ 1221 some = QEMU_ALIGN_DOWN(oprsz, 32); 1222 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1223 g->load_dest, g->fniv); 1224 if (some == oprsz) { 1225 break; 1226 } 1227 dofs += some; 1228 aofs += some; 1229 bofs += some; 1230 oprsz -= some; 1231 maxsz -= some; 1232 /* fallthru */ 1233 case TCG_TYPE_V128: 1234 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1235 g->load_dest, g->fniv); 1236 break; 1237 case TCG_TYPE_V64: 1238 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1239 g->load_dest, g->fniv); 1240 break; 1241 1242 case 0: 1243 if (g->fni8 && check_size_impl(oprsz, 8)) { 1244 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1245 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1246 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1247 } else { 1248 assert(g->fno != NULL); 1249 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1250 maxsz, g->data, g->fno); 1251 oprsz = maxsz; 1252 } 1253 break; 1254 1255 default: 1256 g_assert_not_reached(); 1257 } 1258 tcg_swap_vecop_list(hold_list); 1259 1260 if (oprsz < maxsz) { 1261 expand_clr(dofs + oprsz, maxsz - oprsz); 1262 } 1263 } 1264 1265 /* Expand a vector operation with three vectors and an immediate. */ 1266 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1267 uint32_t oprsz, uint32_t maxsz, int64_t c, 1268 const GVecGen3i *g) 1269 { 1270 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1271 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1272 TCGType type; 1273 uint32_t some; 1274 1275 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1276 check_overlap_3(dofs, aofs, bofs, maxsz); 1277 1278 type = 0; 1279 if (g->fniv) { 1280 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1281 } 1282 switch (type) { 1283 case TCG_TYPE_V256: 1284 /* 1285 * Recall that ARM SVE allows vector sizes that are not a 1286 * power of 2, but always a multiple of 16. The intent is 1287 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1288 */ 1289 some = QEMU_ALIGN_DOWN(oprsz, 32); 1290 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1291 c, g->load_dest, g->fniv); 1292 if (some == oprsz) { 1293 break; 1294 } 1295 dofs += some; 1296 aofs += some; 1297 bofs += some; 1298 oprsz -= some; 1299 maxsz -= some; 1300 /* fallthru */ 1301 case TCG_TYPE_V128: 1302 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1303 c, g->load_dest, g->fniv); 1304 break; 1305 case TCG_TYPE_V64: 1306 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1307 c, g->load_dest, g->fniv); 1308 break; 1309 1310 case 0: 1311 if (g->fni8 && check_size_impl(oprsz, 8)) { 1312 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1313 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1314 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1315 } else { 1316 assert(g->fno != NULL); 1317 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1318 oprsz = maxsz; 1319 } 1320 break; 1321 1322 default: 1323 g_assert_not_reached(); 1324 } 1325 tcg_swap_vecop_list(hold_list); 1326 1327 if (oprsz < maxsz) { 1328 expand_clr(dofs + oprsz, maxsz - oprsz); 1329 } 1330 } 1331 1332 /* Expand a vector four-operand operation. */ 1333 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1334 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1335 { 1336 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1337 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1338 TCGType type; 1339 uint32_t some; 1340 1341 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1342 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1343 1344 type = 0; 1345 if (g->fniv) { 1346 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1347 } 1348 switch (type) { 1349 case TCG_TYPE_V256: 1350 /* Recall that ARM SVE allows vector sizes that are not a 1351 * power of 2, but always a multiple of 16. The intent is 1352 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1353 */ 1354 some = QEMU_ALIGN_DOWN(oprsz, 32); 1355 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1356 32, TCG_TYPE_V256, g->write_aofs, g->fniv); 1357 if (some == oprsz) { 1358 break; 1359 } 1360 dofs += some; 1361 aofs += some; 1362 bofs += some; 1363 cofs += some; 1364 oprsz -= some; 1365 maxsz -= some; 1366 /* fallthru */ 1367 case TCG_TYPE_V128: 1368 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1369 16, TCG_TYPE_V128, g->write_aofs, g->fniv); 1370 break; 1371 case TCG_TYPE_V64: 1372 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1373 8, TCG_TYPE_V64, g->write_aofs, g->fniv); 1374 break; 1375 1376 case 0: 1377 if (g->fni8 && check_size_impl(oprsz, 8)) { 1378 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, 1379 g->write_aofs, g->fni8); 1380 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1381 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, 1382 g->write_aofs, g->fni4); 1383 } else { 1384 assert(g->fno != NULL); 1385 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1386 oprsz, maxsz, g->data, g->fno); 1387 oprsz = maxsz; 1388 } 1389 break; 1390 1391 default: 1392 g_assert_not_reached(); 1393 } 1394 tcg_swap_vecop_list(hold_list); 1395 1396 if (oprsz < maxsz) { 1397 expand_clr(dofs + oprsz, maxsz - oprsz); 1398 } 1399 } 1400 1401 /* 1402 * Expand specific vector operations. 1403 */ 1404 1405 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1406 { 1407 tcg_gen_mov_vec(a, b); 1408 } 1409 1410 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1411 uint32_t oprsz, uint32_t maxsz) 1412 { 1413 static const GVecGen2 g = { 1414 .fni8 = tcg_gen_mov_i64, 1415 .fniv = vec_mov2, 1416 .fno = gen_helper_gvec_mov, 1417 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1418 }; 1419 if (dofs != aofs) { 1420 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1421 } else { 1422 check_size_align(oprsz, maxsz, dofs); 1423 if (oprsz < maxsz) { 1424 expand_clr(dofs + oprsz, maxsz - oprsz); 1425 } 1426 } 1427 } 1428 1429 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1430 uint32_t maxsz, TCGv_i32 in) 1431 { 1432 check_size_align(oprsz, maxsz, dofs); 1433 tcg_debug_assert(vece <= MO_32); 1434 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1435 } 1436 1437 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1438 uint32_t maxsz, TCGv_i64 in) 1439 { 1440 check_size_align(oprsz, maxsz, dofs); 1441 tcg_debug_assert(vece <= MO_64); 1442 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1443 } 1444 1445 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1446 uint32_t oprsz, uint32_t maxsz) 1447 { 1448 check_size_align(oprsz, maxsz, dofs); 1449 if (vece <= MO_64) { 1450 TCGType type = choose_vector_type(NULL, vece, oprsz, 0); 1451 if (type != 0) { 1452 TCGv_vec t_vec = tcg_temp_new_vec(type); 1453 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1454 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1455 tcg_temp_free_vec(t_vec); 1456 } else if (vece <= MO_32) { 1457 TCGv_i32 in = tcg_temp_new_i32(); 1458 switch (vece) { 1459 case MO_8: 1460 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1461 break; 1462 case MO_16: 1463 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1464 break; 1465 default: 1466 tcg_gen_ld_i32(in, cpu_env, aofs); 1467 break; 1468 } 1469 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1470 tcg_temp_free_i32(in); 1471 } else { 1472 TCGv_i64 in = tcg_temp_new_i64(); 1473 tcg_gen_ld_i64(in, cpu_env, aofs); 1474 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1475 tcg_temp_free_i64(in); 1476 } 1477 } else { 1478 /* 128-bit duplicate. */ 1479 /* ??? Dup to 256-bit vector. */ 1480 int i; 1481 1482 tcg_debug_assert(vece == 4); 1483 tcg_debug_assert(oprsz >= 16); 1484 if (TCG_TARGET_HAS_v128) { 1485 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1486 1487 tcg_gen_ld_vec(in, cpu_env, aofs); 1488 for (i = 0; i < oprsz; i += 16) { 1489 tcg_gen_st_vec(in, cpu_env, dofs + i); 1490 } 1491 tcg_temp_free_vec(in); 1492 } else { 1493 TCGv_i64 in0 = tcg_temp_new_i64(); 1494 TCGv_i64 in1 = tcg_temp_new_i64(); 1495 1496 tcg_gen_ld_i64(in0, cpu_env, aofs); 1497 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1498 for (i = 0; i < oprsz; i += 16) { 1499 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1500 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1501 } 1502 tcg_temp_free_i64(in0); 1503 tcg_temp_free_i64(in1); 1504 } 1505 if (oprsz < maxsz) { 1506 expand_clr(dofs + oprsz, maxsz - oprsz); 1507 } 1508 } 1509 } 1510 1511 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz, 1512 uint32_t maxsz, uint64_t x) 1513 { 1514 check_size_align(oprsz, maxsz, dofs); 1515 do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x); 1516 } 1517 1518 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz, 1519 uint32_t maxsz, uint32_t x) 1520 { 1521 check_size_align(oprsz, maxsz, dofs); 1522 do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x); 1523 } 1524 1525 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz, 1526 uint32_t maxsz, uint16_t x) 1527 { 1528 check_size_align(oprsz, maxsz, dofs); 1529 do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x); 1530 } 1531 1532 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz, 1533 uint32_t maxsz, uint8_t x) 1534 { 1535 check_size_align(oprsz, maxsz, dofs); 1536 do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x); 1537 } 1538 1539 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1540 uint32_t oprsz, uint32_t maxsz) 1541 { 1542 static const GVecGen2 g = { 1543 .fni8 = tcg_gen_not_i64, 1544 .fniv = tcg_gen_not_vec, 1545 .fno = gen_helper_gvec_not, 1546 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1547 }; 1548 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1549 } 1550 1551 /* Perform a vector addition using normal addition and a mask. The mask 1552 should be the sign bit of each lane. This 6-operation form is more 1553 efficient than separate additions when there are 4 or more lanes in 1554 the 64-bit operation. */ 1555 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1556 { 1557 TCGv_i64 t1 = tcg_temp_new_i64(); 1558 TCGv_i64 t2 = tcg_temp_new_i64(); 1559 TCGv_i64 t3 = tcg_temp_new_i64(); 1560 1561 tcg_gen_andc_i64(t1, a, m); 1562 tcg_gen_andc_i64(t2, b, m); 1563 tcg_gen_xor_i64(t3, a, b); 1564 tcg_gen_add_i64(d, t1, t2); 1565 tcg_gen_and_i64(t3, t3, m); 1566 tcg_gen_xor_i64(d, d, t3); 1567 1568 tcg_temp_free_i64(t1); 1569 tcg_temp_free_i64(t2); 1570 tcg_temp_free_i64(t3); 1571 } 1572 1573 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1574 { 1575 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1576 gen_addv_mask(d, a, b, m); 1577 tcg_temp_free_i64(m); 1578 } 1579 1580 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1581 { 1582 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1583 gen_addv_mask(d, a, b, m); 1584 tcg_temp_free_i64(m); 1585 } 1586 1587 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1588 { 1589 TCGv_i64 t1 = tcg_temp_new_i64(); 1590 TCGv_i64 t2 = tcg_temp_new_i64(); 1591 1592 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1593 tcg_gen_add_i64(t2, a, b); 1594 tcg_gen_add_i64(t1, t1, b); 1595 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1596 1597 tcg_temp_free_i64(t1); 1598 tcg_temp_free_i64(t2); 1599 } 1600 1601 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 }; 1602 1603 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1604 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1605 { 1606 static const GVecGen3 g[4] = { 1607 { .fni8 = tcg_gen_vec_add8_i64, 1608 .fniv = tcg_gen_add_vec, 1609 .fno = gen_helper_gvec_add8, 1610 .opt_opc = vecop_list_add, 1611 .vece = MO_8 }, 1612 { .fni8 = tcg_gen_vec_add16_i64, 1613 .fniv = tcg_gen_add_vec, 1614 .fno = gen_helper_gvec_add16, 1615 .opt_opc = vecop_list_add, 1616 .vece = MO_16 }, 1617 { .fni4 = tcg_gen_add_i32, 1618 .fniv = tcg_gen_add_vec, 1619 .fno = gen_helper_gvec_add32, 1620 .opt_opc = vecop_list_add, 1621 .vece = MO_32 }, 1622 { .fni8 = tcg_gen_add_i64, 1623 .fniv = tcg_gen_add_vec, 1624 .fno = gen_helper_gvec_add64, 1625 .opt_opc = vecop_list_add, 1626 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1627 .vece = MO_64 }, 1628 }; 1629 1630 tcg_debug_assert(vece <= MO_64); 1631 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1632 } 1633 1634 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1635 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1636 { 1637 static const GVecGen2s g[4] = { 1638 { .fni8 = tcg_gen_vec_add8_i64, 1639 .fniv = tcg_gen_add_vec, 1640 .fno = gen_helper_gvec_adds8, 1641 .opt_opc = vecop_list_add, 1642 .vece = MO_8 }, 1643 { .fni8 = tcg_gen_vec_add16_i64, 1644 .fniv = tcg_gen_add_vec, 1645 .fno = gen_helper_gvec_adds16, 1646 .opt_opc = vecop_list_add, 1647 .vece = MO_16 }, 1648 { .fni4 = tcg_gen_add_i32, 1649 .fniv = tcg_gen_add_vec, 1650 .fno = gen_helper_gvec_adds32, 1651 .opt_opc = vecop_list_add, 1652 .vece = MO_32 }, 1653 { .fni8 = tcg_gen_add_i64, 1654 .fniv = tcg_gen_add_vec, 1655 .fno = gen_helper_gvec_adds64, 1656 .opt_opc = vecop_list_add, 1657 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1658 .vece = MO_64 }, 1659 }; 1660 1661 tcg_debug_assert(vece <= MO_64); 1662 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1663 } 1664 1665 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1666 int64_t c, uint32_t oprsz, uint32_t maxsz) 1667 { 1668 TCGv_i64 tmp = tcg_const_i64(c); 1669 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1670 tcg_temp_free_i64(tmp); 1671 } 1672 1673 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 }; 1674 1675 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1676 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1677 { 1678 static const GVecGen2s g[4] = { 1679 { .fni8 = tcg_gen_vec_sub8_i64, 1680 .fniv = tcg_gen_sub_vec, 1681 .fno = gen_helper_gvec_subs8, 1682 .opt_opc = vecop_list_sub, 1683 .vece = MO_8 }, 1684 { .fni8 = tcg_gen_vec_sub16_i64, 1685 .fniv = tcg_gen_sub_vec, 1686 .fno = gen_helper_gvec_subs16, 1687 .opt_opc = vecop_list_sub, 1688 .vece = MO_16 }, 1689 { .fni4 = tcg_gen_sub_i32, 1690 .fniv = tcg_gen_sub_vec, 1691 .fno = gen_helper_gvec_subs32, 1692 .opt_opc = vecop_list_sub, 1693 .vece = MO_32 }, 1694 { .fni8 = tcg_gen_sub_i64, 1695 .fniv = tcg_gen_sub_vec, 1696 .fno = gen_helper_gvec_subs64, 1697 .opt_opc = vecop_list_sub, 1698 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1699 .vece = MO_64 }, 1700 }; 1701 1702 tcg_debug_assert(vece <= MO_64); 1703 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1704 } 1705 1706 /* Perform a vector subtraction using normal subtraction and a mask. 1707 Compare gen_addv_mask above. */ 1708 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1709 { 1710 TCGv_i64 t1 = tcg_temp_new_i64(); 1711 TCGv_i64 t2 = tcg_temp_new_i64(); 1712 TCGv_i64 t3 = tcg_temp_new_i64(); 1713 1714 tcg_gen_or_i64(t1, a, m); 1715 tcg_gen_andc_i64(t2, b, m); 1716 tcg_gen_eqv_i64(t3, a, b); 1717 tcg_gen_sub_i64(d, t1, t2); 1718 tcg_gen_and_i64(t3, t3, m); 1719 tcg_gen_xor_i64(d, d, t3); 1720 1721 tcg_temp_free_i64(t1); 1722 tcg_temp_free_i64(t2); 1723 tcg_temp_free_i64(t3); 1724 } 1725 1726 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1727 { 1728 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1729 gen_subv_mask(d, a, b, m); 1730 tcg_temp_free_i64(m); 1731 } 1732 1733 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1734 { 1735 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1736 gen_subv_mask(d, a, b, m); 1737 tcg_temp_free_i64(m); 1738 } 1739 1740 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1741 { 1742 TCGv_i64 t1 = tcg_temp_new_i64(); 1743 TCGv_i64 t2 = tcg_temp_new_i64(); 1744 1745 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1746 tcg_gen_sub_i64(t2, a, b); 1747 tcg_gen_sub_i64(t1, a, t1); 1748 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1749 1750 tcg_temp_free_i64(t1); 1751 tcg_temp_free_i64(t2); 1752 } 1753 1754 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1755 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1756 { 1757 static const GVecGen3 g[4] = { 1758 { .fni8 = tcg_gen_vec_sub8_i64, 1759 .fniv = tcg_gen_sub_vec, 1760 .fno = gen_helper_gvec_sub8, 1761 .opt_opc = vecop_list_sub, 1762 .vece = MO_8 }, 1763 { .fni8 = tcg_gen_vec_sub16_i64, 1764 .fniv = tcg_gen_sub_vec, 1765 .fno = gen_helper_gvec_sub16, 1766 .opt_opc = vecop_list_sub, 1767 .vece = MO_16 }, 1768 { .fni4 = tcg_gen_sub_i32, 1769 .fniv = tcg_gen_sub_vec, 1770 .fno = gen_helper_gvec_sub32, 1771 .opt_opc = vecop_list_sub, 1772 .vece = MO_32 }, 1773 { .fni8 = tcg_gen_sub_i64, 1774 .fniv = tcg_gen_sub_vec, 1775 .fno = gen_helper_gvec_sub64, 1776 .opt_opc = vecop_list_sub, 1777 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1778 .vece = MO_64 }, 1779 }; 1780 1781 tcg_debug_assert(vece <= MO_64); 1782 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1783 } 1784 1785 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 }; 1786 1787 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1788 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1789 { 1790 static const GVecGen3 g[4] = { 1791 { .fniv = tcg_gen_mul_vec, 1792 .fno = gen_helper_gvec_mul8, 1793 .opt_opc = vecop_list_mul, 1794 .vece = MO_8 }, 1795 { .fniv = tcg_gen_mul_vec, 1796 .fno = gen_helper_gvec_mul16, 1797 .opt_opc = vecop_list_mul, 1798 .vece = MO_16 }, 1799 { .fni4 = tcg_gen_mul_i32, 1800 .fniv = tcg_gen_mul_vec, 1801 .fno = gen_helper_gvec_mul32, 1802 .opt_opc = vecop_list_mul, 1803 .vece = MO_32 }, 1804 { .fni8 = tcg_gen_mul_i64, 1805 .fniv = tcg_gen_mul_vec, 1806 .fno = gen_helper_gvec_mul64, 1807 .opt_opc = vecop_list_mul, 1808 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1809 .vece = MO_64 }, 1810 }; 1811 1812 tcg_debug_assert(vece <= MO_64); 1813 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1814 } 1815 1816 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1817 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1818 { 1819 static const GVecGen2s g[4] = { 1820 { .fniv = tcg_gen_mul_vec, 1821 .fno = gen_helper_gvec_muls8, 1822 .opt_opc = vecop_list_mul, 1823 .vece = MO_8 }, 1824 { .fniv = tcg_gen_mul_vec, 1825 .fno = gen_helper_gvec_muls16, 1826 .opt_opc = vecop_list_mul, 1827 .vece = MO_16 }, 1828 { .fni4 = tcg_gen_mul_i32, 1829 .fniv = tcg_gen_mul_vec, 1830 .fno = gen_helper_gvec_muls32, 1831 .opt_opc = vecop_list_mul, 1832 .vece = MO_32 }, 1833 { .fni8 = tcg_gen_mul_i64, 1834 .fniv = tcg_gen_mul_vec, 1835 .fno = gen_helper_gvec_muls64, 1836 .opt_opc = vecop_list_mul, 1837 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1838 .vece = MO_64 }, 1839 }; 1840 1841 tcg_debug_assert(vece <= MO_64); 1842 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1843 } 1844 1845 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1846 int64_t c, uint32_t oprsz, uint32_t maxsz) 1847 { 1848 TCGv_i64 tmp = tcg_const_i64(c); 1849 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1850 tcg_temp_free_i64(tmp); 1851 } 1852 1853 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1854 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1855 { 1856 static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 }; 1857 static const GVecGen3 g[4] = { 1858 { .fniv = tcg_gen_ssadd_vec, 1859 .fno = gen_helper_gvec_ssadd8, 1860 .opt_opc = vecop_list, 1861 .vece = MO_8 }, 1862 { .fniv = tcg_gen_ssadd_vec, 1863 .fno = gen_helper_gvec_ssadd16, 1864 .opt_opc = vecop_list, 1865 .vece = MO_16 }, 1866 { .fniv = tcg_gen_ssadd_vec, 1867 .fno = gen_helper_gvec_ssadd32, 1868 .opt_opc = vecop_list, 1869 .vece = MO_32 }, 1870 { .fniv = tcg_gen_ssadd_vec, 1871 .fno = gen_helper_gvec_ssadd64, 1872 .opt_opc = vecop_list, 1873 .vece = MO_64 }, 1874 }; 1875 tcg_debug_assert(vece <= MO_64); 1876 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1877 } 1878 1879 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 1880 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1881 { 1882 static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 }; 1883 static const GVecGen3 g[4] = { 1884 { .fniv = tcg_gen_sssub_vec, 1885 .fno = gen_helper_gvec_sssub8, 1886 .opt_opc = vecop_list, 1887 .vece = MO_8 }, 1888 { .fniv = tcg_gen_sssub_vec, 1889 .fno = gen_helper_gvec_sssub16, 1890 .opt_opc = vecop_list, 1891 .vece = MO_16 }, 1892 { .fniv = tcg_gen_sssub_vec, 1893 .fno = gen_helper_gvec_sssub32, 1894 .opt_opc = vecop_list, 1895 .vece = MO_32 }, 1896 { .fniv = tcg_gen_sssub_vec, 1897 .fno = gen_helper_gvec_sssub64, 1898 .opt_opc = vecop_list, 1899 .vece = MO_64 }, 1900 }; 1901 tcg_debug_assert(vece <= MO_64); 1902 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1903 } 1904 1905 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1906 { 1907 TCGv_i32 max = tcg_const_i32(-1); 1908 tcg_gen_add_i32(d, a, b); 1909 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 1910 tcg_temp_free_i32(max); 1911 } 1912 1913 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1914 { 1915 TCGv_i64 max = tcg_const_i64(-1); 1916 tcg_gen_add_i64(d, a, b); 1917 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 1918 tcg_temp_free_i64(max); 1919 } 1920 1921 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1922 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1923 { 1924 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 1925 static const GVecGen3 g[4] = { 1926 { .fniv = tcg_gen_usadd_vec, 1927 .fno = gen_helper_gvec_usadd8, 1928 .opt_opc = vecop_list, 1929 .vece = MO_8 }, 1930 { .fniv = tcg_gen_usadd_vec, 1931 .fno = gen_helper_gvec_usadd16, 1932 .opt_opc = vecop_list, 1933 .vece = MO_16 }, 1934 { .fni4 = tcg_gen_usadd_i32, 1935 .fniv = tcg_gen_usadd_vec, 1936 .fno = gen_helper_gvec_usadd32, 1937 .opt_opc = vecop_list, 1938 .vece = MO_32 }, 1939 { .fni8 = tcg_gen_usadd_i64, 1940 .fniv = tcg_gen_usadd_vec, 1941 .fno = gen_helper_gvec_usadd64, 1942 .opt_opc = vecop_list, 1943 .vece = MO_64 } 1944 }; 1945 tcg_debug_assert(vece <= MO_64); 1946 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1947 } 1948 1949 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1950 { 1951 TCGv_i32 min = tcg_const_i32(0); 1952 tcg_gen_sub_i32(d, a, b); 1953 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 1954 tcg_temp_free_i32(min); 1955 } 1956 1957 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1958 { 1959 TCGv_i64 min = tcg_const_i64(0); 1960 tcg_gen_sub_i64(d, a, b); 1961 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 1962 tcg_temp_free_i64(min); 1963 } 1964 1965 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 1966 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1967 { 1968 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 1969 static const GVecGen3 g[4] = { 1970 { .fniv = tcg_gen_ussub_vec, 1971 .fno = gen_helper_gvec_ussub8, 1972 .opt_opc = vecop_list, 1973 .vece = MO_8 }, 1974 { .fniv = tcg_gen_ussub_vec, 1975 .fno = gen_helper_gvec_ussub16, 1976 .opt_opc = vecop_list, 1977 .vece = MO_16 }, 1978 { .fni4 = tcg_gen_ussub_i32, 1979 .fniv = tcg_gen_ussub_vec, 1980 .fno = gen_helper_gvec_ussub32, 1981 .opt_opc = vecop_list, 1982 .vece = MO_32 }, 1983 { .fni8 = tcg_gen_ussub_i64, 1984 .fniv = tcg_gen_ussub_vec, 1985 .fno = gen_helper_gvec_ussub64, 1986 .opt_opc = vecop_list, 1987 .vece = MO_64 } 1988 }; 1989 tcg_debug_assert(vece <= MO_64); 1990 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1991 } 1992 1993 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 1994 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1995 { 1996 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 1997 static const GVecGen3 g[4] = { 1998 { .fniv = tcg_gen_smin_vec, 1999 .fno = gen_helper_gvec_smin8, 2000 .opt_opc = vecop_list, 2001 .vece = MO_8 }, 2002 { .fniv = tcg_gen_smin_vec, 2003 .fno = gen_helper_gvec_smin16, 2004 .opt_opc = vecop_list, 2005 .vece = MO_16 }, 2006 { .fni4 = tcg_gen_smin_i32, 2007 .fniv = tcg_gen_smin_vec, 2008 .fno = gen_helper_gvec_smin32, 2009 .opt_opc = vecop_list, 2010 .vece = MO_32 }, 2011 { .fni8 = tcg_gen_smin_i64, 2012 .fniv = tcg_gen_smin_vec, 2013 .fno = gen_helper_gvec_smin64, 2014 .opt_opc = vecop_list, 2015 .vece = MO_64 } 2016 }; 2017 tcg_debug_assert(vece <= MO_64); 2018 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2019 } 2020 2021 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2022 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2023 { 2024 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2025 static const GVecGen3 g[4] = { 2026 { .fniv = tcg_gen_umin_vec, 2027 .fno = gen_helper_gvec_umin8, 2028 .opt_opc = vecop_list, 2029 .vece = MO_8 }, 2030 { .fniv = tcg_gen_umin_vec, 2031 .fno = gen_helper_gvec_umin16, 2032 .opt_opc = vecop_list, 2033 .vece = MO_16 }, 2034 { .fni4 = tcg_gen_umin_i32, 2035 .fniv = tcg_gen_umin_vec, 2036 .fno = gen_helper_gvec_umin32, 2037 .opt_opc = vecop_list, 2038 .vece = MO_32 }, 2039 { .fni8 = tcg_gen_umin_i64, 2040 .fniv = tcg_gen_umin_vec, 2041 .fno = gen_helper_gvec_umin64, 2042 .opt_opc = vecop_list, 2043 .vece = MO_64 } 2044 }; 2045 tcg_debug_assert(vece <= MO_64); 2046 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2047 } 2048 2049 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2050 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2051 { 2052 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2053 static const GVecGen3 g[4] = { 2054 { .fniv = tcg_gen_smax_vec, 2055 .fno = gen_helper_gvec_smax8, 2056 .opt_opc = vecop_list, 2057 .vece = MO_8 }, 2058 { .fniv = tcg_gen_smax_vec, 2059 .fno = gen_helper_gvec_smax16, 2060 .opt_opc = vecop_list, 2061 .vece = MO_16 }, 2062 { .fni4 = tcg_gen_smax_i32, 2063 .fniv = tcg_gen_smax_vec, 2064 .fno = gen_helper_gvec_smax32, 2065 .opt_opc = vecop_list, 2066 .vece = MO_32 }, 2067 { .fni8 = tcg_gen_smax_i64, 2068 .fniv = tcg_gen_smax_vec, 2069 .fno = gen_helper_gvec_smax64, 2070 .opt_opc = vecop_list, 2071 .vece = MO_64 } 2072 }; 2073 tcg_debug_assert(vece <= MO_64); 2074 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2075 } 2076 2077 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2078 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2079 { 2080 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2081 static const GVecGen3 g[4] = { 2082 { .fniv = tcg_gen_umax_vec, 2083 .fno = gen_helper_gvec_umax8, 2084 .opt_opc = vecop_list, 2085 .vece = MO_8 }, 2086 { .fniv = tcg_gen_umax_vec, 2087 .fno = gen_helper_gvec_umax16, 2088 .opt_opc = vecop_list, 2089 .vece = MO_16 }, 2090 { .fni4 = tcg_gen_umax_i32, 2091 .fniv = tcg_gen_umax_vec, 2092 .fno = gen_helper_gvec_umax32, 2093 .opt_opc = vecop_list, 2094 .vece = MO_32 }, 2095 { .fni8 = tcg_gen_umax_i64, 2096 .fniv = tcg_gen_umax_vec, 2097 .fno = gen_helper_gvec_umax64, 2098 .opt_opc = vecop_list, 2099 .vece = MO_64 } 2100 }; 2101 tcg_debug_assert(vece <= MO_64); 2102 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2103 } 2104 2105 /* Perform a vector negation using normal negation and a mask. 2106 Compare gen_subv_mask above. */ 2107 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2108 { 2109 TCGv_i64 t2 = tcg_temp_new_i64(); 2110 TCGv_i64 t3 = tcg_temp_new_i64(); 2111 2112 tcg_gen_andc_i64(t3, m, b); 2113 tcg_gen_andc_i64(t2, b, m); 2114 tcg_gen_sub_i64(d, m, t2); 2115 tcg_gen_xor_i64(d, d, t3); 2116 2117 tcg_temp_free_i64(t2); 2118 tcg_temp_free_i64(t3); 2119 } 2120 2121 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2122 { 2123 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2124 gen_negv_mask(d, b, m); 2125 tcg_temp_free_i64(m); 2126 } 2127 2128 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2129 { 2130 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2131 gen_negv_mask(d, b, m); 2132 tcg_temp_free_i64(m); 2133 } 2134 2135 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2136 { 2137 TCGv_i64 t1 = tcg_temp_new_i64(); 2138 TCGv_i64 t2 = tcg_temp_new_i64(); 2139 2140 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2141 tcg_gen_neg_i64(t2, b); 2142 tcg_gen_neg_i64(t1, t1); 2143 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2144 2145 tcg_temp_free_i64(t1); 2146 tcg_temp_free_i64(t2); 2147 } 2148 2149 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2150 uint32_t oprsz, uint32_t maxsz) 2151 { 2152 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2153 static const GVecGen2 g[4] = { 2154 { .fni8 = tcg_gen_vec_neg8_i64, 2155 .fniv = tcg_gen_neg_vec, 2156 .fno = gen_helper_gvec_neg8, 2157 .opt_opc = vecop_list, 2158 .vece = MO_8 }, 2159 { .fni8 = tcg_gen_vec_neg16_i64, 2160 .fniv = tcg_gen_neg_vec, 2161 .fno = gen_helper_gvec_neg16, 2162 .opt_opc = vecop_list, 2163 .vece = MO_16 }, 2164 { .fni4 = tcg_gen_neg_i32, 2165 .fniv = tcg_gen_neg_vec, 2166 .fno = gen_helper_gvec_neg32, 2167 .opt_opc = vecop_list, 2168 .vece = MO_32 }, 2169 { .fni8 = tcg_gen_neg_i64, 2170 .fniv = tcg_gen_neg_vec, 2171 .fno = gen_helper_gvec_neg64, 2172 .opt_opc = vecop_list, 2173 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2174 .vece = MO_64 }, 2175 }; 2176 2177 tcg_debug_assert(vece <= MO_64); 2178 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2179 } 2180 2181 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2182 { 2183 TCGv_i64 t = tcg_temp_new_i64(); 2184 int nbit = 8 << vece; 2185 2186 /* Create -1 for each negative element. */ 2187 tcg_gen_shri_i64(t, b, nbit - 1); 2188 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2189 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2190 2191 /* 2192 * Invert (via xor -1) and add one (via sub -1). 2193 * Because of the ordering the msb is cleared, 2194 * so we never have carry into the next element. 2195 */ 2196 tcg_gen_xor_i64(d, b, t); 2197 tcg_gen_sub_i64(d, d, t); 2198 2199 tcg_temp_free_i64(t); 2200 } 2201 2202 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2203 { 2204 gen_absv_mask(d, b, MO_8); 2205 } 2206 2207 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2208 { 2209 gen_absv_mask(d, b, MO_16); 2210 } 2211 2212 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2213 uint32_t oprsz, uint32_t maxsz) 2214 { 2215 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2216 static const GVecGen2 g[4] = { 2217 { .fni8 = tcg_gen_vec_abs8_i64, 2218 .fniv = tcg_gen_abs_vec, 2219 .fno = gen_helper_gvec_abs8, 2220 .opt_opc = vecop_list, 2221 .vece = MO_8 }, 2222 { .fni8 = tcg_gen_vec_abs16_i64, 2223 .fniv = tcg_gen_abs_vec, 2224 .fno = gen_helper_gvec_abs16, 2225 .opt_opc = vecop_list, 2226 .vece = MO_16 }, 2227 { .fni4 = tcg_gen_abs_i32, 2228 .fniv = tcg_gen_abs_vec, 2229 .fno = gen_helper_gvec_abs32, 2230 .opt_opc = vecop_list, 2231 .vece = MO_32 }, 2232 { .fni8 = tcg_gen_abs_i64, 2233 .fniv = tcg_gen_abs_vec, 2234 .fno = gen_helper_gvec_abs64, 2235 .opt_opc = vecop_list, 2236 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2237 .vece = MO_64 }, 2238 }; 2239 2240 tcg_debug_assert(vece <= MO_64); 2241 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2242 } 2243 2244 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2245 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2246 { 2247 static const GVecGen3 g = { 2248 .fni8 = tcg_gen_and_i64, 2249 .fniv = tcg_gen_and_vec, 2250 .fno = gen_helper_gvec_and, 2251 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2252 }; 2253 2254 if (aofs == bofs) { 2255 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2256 } else { 2257 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2258 } 2259 } 2260 2261 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2262 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2263 { 2264 static const GVecGen3 g = { 2265 .fni8 = tcg_gen_or_i64, 2266 .fniv = tcg_gen_or_vec, 2267 .fno = gen_helper_gvec_or, 2268 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2269 }; 2270 2271 if (aofs == bofs) { 2272 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2273 } else { 2274 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2275 } 2276 } 2277 2278 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2279 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2280 { 2281 static const GVecGen3 g = { 2282 .fni8 = tcg_gen_xor_i64, 2283 .fniv = tcg_gen_xor_vec, 2284 .fno = gen_helper_gvec_xor, 2285 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2286 }; 2287 2288 if (aofs == bofs) { 2289 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2290 } else { 2291 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2292 } 2293 } 2294 2295 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2296 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2297 { 2298 static const GVecGen3 g = { 2299 .fni8 = tcg_gen_andc_i64, 2300 .fniv = tcg_gen_andc_vec, 2301 .fno = gen_helper_gvec_andc, 2302 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2303 }; 2304 2305 if (aofs == bofs) { 2306 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2307 } else { 2308 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2309 } 2310 } 2311 2312 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2313 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2314 { 2315 static const GVecGen3 g = { 2316 .fni8 = tcg_gen_orc_i64, 2317 .fniv = tcg_gen_orc_vec, 2318 .fno = gen_helper_gvec_orc, 2319 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2320 }; 2321 2322 if (aofs == bofs) { 2323 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2324 } else { 2325 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2326 } 2327 } 2328 2329 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2330 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2331 { 2332 static const GVecGen3 g = { 2333 .fni8 = tcg_gen_nand_i64, 2334 .fniv = tcg_gen_nand_vec, 2335 .fno = gen_helper_gvec_nand, 2336 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2337 }; 2338 2339 if (aofs == bofs) { 2340 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2341 } else { 2342 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2343 } 2344 } 2345 2346 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2347 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2348 { 2349 static const GVecGen3 g = { 2350 .fni8 = tcg_gen_nor_i64, 2351 .fniv = tcg_gen_nor_vec, 2352 .fno = gen_helper_gvec_nor, 2353 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2354 }; 2355 2356 if (aofs == bofs) { 2357 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2358 } else { 2359 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2360 } 2361 } 2362 2363 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2364 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2365 { 2366 static const GVecGen3 g = { 2367 .fni8 = tcg_gen_eqv_i64, 2368 .fniv = tcg_gen_eqv_vec, 2369 .fno = gen_helper_gvec_eqv, 2370 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2371 }; 2372 2373 if (aofs == bofs) { 2374 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2375 } else { 2376 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2377 } 2378 } 2379 2380 static const GVecGen2s gop_ands = { 2381 .fni8 = tcg_gen_and_i64, 2382 .fniv = tcg_gen_and_vec, 2383 .fno = gen_helper_gvec_ands, 2384 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2385 .vece = MO_64 2386 }; 2387 2388 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2389 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2390 { 2391 TCGv_i64 tmp = tcg_temp_new_i64(); 2392 gen_dup_i64(vece, tmp, c); 2393 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2394 tcg_temp_free_i64(tmp); 2395 } 2396 2397 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2398 int64_t c, uint32_t oprsz, uint32_t maxsz) 2399 { 2400 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2401 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2402 tcg_temp_free_i64(tmp); 2403 } 2404 2405 static const GVecGen2s gop_xors = { 2406 .fni8 = tcg_gen_xor_i64, 2407 .fniv = tcg_gen_xor_vec, 2408 .fno = gen_helper_gvec_xors, 2409 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2410 .vece = MO_64 2411 }; 2412 2413 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2414 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2415 { 2416 TCGv_i64 tmp = tcg_temp_new_i64(); 2417 gen_dup_i64(vece, tmp, c); 2418 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2419 tcg_temp_free_i64(tmp); 2420 } 2421 2422 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2423 int64_t c, uint32_t oprsz, uint32_t maxsz) 2424 { 2425 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2426 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2427 tcg_temp_free_i64(tmp); 2428 } 2429 2430 static const GVecGen2s gop_ors = { 2431 .fni8 = tcg_gen_or_i64, 2432 .fniv = tcg_gen_or_vec, 2433 .fno = gen_helper_gvec_ors, 2434 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2435 .vece = MO_64 2436 }; 2437 2438 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2439 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2440 { 2441 TCGv_i64 tmp = tcg_temp_new_i64(); 2442 gen_dup_i64(vece, tmp, c); 2443 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2444 tcg_temp_free_i64(tmp); 2445 } 2446 2447 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2448 int64_t c, uint32_t oprsz, uint32_t maxsz) 2449 { 2450 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2451 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2452 tcg_temp_free_i64(tmp); 2453 } 2454 2455 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2456 { 2457 uint64_t mask = dup_const(MO_8, 0xff << c); 2458 tcg_gen_shli_i64(d, a, c); 2459 tcg_gen_andi_i64(d, d, mask); 2460 } 2461 2462 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2463 { 2464 uint64_t mask = dup_const(MO_16, 0xffff << c); 2465 tcg_gen_shli_i64(d, a, c); 2466 tcg_gen_andi_i64(d, d, mask); 2467 } 2468 2469 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2470 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2471 { 2472 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2473 static const GVecGen2i g[4] = { 2474 { .fni8 = tcg_gen_vec_shl8i_i64, 2475 .fniv = tcg_gen_shli_vec, 2476 .fno = gen_helper_gvec_shl8i, 2477 .opt_opc = vecop_list, 2478 .vece = MO_8 }, 2479 { .fni8 = tcg_gen_vec_shl16i_i64, 2480 .fniv = tcg_gen_shli_vec, 2481 .fno = gen_helper_gvec_shl16i, 2482 .opt_opc = vecop_list, 2483 .vece = MO_16 }, 2484 { .fni4 = tcg_gen_shli_i32, 2485 .fniv = tcg_gen_shli_vec, 2486 .fno = gen_helper_gvec_shl32i, 2487 .opt_opc = vecop_list, 2488 .vece = MO_32 }, 2489 { .fni8 = tcg_gen_shli_i64, 2490 .fniv = tcg_gen_shli_vec, 2491 .fno = gen_helper_gvec_shl64i, 2492 .opt_opc = vecop_list, 2493 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2494 .vece = MO_64 }, 2495 }; 2496 2497 tcg_debug_assert(vece <= MO_64); 2498 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2499 if (shift == 0) { 2500 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2501 } else { 2502 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2503 } 2504 } 2505 2506 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2507 { 2508 uint64_t mask = dup_const(MO_8, 0xff >> c); 2509 tcg_gen_shri_i64(d, a, c); 2510 tcg_gen_andi_i64(d, d, mask); 2511 } 2512 2513 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2514 { 2515 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2516 tcg_gen_shri_i64(d, a, c); 2517 tcg_gen_andi_i64(d, d, mask); 2518 } 2519 2520 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2521 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2522 { 2523 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2524 static const GVecGen2i g[4] = { 2525 { .fni8 = tcg_gen_vec_shr8i_i64, 2526 .fniv = tcg_gen_shri_vec, 2527 .fno = gen_helper_gvec_shr8i, 2528 .opt_opc = vecop_list, 2529 .vece = MO_8 }, 2530 { .fni8 = tcg_gen_vec_shr16i_i64, 2531 .fniv = tcg_gen_shri_vec, 2532 .fno = gen_helper_gvec_shr16i, 2533 .opt_opc = vecop_list, 2534 .vece = MO_16 }, 2535 { .fni4 = tcg_gen_shri_i32, 2536 .fniv = tcg_gen_shri_vec, 2537 .fno = gen_helper_gvec_shr32i, 2538 .opt_opc = vecop_list, 2539 .vece = MO_32 }, 2540 { .fni8 = tcg_gen_shri_i64, 2541 .fniv = tcg_gen_shri_vec, 2542 .fno = gen_helper_gvec_shr64i, 2543 .opt_opc = vecop_list, 2544 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2545 .vece = MO_64 }, 2546 }; 2547 2548 tcg_debug_assert(vece <= MO_64); 2549 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2550 if (shift == 0) { 2551 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2552 } else { 2553 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2554 } 2555 } 2556 2557 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2558 { 2559 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2560 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2561 TCGv_i64 s = tcg_temp_new_i64(); 2562 2563 tcg_gen_shri_i64(d, a, c); 2564 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2565 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2566 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2567 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2568 tcg_temp_free_i64(s); 2569 } 2570 2571 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2572 { 2573 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2574 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2575 TCGv_i64 s = tcg_temp_new_i64(); 2576 2577 tcg_gen_shri_i64(d, a, c); 2578 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2579 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2580 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2581 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2582 tcg_temp_free_i64(s); 2583 } 2584 2585 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2586 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2587 { 2588 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2589 static const GVecGen2i g[4] = { 2590 { .fni8 = tcg_gen_vec_sar8i_i64, 2591 .fniv = tcg_gen_sari_vec, 2592 .fno = gen_helper_gvec_sar8i, 2593 .opt_opc = vecop_list, 2594 .vece = MO_8 }, 2595 { .fni8 = tcg_gen_vec_sar16i_i64, 2596 .fniv = tcg_gen_sari_vec, 2597 .fno = gen_helper_gvec_sar16i, 2598 .opt_opc = vecop_list, 2599 .vece = MO_16 }, 2600 { .fni4 = tcg_gen_sari_i32, 2601 .fniv = tcg_gen_sari_vec, 2602 .fno = gen_helper_gvec_sar32i, 2603 .opt_opc = vecop_list, 2604 .vece = MO_32 }, 2605 { .fni8 = tcg_gen_sari_i64, 2606 .fniv = tcg_gen_sari_vec, 2607 .fno = gen_helper_gvec_sar64i, 2608 .opt_opc = vecop_list, 2609 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2610 .vece = MO_64 }, 2611 }; 2612 2613 tcg_debug_assert(vece <= MO_64); 2614 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2615 if (shift == 0) { 2616 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2617 } else { 2618 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2619 } 2620 } 2621 2622 /* 2623 * Specialized generation vector shifts by a non-constant scalar. 2624 */ 2625 2626 typedef struct { 2627 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2628 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2629 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2630 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2631 gen_helper_gvec_2 *fno[4]; 2632 TCGOpcode s_list[2]; 2633 TCGOpcode v_list[2]; 2634 } GVecGen2sh; 2635 2636 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2637 uint32_t oprsz, uint32_t tysz, TCGType type, 2638 TCGv_i32 shift, 2639 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2640 { 2641 TCGv_vec t0 = tcg_temp_new_vec(type); 2642 uint32_t i; 2643 2644 for (i = 0; i < oprsz; i += tysz) { 2645 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2646 fni(vece, t0, t0, shift); 2647 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2648 } 2649 tcg_temp_free_vec(t0); 2650 } 2651 2652 static void 2653 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2654 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2655 { 2656 TCGType type; 2657 uint32_t some; 2658 2659 check_size_align(oprsz, maxsz, dofs | aofs); 2660 check_overlap_2(dofs, aofs, maxsz); 2661 2662 /* If the backend has a scalar expansion, great. */ 2663 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2664 if (type) { 2665 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2666 switch (type) { 2667 case TCG_TYPE_V256: 2668 some = QEMU_ALIGN_DOWN(oprsz, 32); 2669 expand_2sh_vec(vece, dofs, aofs, some, 32, 2670 TCG_TYPE_V256, shift, g->fniv_s); 2671 if (some == oprsz) { 2672 break; 2673 } 2674 dofs += some; 2675 aofs += some; 2676 oprsz -= some; 2677 maxsz -= some; 2678 /* fallthru */ 2679 case TCG_TYPE_V128: 2680 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2681 TCG_TYPE_V128, shift, g->fniv_s); 2682 break; 2683 case TCG_TYPE_V64: 2684 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2685 TCG_TYPE_V64, shift, g->fniv_s); 2686 break; 2687 default: 2688 g_assert_not_reached(); 2689 } 2690 tcg_swap_vecop_list(hold_list); 2691 goto clear_tail; 2692 } 2693 2694 /* If the backend supports variable vector shifts, also cool. */ 2695 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2696 if (type) { 2697 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2698 TCGv_vec v_shift = tcg_temp_new_vec(type); 2699 2700 if (vece == MO_64) { 2701 TCGv_i64 sh64 = tcg_temp_new_i64(); 2702 tcg_gen_extu_i32_i64(sh64, shift); 2703 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2704 tcg_temp_free_i64(sh64); 2705 } else { 2706 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2707 } 2708 2709 switch (type) { 2710 case TCG_TYPE_V256: 2711 some = QEMU_ALIGN_DOWN(oprsz, 32); 2712 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2713 v_shift, false, g->fniv_v); 2714 if (some == oprsz) { 2715 break; 2716 } 2717 dofs += some; 2718 aofs += some; 2719 oprsz -= some; 2720 maxsz -= some; 2721 /* fallthru */ 2722 case TCG_TYPE_V128: 2723 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2724 v_shift, false, g->fniv_v); 2725 break; 2726 case TCG_TYPE_V64: 2727 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2728 v_shift, false, g->fniv_v); 2729 break; 2730 default: 2731 g_assert_not_reached(); 2732 } 2733 tcg_temp_free_vec(v_shift); 2734 tcg_swap_vecop_list(hold_list); 2735 goto clear_tail; 2736 } 2737 2738 /* Otherwise fall back to integral... */ 2739 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2740 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2741 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2742 TCGv_i64 sh64 = tcg_temp_new_i64(); 2743 tcg_gen_extu_i32_i64(sh64, shift); 2744 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2745 tcg_temp_free_i64(sh64); 2746 } else { 2747 TCGv_ptr a0 = tcg_temp_new_ptr(); 2748 TCGv_ptr a1 = tcg_temp_new_ptr(); 2749 TCGv_i32 desc = tcg_temp_new_i32(); 2750 2751 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2752 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2753 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2754 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2755 2756 g->fno[vece](a0, a1, desc); 2757 2758 tcg_temp_free_ptr(a0); 2759 tcg_temp_free_ptr(a1); 2760 tcg_temp_free_i32(desc); 2761 return; 2762 } 2763 2764 clear_tail: 2765 if (oprsz < maxsz) { 2766 expand_clr(dofs + oprsz, maxsz - oprsz); 2767 } 2768 } 2769 2770 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 2771 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2772 { 2773 static const GVecGen2sh g = { 2774 .fni4 = tcg_gen_shl_i32, 2775 .fni8 = tcg_gen_shl_i64, 2776 .fniv_s = tcg_gen_shls_vec, 2777 .fniv_v = tcg_gen_shlv_vec, 2778 .fno = { 2779 gen_helper_gvec_shl8i, 2780 gen_helper_gvec_shl16i, 2781 gen_helper_gvec_shl32i, 2782 gen_helper_gvec_shl64i, 2783 }, 2784 .s_list = { INDEX_op_shls_vec, 0 }, 2785 .v_list = { INDEX_op_shlv_vec, 0 }, 2786 }; 2787 2788 tcg_debug_assert(vece <= MO_64); 2789 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2790 } 2791 2792 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 2793 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2794 { 2795 static const GVecGen2sh g = { 2796 .fni4 = tcg_gen_shr_i32, 2797 .fni8 = tcg_gen_shr_i64, 2798 .fniv_s = tcg_gen_shrs_vec, 2799 .fniv_v = tcg_gen_shrv_vec, 2800 .fno = { 2801 gen_helper_gvec_shr8i, 2802 gen_helper_gvec_shr16i, 2803 gen_helper_gvec_shr32i, 2804 gen_helper_gvec_shr64i, 2805 }, 2806 .s_list = { INDEX_op_shrs_vec, 0 }, 2807 .v_list = { INDEX_op_shrv_vec, 0 }, 2808 }; 2809 2810 tcg_debug_assert(vece <= MO_64); 2811 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2812 } 2813 2814 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 2815 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2816 { 2817 static const GVecGen2sh g = { 2818 .fni4 = tcg_gen_sar_i32, 2819 .fni8 = tcg_gen_sar_i64, 2820 .fniv_s = tcg_gen_sars_vec, 2821 .fniv_v = tcg_gen_sarv_vec, 2822 .fno = { 2823 gen_helper_gvec_sar8i, 2824 gen_helper_gvec_sar16i, 2825 gen_helper_gvec_sar32i, 2826 gen_helper_gvec_sar64i, 2827 }, 2828 .s_list = { INDEX_op_sars_vec, 0 }, 2829 .v_list = { INDEX_op_sarv_vec, 0 }, 2830 }; 2831 2832 tcg_debug_assert(vece <= MO_64); 2833 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2834 } 2835 2836 /* 2837 * Expand D = A << (B % element bits) 2838 * 2839 * Unlike scalar shifts, where it is easy for the target front end 2840 * to include the modulo as part of the expansion. If the target 2841 * naturally includes the modulo as part of the operation, great! 2842 * If the target has some other behaviour from out-of-range shifts, 2843 * then it could not use this function anyway, and would need to 2844 * do it's own expansion with custom functions. 2845 */ 2846 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d, 2847 TCGv_vec a, TCGv_vec b) 2848 { 2849 TCGv_vec t = tcg_temp_new_vec_matching(d); 2850 2851 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2852 tcg_gen_and_vec(vece, t, t, b); 2853 tcg_gen_shlv_vec(vece, d, a, t); 2854 tcg_temp_free_vec(t); 2855 } 2856 2857 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2858 { 2859 TCGv_i32 t = tcg_temp_new_i32(); 2860 2861 tcg_gen_andi_i32(t, b, 31); 2862 tcg_gen_shl_i32(d, a, t); 2863 tcg_temp_free_i32(t); 2864 } 2865 2866 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2867 { 2868 TCGv_i64 t = tcg_temp_new_i64(); 2869 2870 tcg_gen_andi_i64(t, b, 63); 2871 tcg_gen_shl_i64(d, a, t); 2872 tcg_temp_free_i64(t); 2873 } 2874 2875 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs, 2876 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2877 { 2878 static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 }; 2879 static const GVecGen3 g[4] = { 2880 { .fniv = tcg_gen_shlv_mod_vec, 2881 .fno = gen_helper_gvec_shl8v, 2882 .opt_opc = vecop_list, 2883 .vece = MO_8 }, 2884 { .fniv = tcg_gen_shlv_mod_vec, 2885 .fno = gen_helper_gvec_shl16v, 2886 .opt_opc = vecop_list, 2887 .vece = MO_16 }, 2888 { .fni4 = tcg_gen_shl_mod_i32, 2889 .fniv = tcg_gen_shlv_mod_vec, 2890 .fno = gen_helper_gvec_shl32v, 2891 .opt_opc = vecop_list, 2892 .vece = MO_32 }, 2893 { .fni8 = tcg_gen_shl_mod_i64, 2894 .fniv = tcg_gen_shlv_mod_vec, 2895 .fno = gen_helper_gvec_shl64v, 2896 .opt_opc = vecop_list, 2897 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2898 .vece = MO_64 }, 2899 }; 2900 2901 tcg_debug_assert(vece <= MO_64); 2902 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2903 } 2904 2905 /* 2906 * Similarly for logical right shifts. 2907 */ 2908 2909 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 2910 TCGv_vec a, TCGv_vec b) 2911 { 2912 TCGv_vec t = tcg_temp_new_vec_matching(d); 2913 2914 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2915 tcg_gen_and_vec(vece, t, t, b); 2916 tcg_gen_shrv_vec(vece, d, a, t); 2917 tcg_temp_free_vec(t); 2918 } 2919 2920 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2921 { 2922 TCGv_i32 t = tcg_temp_new_i32(); 2923 2924 tcg_gen_andi_i32(t, b, 31); 2925 tcg_gen_shr_i32(d, a, t); 2926 tcg_temp_free_i32(t); 2927 } 2928 2929 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2930 { 2931 TCGv_i64 t = tcg_temp_new_i64(); 2932 2933 tcg_gen_andi_i64(t, b, 63); 2934 tcg_gen_shr_i64(d, a, t); 2935 tcg_temp_free_i64(t); 2936 } 2937 2938 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 2939 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2940 { 2941 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 2942 static const GVecGen3 g[4] = { 2943 { .fniv = tcg_gen_shrv_mod_vec, 2944 .fno = gen_helper_gvec_shr8v, 2945 .opt_opc = vecop_list, 2946 .vece = MO_8 }, 2947 { .fniv = tcg_gen_shrv_mod_vec, 2948 .fno = gen_helper_gvec_shr16v, 2949 .opt_opc = vecop_list, 2950 .vece = MO_16 }, 2951 { .fni4 = tcg_gen_shr_mod_i32, 2952 .fniv = tcg_gen_shrv_mod_vec, 2953 .fno = gen_helper_gvec_shr32v, 2954 .opt_opc = vecop_list, 2955 .vece = MO_32 }, 2956 { .fni8 = tcg_gen_shr_mod_i64, 2957 .fniv = tcg_gen_shrv_mod_vec, 2958 .fno = gen_helper_gvec_shr64v, 2959 .opt_opc = vecop_list, 2960 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2961 .vece = MO_64 }, 2962 }; 2963 2964 tcg_debug_assert(vece <= MO_64); 2965 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2966 } 2967 2968 /* 2969 * Similarly for arithmetic right shifts. 2970 */ 2971 2972 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 2973 TCGv_vec a, TCGv_vec b) 2974 { 2975 TCGv_vec t = tcg_temp_new_vec_matching(d); 2976 2977 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2978 tcg_gen_and_vec(vece, t, t, b); 2979 tcg_gen_sarv_vec(vece, d, a, t); 2980 tcg_temp_free_vec(t); 2981 } 2982 2983 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2984 { 2985 TCGv_i32 t = tcg_temp_new_i32(); 2986 2987 tcg_gen_andi_i32(t, b, 31); 2988 tcg_gen_sar_i32(d, a, t); 2989 tcg_temp_free_i32(t); 2990 } 2991 2992 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2993 { 2994 TCGv_i64 t = tcg_temp_new_i64(); 2995 2996 tcg_gen_andi_i64(t, b, 63); 2997 tcg_gen_sar_i64(d, a, t); 2998 tcg_temp_free_i64(t); 2999 } 3000 3001 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3002 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3003 { 3004 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3005 static const GVecGen3 g[4] = { 3006 { .fniv = tcg_gen_sarv_mod_vec, 3007 .fno = gen_helper_gvec_sar8v, 3008 .opt_opc = vecop_list, 3009 .vece = MO_8 }, 3010 { .fniv = tcg_gen_sarv_mod_vec, 3011 .fno = gen_helper_gvec_sar16v, 3012 .opt_opc = vecop_list, 3013 .vece = MO_16 }, 3014 { .fni4 = tcg_gen_sar_mod_i32, 3015 .fniv = tcg_gen_sarv_mod_vec, 3016 .fno = gen_helper_gvec_sar32v, 3017 .opt_opc = vecop_list, 3018 .vece = MO_32 }, 3019 { .fni8 = tcg_gen_sar_mod_i64, 3020 .fniv = tcg_gen_sarv_mod_vec, 3021 .fno = gen_helper_gvec_sar64v, 3022 .opt_opc = vecop_list, 3023 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3024 .vece = MO_64 }, 3025 }; 3026 3027 tcg_debug_assert(vece <= MO_64); 3028 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3029 } 3030 3031 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ 3032 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3033 uint32_t oprsz, TCGCond cond) 3034 { 3035 TCGv_i32 t0 = tcg_temp_new_i32(); 3036 TCGv_i32 t1 = tcg_temp_new_i32(); 3037 uint32_t i; 3038 3039 for (i = 0; i < oprsz; i += 4) { 3040 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3041 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3042 tcg_gen_setcond_i32(cond, t0, t0, t1); 3043 tcg_gen_neg_i32(t0, t0); 3044 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3045 } 3046 tcg_temp_free_i32(t1); 3047 tcg_temp_free_i32(t0); 3048 } 3049 3050 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3051 uint32_t oprsz, TCGCond cond) 3052 { 3053 TCGv_i64 t0 = tcg_temp_new_i64(); 3054 TCGv_i64 t1 = tcg_temp_new_i64(); 3055 uint32_t i; 3056 3057 for (i = 0; i < oprsz; i += 8) { 3058 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3059 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3060 tcg_gen_setcond_i64(cond, t0, t0, t1); 3061 tcg_gen_neg_i64(t0, t0); 3062 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3063 } 3064 tcg_temp_free_i64(t1); 3065 tcg_temp_free_i64(t0); 3066 } 3067 3068 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3069 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3070 TCGType type, TCGCond cond) 3071 { 3072 TCGv_vec t0 = tcg_temp_new_vec(type); 3073 TCGv_vec t1 = tcg_temp_new_vec(type); 3074 uint32_t i; 3075 3076 for (i = 0; i < oprsz; i += tysz) { 3077 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3078 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3079 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3080 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3081 } 3082 tcg_temp_free_vec(t1); 3083 tcg_temp_free_vec(t0); 3084 } 3085 3086 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3087 uint32_t aofs, uint32_t bofs, 3088 uint32_t oprsz, uint32_t maxsz) 3089 { 3090 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3091 static gen_helper_gvec_3 * const eq_fn[4] = { 3092 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3093 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3094 }; 3095 static gen_helper_gvec_3 * const ne_fn[4] = { 3096 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3097 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3098 }; 3099 static gen_helper_gvec_3 * const lt_fn[4] = { 3100 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3101 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3102 }; 3103 static gen_helper_gvec_3 * const le_fn[4] = { 3104 gen_helper_gvec_le8, gen_helper_gvec_le16, 3105 gen_helper_gvec_le32, gen_helper_gvec_le64 3106 }; 3107 static gen_helper_gvec_3 * const ltu_fn[4] = { 3108 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3109 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3110 }; 3111 static gen_helper_gvec_3 * const leu_fn[4] = { 3112 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3113 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3114 }; 3115 static gen_helper_gvec_3 * const * const fns[16] = { 3116 [TCG_COND_EQ] = eq_fn, 3117 [TCG_COND_NE] = ne_fn, 3118 [TCG_COND_LT] = lt_fn, 3119 [TCG_COND_LE] = le_fn, 3120 [TCG_COND_LTU] = ltu_fn, 3121 [TCG_COND_LEU] = leu_fn, 3122 }; 3123 3124 const TCGOpcode *hold_list; 3125 TCGType type; 3126 uint32_t some; 3127 3128 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3129 check_overlap_3(dofs, aofs, bofs, maxsz); 3130 3131 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3132 do_dup(MO_8, dofs, oprsz, maxsz, 3133 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3134 return; 3135 } 3136 3137 /* 3138 * Implement inline with a vector type, if possible. 3139 * Prefer integer when 64-bit host and 64-bit comparison. 3140 */ 3141 hold_list = tcg_swap_vecop_list(cmp_list); 3142 type = choose_vector_type(cmp_list, vece, oprsz, 3143 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3144 switch (type) { 3145 case TCG_TYPE_V256: 3146 /* Recall that ARM SVE allows vector sizes that are not a 3147 * power of 2, but always a multiple of 16. The intent is 3148 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3149 */ 3150 some = QEMU_ALIGN_DOWN(oprsz, 32); 3151 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3152 if (some == oprsz) { 3153 break; 3154 } 3155 dofs += some; 3156 aofs += some; 3157 bofs += some; 3158 oprsz -= some; 3159 maxsz -= some; 3160 /* fallthru */ 3161 case TCG_TYPE_V128: 3162 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3163 break; 3164 case TCG_TYPE_V64: 3165 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3166 break; 3167 3168 case 0: 3169 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3170 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3171 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3172 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3173 } else { 3174 gen_helper_gvec_3 * const *fn = fns[cond]; 3175 3176 if (fn == NULL) { 3177 uint32_t tmp; 3178 tmp = aofs, aofs = bofs, bofs = tmp; 3179 cond = tcg_swap_cond(cond); 3180 fn = fns[cond]; 3181 assert(fn != NULL); 3182 } 3183 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3184 oprsz = maxsz; 3185 } 3186 break; 3187 3188 default: 3189 g_assert_not_reached(); 3190 } 3191 tcg_swap_vecop_list(hold_list); 3192 3193 if (oprsz < maxsz) { 3194 expand_clr(dofs + oprsz, maxsz - oprsz); 3195 } 3196 } 3197 3198 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3199 { 3200 TCGv_i64 t = tcg_temp_new_i64(); 3201 3202 tcg_gen_and_i64(t, b, a); 3203 tcg_gen_andc_i64(d, c, a); 3204 tcg_gen_or_i64(d, d, t); 3205 tcg_temp_free_i64(t); 3206 } 3207 3208 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3209 uint32_t bofs, uint32_t cofs, 3210 uint32_t oprsz, uint32_t maxsz) 3211 { 3212 static const GVecGen4 g = { 3213 .fni8 = tcg_gen_bitsel_i64, 3214 .fniv = tcg_gen_bitsel_vec, 3215 .fno = gen_helper_gvec_bitsel, 3216 }; 3217 3218 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3219 } 3220