/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
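/*
 * Worked example (illustrative only): simd_desc(16, 16, 0) stores
 * (16 / 8) - 1 = 1 in both size fields, so a helper receiving this
 * descriptor recovers the original 16 bytes via simd_oprsz(desc) and
 * simd_maxsz(desc), while simd_data(desc) returns 0.
 */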
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
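/*
 * Usage sketch (illustrative; "CPUFooState", "vreg" and "gen_helper_foo"
 * are hypothetical names, not part of this file): a target translator
 * would typically expand a custom three-operand vector instruction out
 * of line as
 *
 *     tcg_gen_gvec_3_ool(offsetof(CPUFooState, vreg[rd]),
 *                        offsetof(CPUFooState, vreg[rn]),
 *                        offsetof(CPUFooState, vreg[rm]),
 *                        16, 16, 0, gen_helper_foo);
 *
 * The helper then receives three pointers into env plus the descriptor
 * built by simd_desc() above.
 */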
/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
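         * (Worked example, illustrative only: with oprsz == 80 the V256
         * loop below stores 2 x 32 bytes and then falls through, so the
         * V128 loop stores the remaining 16 bytes.)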
409 */ 410 for (; i + 32 <= oprsz; i += 32) { 411 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 412 } 413 /* fallthru */ 414 case TCG_TYPE_V128: 415 for (; i + 16 <= oprsz; i += 16) { 416 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 417 } 418 break; 419 case TCG_TYPE_V64: 420 for (; i < oprsz; i += 8) { 421 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 422 } 423 break; 424 default: 425 g_assert_not_reached(); 426 } 427 428 if (oprsz < maxsz) { 429 expand_clr(dofs + oprsz, maxsz - oprsz); 430 } 431 } 432 433 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 434 * Only one of IN_32 or IN_64 may be set; 435 * IN_C is used if IN_32 and IN_64 are unset. 436 */ 437 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 438 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 439 uint64_t in_c) 440 { 441 TCGType type; 442 TCGv_i64 t_64; 443 TCGv_i32 t_32, t_desc; 444 TCGv_ptr t_ptr; 445 uint32_t i; 446 447 assert(vece <= (in_32 ? MO_32 : MO_64)); 448 assert(in_32 == NULL || in_64 == NULL); 449 450 /* If we're storing 0, expand oprsz to maxsz. */ 451 if (in_32 == NULL && in_64 == NULL) { 452 in_c = dup_const(vece, in_c); 453 if (in_c == 0) { 454 oprsz = maxsz; 455 } 456 } 457 458 /* Implement inline with a vector type, if possible. 459 * Prefer integer when 64-bit host and no variable dup. 460 */ 461 type = choose_vector_type(NULL, vece, oprsz, 462 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 463 && (in_64 == NULL || vece == MO_64))); 464 if (type != 0) { 465 TCGv_vec t_vec = tcg_temp_new_vec(type); 466 467 if (in_32) { 468 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 469 } else if (in_64) { 470 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 471 } else { 472 tcg_gen_dupi_vec(vece, t_vec, in_c); 473 } 474 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 475 tcg_temp_free_vec(t_vec); 476 return; 477 } 478 479 /* Otherwise, inline with an integer type, unless "large". */ 480 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 481 t_64 = NULL; 482 t_32 = NULL; 483 484 if (in_32) { 485 /* We are given a 32-bit variable input. For a 64-bit host, 486 use a 64-bit operation unless the 32-bit operation would 487 be simple enough. */ 488 if (TCG_TARGET_REG_BITS == 64 489 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 490 t_64 = tcg_temp_new_i64(); 491 tcg_gen_extu_i32_i64(t_64, in_32); 492 gen_dup_i64(vece, t_64, t_64); 493 } else { 494 t_32 = tcg_temp_new_i32(); 495 gen_dup_i32(vece, t_32, in_32); 496 } 497 } else if (in_64) { 498 /* We are given a 64-bit variable input. */ 499 t_64 = tcg_temp_new_i64(); 500 gen_dup_i64(vece, t_64, in_64); 501 } else { 502 /* We are given a constant input. */ 503 /* For 64-bit hosts, use 64-bit constants for "simple" constants 504 or when we'd need too many 32-bit stores, or when a 64-bit 505 constant is really required. */ 506 if (vece == MO_64 507 || (TCG_TARGET_REG_BITS == 64 508 && (in_c == 0 || in_c == -1 509 || !check_size_impl(oprsz, 4)))) { 510 t_64 = tcg_const_i64(in_c); 511 } else { 512 t_32 = tcg_const_i32(in_c); 513 } 514 } 515 516 /* Implement inline if we picked an implementation size above. */ 517 if (t_32) { 518 for (i = 0; i < oprsz; i += 4) { 519 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 520 } 521 tcg_temp_free_i32(t_32); 522 goto done; 523 } 524 if (t_64) { 525 for (i = 0; i < oprsz; i += 8) { 526 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 527 } 528 tcg_temp_free_i64(t_64); 529 goto done; 530 } 531 } 532 533 /* Otherwise implement out of line. 
     */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}
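/*
 * For example (illustrative only): expand_3_i32 with oprsz == 16 unrolls
 * into four iterations, each emitting two 32-bit loads from env, one call
 * to FNI, and one 32-bit store back to env.
 */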
/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with three vectors and an immediate.  */
void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                     uint32_t oprsz, uint32_t maxsz, int64_t c,
                     const GVecGen3i *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    check_size_align(oprsz, maxsz, dofs);
    if (vece <= MO_64) {
        TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
        if (type != 0) {
            TCGv_vec t_vec = tcg_temp_new_vec(type);
            tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
            do_dup_store(type, dofs, oprsz, maxsz, t_vec);
            tcg_temp_free_vec(t_vec);
        } else if (vece <= MO_32) {
            TCGv_i32 in = tcg_temp_new_i32();
            switch (vece) {
            case MO_8:
                tcg_gen_ld8u_i32(in, cpu_env, aofs);
                break;
            case MO_16:
                tcg_gen_ld16u_i32(in, cpu_env, aofs);
                break;
            default:
                tcg_gen_ld_i32(in, cpu_env, aofs);
                break;
            }
            do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
            tcg_temp_free_i32(in);
        } else {
            TCGv_i64 in = tcg_temp_new_i64();
            tcg_gen_ld_i64(in, cpu_env, aofs);
            do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
            tcg_temp_free_i64(in);
        }
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.
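   For example (illustrative, MO_8): M is 0x8080808080808080.  Clearing
   the sign bit of every byte of A and B guarantees that no byte sum can
   carry into the neighbouring lane, and the discarded sign bits are
   restored afterwards by xoring in (A ^ B) & M.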
   */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

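/*
 * Usage example (illustrative only): tcg_gen_gvec_add(MO_32, dofs, aofs,
 * bofs, 16, 16) adds four 32-bit lanes; when maxsz > oprsz the trailing
 * bytes of the destination are cleared by the expansion above.
 */
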
static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1904 } 1905 1906 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1907 { 1908 TCGv_i32 max = tcg_const_i32(-1); 1909 tcg_gen_add_i32(d, a, b); 1910 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 1911 tcg_temp_free_i32(max); 1912 } 1913 1914 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1915 { 1916 TCGv_i64 max = tcg_const_i64(-1); 1917 tcg_gen_add_i64(d, a, b); 1918 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 1919 tcg_temp_free_i64(max); 1920 } 1921 1922 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1923 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1924 { 1925 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 1926 static const GVecGen3 g[4] = { 1927 { .fniv = tcg_gen_usadd_vec, 1928 .fno = gen_helper_gvec_usadd8, 1929 .opt_opc = vecop_list, 1930 .vece = MO_8 }, 1931 { .fniv = tcg_gen_usadd_vec, 1932 .fno = gen_helper_gvec_usadd16, 1933 .opt_opc = vecop_list, 1934 .vece = MO_16 }, 1935 { .fni4 = tcg_gen_usadd_i32, 1936 .fniv = tcg_gen_usadd_vec, 1937 .fno = gen_helper_gvec_usadd32, 1938 .opt_opc = vecop_list, 1939 .vece = MO_32 }, 1940 { .fni8 = tcg_gen_usadd_i64, 1941 .fniv = tcg_gen_usadd_vec, 1942 .fno = gen_helper_gvec_usadd64, 1943 .opt_opc = vecop_list, 1944 .vece = MO_64 } 1945 }; 1946 tcg_debug_assert(vece <= MO_64); 1947 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1948 } 1949 1950 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1951 { 1952 TCGv_i32 min = tcg_const_i32(0); 1953 tcg_gen_sub_i32(d, a, b); 1954 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 1955 tcg_temp_free_i32(min); 1956 } 1957 1958 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1959 { 1960 TCGv_i64 min = tcg_const_i64(0); 1961 tcg_gen_sub_i64(d, a, b); 1962 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 1963 tcg_temp_free_i64(min); 1964 } 1965 1966 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 1967 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1968 { 1969 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 1970 static const GVecGen3 g[4] = { 1971 { .fniv = tcg_gen_ussub_vec, 1972 .fno = gen_helper_gvec_ussub8, 1973 .opt_opc = vecop_list, 1974 .vece = MO_8 }, 1975 { .fniv = tcg_gen_ussub_vec, 1976 .fno = gen_helper_gvec_ussub16, 1977 .opt_opc = vecop_list, 1978 .vece = MO_16 }, 1979 { .fni4 = tcg_gen_ussub_i32, 1980 .fniv = tcg_gen_ussub_vec, 1981 .fno = gen_helper_gvec_ussub32, 1982 .opt_opc = vecop_list, 1983 .vece = MO_32 }, 1984 { .fni8 = tcg_gen_ussub_i64, 1985 .fniv = tcg_gen_ussub_vec, 1986 .fno = gen_helper_gvec_ussub64, 1987 .opt_opc = vecop_list, 1988 .vece = MO_64 } 1989 }; 1990 tcg_debug_assert(vece <= MO_64); 1991 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1992 } 1993 1994 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 1995 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1996 { 1997 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 1998 static const GVecGen3 g[4] = { 1999 { .fniv = tcg_gen_smin_vec, 2000 .fno = gen_helper_gvec_smin8, 2001 .opt_opc = vecop_list, 2002 .vece = MO_8 }, 2003 { .fniv = tcg_gen_smin_vec, 2004 .fno = gen_helper_gvec_smin16, 2005 .opt_opc = vecop_list, 2006 .vece = MO_16 }, 2007 { .fni4 = tcg_gen_smin_i32, 2008 .fniv = tcg_gen_smin_vec, 2009 .fno = gen_helper_gvec_smin32, 2010 .opt_opc = vecop_list, 2011 .vece = MO_32 }, 2012 { .fni8 = tcg_gen_smin_i64, 2013 .fniv = tcg_gen_smin_vec, 2014 
.fno = gen_helper_gvec_smin64, 2015 .opt_opc = vecop_list, 2016 .vece = MO_64 } 2017 }; 2018 tcg_debug_assert(vece <= MO_64); 2019 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2020 } 2021 2022 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2023 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2024 { 2025 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2026 static const GVecGen3 g[4] = { 2027 { .fniv = tcg_gen_umin_vec, 2028 .fno = gen_helper_gvec_umin8, 2029 .opt_opc = vecop_list, 2030 .vece = MO_8 }, 2031 { .fniv = tcg_gen_umin_vec, 2032 .fno = gen_helper_gvec_umin16, 2033 .opt_opc = vecop_list, 2034 .vece = MO_16 }, 2035 { .fni4 = tcg_gen_umin_i32, 2036 .fniv = tcg_gen_umin_vec, 2037 .fno = gen_helper_gvec_umin32, 2038 .opt_opc = vecop_list, 2039 .vece = MO_32 }, 2040 { .fni8 = tcg_gen_umin_i64, 2041 .fniv = tcg_gen_umin_vec, 2042 .fno = gen_helper_gvec_umin64, 2043 .opt_opc = vecop_list, 2044 .vece = MO_64 } 2045 }; 2046 tcg_debug_assert(vece <= MO_64); 2047 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2048 } 2049 2050 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2051 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2052 { 2053 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2054 static const GVecGen3 g[4] = { 2055 { .fniv = tcg_gen_smax_vec, 2056 .fno = gen_helper_gvec_smax8, 2057 .opt_opc = vecop_list, 2058 .vece = MO_8 }, 2059 { .fniv = tcg_gen_smax_vec, 2060 .fno = gen_helper_gvec_smax16, 2061 .opt_opc = vecop_list, 2062 .vece = MO_16 }, 2063 { .fni4 = tcg_gen_smax_i32, 2064 .fniv = tcg_gen_smax_vec, 2065 .fno = gen_helper_gvec_smax32, 2066 .opt_opc = vecop_list, 2067 .vece = MO_32 }, 2068 { .fni8 = tcg_gen_smax_i64, 2069 .fniv = tcg_gen_smax_vec, 2070 .fno = gen_helper_gvec_smax64, 2071 .opt_opc = vecop_list, 2072 .vece = MO_64 } 2073 }; 2074 tcg_debug_assert(vece <= MO_64); 2075 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2076 } 2077 2078 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2079 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2080 { 2081 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2082 static const GVecGen3 g[4] = { 2083 { .fniv = tcg_gen_umax_vec, 2084 .fno = gen_helper_gvec_umax8, 2085 .opt_opc = vecop_list, 2086 .vece = MO_8 }, 2087 { .fniv = tcg_gen_umax_vec, 2088 .fno = gen_helper_gvec_umax16, 2089 .opt_opc = vecop_list, 2090 .vece = MO_16 }, 2091 { .fni4 = tcg_gen_umax_i32, 2092 .fniv = tcg_gen_umax_vec, 2093 .fno = gen_helper_gvec_umax32, 2094 .opt_opc = vecop_list, 2095 .vece = MO_32 }, 2096 { .fni8 = tcg_gen_umax_i64, 2097 .fniv = tcg_gen_umax_vec, 2098 .fno = gen_helper_gvec_umax64, 2099 .opt_opc = vecop_list, 2100 .vece = MO_64 } 2101 }; 2102 tcg_debug_assert(vece <= MO_64); 2103 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2104 } 2105 2106 /* Perform a vector negation using normal negation and a mask. 2107 Compare gen_subv_mask above. 
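   As in gen_subv_mask, the mask M contains only the msb of each element.
   The subtraction is arranged so that this bit is set in the minuend and
   clear in the subtrahend, which keeps every borrow confined to its own
   element; the final xor then restores the correct msb of each result
   element.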
 */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
{
    TCGv_i64 t = tcg_temp_new_i64();
    int nbit = 8 << vece;

    /* Create -1 for each negative element. */
    tcg_gen_shri_i64(t, b, nbit - 1);
    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
    tcg_gen_muli_i64(t, t, (1 << nbit) - 1);

    /*
     * Invert each negative element (via xor -1), then add one to it.
     * The one is added as a separate per-element constant: a plain
     * 64-bit subtraction of the -1 mask would let borrows cross
     * element boundaries, whereas after the xor the msb of every
     * negative element is clear, so this addition can never carry
     * into the next element.
     */
    tcg_gen_xor_i64(d, b, t);
    tcg_gen_andi_i64(t, t, dup_const(vece, 1));
    tcg_gen_add_i64(d, d, t);

    tcg_temp_free_i64(t);
}

static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_8);
}

static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
{
    gen_absv_mask(d, b, MO_16);
}

void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_abs8_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_abs16_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = tcg_gen_abs_i32,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = tcg_gen_abs_i64,
          .fniv = tcg_gen_abs_vec,
          .fno = gen_helper_gvec_abs64,
          .opt_opc = vecop_list,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };

    if (aofs == bofs) {
        tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
    } else {
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .prefer_i64 = TCG_TARGET_REG_BITS ==
64, 2321 }; 2322 2323 if (aofs == bofs) { 2324 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2325 } else { 2326 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2327 } 2328 } 2329 2330 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2331 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2332 { 2333 static const GVecGen3 g = { 2334 .fni8 = tcg_gen_nand_i64, 2335 .fniv = tcg_gen_nand_vec, 2336 .fno = gen_helper_gvec_nand, 2337 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2338 }; 2339 2340 if (aofs == bofs) { 2341 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2342 } else { 2343 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2344 } 2345 } 2346 2347 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2348 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2349 { 2350 static const GVecGen3 g = { 2351 .fni8 = tcg_gen_nor_i64, 2352 .fniv = tcg_gen_nor_vec, 2353 .fno = gen_helper_gvec_nor, 2354 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2355 }; 2356 2357 if (aofs == bofs) { 2358 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2359 } else { 2360 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2361 } 2362 } 2363 2364 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2365 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2366 { 2367 static const GVecGen3 g = { 2368 .fni8 = tcg_gen_eqv_i64, 2369 .fniv = tcg_gen_eqv_vec, 2370 .fno = gen_helper_gvec_eqv, 2371 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2372 }; 2373 2374 if (aofs == bofs) { 2375 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2376 } else { 2377 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2378 } 2379 } 2380 2381 static const GVecGen2s gop_ands = { 2382 .fni8 = tcg_gen_and_i64, 2383 .fniv = tcg_gen_and_vec, 2384 .fno = gen_helper_gvec_ands, 2385 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2386 .vece = MO_64 2387 }; 2388 2389 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2390 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2391 { 2392 TCGv_i64 tmp = tcg_temp_new_i64(); 2393 gen_dup_i64(vece, tmp, c); 2394 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2395 tcg_temp_free_i64(tmp); 2396 } 2397 2398 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2399 int64_t c, uint32_t oprsz, uint32_t maxsz) 2400 { 2401 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2402 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2403 tcg_temp_free_i64(tmp); 2404 } 2405 2406 static const GVecGen2s gop_xors = { 2407 .fni8 = tcg_gen_xor_i64, 2408 .fniv = tcg_gen_xor_vec, 2409 .fno = gen_helper_gvec_xors, 2410 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2411 .vece = MO_64 2412 }; 2413 2414 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2415 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2416 { 2417 TCGv_i64 tmp = tcg_temp_new_i64(); 2418 gen_dup_i64(vece, tmp, c); 2419 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2420 tcg_temp_free_i64(tmp); 2421 } 2422 2423 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2424 int64_t c, uint32_t oprsz, uint32_t maxsz) 2425 { 2426 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2427 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2428 tcg_temp_free_i64(tmp); 2429 } 2430 2431 static const GVecGen2s gop_ors = { 2432 .fni8 = tcg_gen_or_i64, 2433 .fniv = tcg_gen_or_vec, 2434 .fno = gen_helper_gvec_ors, 2435 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2436 .vece = MO_64 2437 }; 2438 2439 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2440 TCGv_i64 c, 
uint32_t oprsz, uint32_t maxsz) 2441 { 2442 TCGv_i64 tmp = tcg_temp_new_i64(); 2443 gen_dup_i64(vece, tmp, c); 2444 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2445 tcg_temp_free_i64(tmp); 2446 } 2447 2448 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2449 int64_t c, uint32_t oprsz, uint32_t maxsz) 2450 { 2451 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2452 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2453 tcg_temp_free_i64(tmp); 2454 } 2455 2456 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2457 { 2458 uint64_t mask = dup_const(MO_8, 0xff << c); 2459 tcg_gen_shli_i64(d, a, c); 2460 tcg_gen_andi_i64(d, d, mask); 2461 } 2462 2463 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2464 { 2465 uint64_t mask = dup_const(MO_16, 0xffff << c); 2466 tcg_gen_shli_i64(d, a, c); 2467 tcg_gen_andi_i64(d, d, mask); 2468 } 2469 2470 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2471 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2472 { 2473 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2474 static const GVecGen2i g[4] = { 2475 { .fni8 = tcg_gen_vec_shl8i_i64, 2476 .fniv = tcg_gen_shli_vec, 2477 .fno = gen_helper_gvec_shl8i, 2478 .opt_opc = vecop_list, 2479 .vece = MO_8 }, 2480 { .fni8 = tcg_gen_vec_shl16i_i64, 2481 .fniv = tcg_gen_shli_vec, 2482 .fno = gen_helper_gvec_shl16i, 2483 .opt_opc = vecop_list, 2484 .vece = MO_16 }, 2485 { .fni4 = tcg_gen_shli_i32, 2486 .fniv = tcg_gen_shli_vec, 2487 .fno = gen_helper_gvec_shl32i, 2488 .opt_opc = vecop_list, 2489 .vece = MO_32 }, 2490 { .fni8 = tcg_gen_shli_i64, 2491 .fniv = tcg_gen_shli_vec, 2492 .fno = gen_helper_gvec_shl64i, 2493 .opt_opc = vecop_list, 2494 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2495 .vece = MO_64 }, 2496 }; 2497 2498 tcg_debug_assert(vece <= MO_64); 2499 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2500 if (shift == 0) { 2501 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2502 } else { 2503 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2504 } 2505 } 2506 2507 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2508 { 2509 uint64_t mask = dup_const(MO_8, 0xff >> c); 2510 tcg_gen_shri_i64(d, a, c); 2511 tcg_gen_andi_i64(d, d, mask); 2512 } 2513 2514 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2515 { 2516 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2517 tcg_gen_shri_i64(d, a, c); 2518 tcg_gen_andi_i64(d, d, mask); 2519 } 2520 2521 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2522 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2523 { 2524 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2525 static const GVecGen2i g[4] = { 2526 { .fni8 = tcg_gen_vec_shr8i_i64, 2527 .fniv = tcg_gen_shri_vec, 2528 .fno = gen_helper_gvec_shr8i, 2529 .opt_opc = vecop_list, 2530 .vece = MO_8 }, 2531 { .fni8 = tcg_gen_vec_shr16i_i64, 2532 .fniv = tcg_gen_shri_vec, 2533 .fno = gen_helper_gvec_shr16i, 2534 .opt_opc = vecop_list, 2535 .vece = MO_16 }, 2536 { .fni4 = tcg_gen_shri_i32, 2537 .fniv = tcg_gen_shri_vec, 2538 .fno = gen_helper_gvec_shr32i, 2539 .opt_opc = vecop_list, 2540 .vece = MO_32 }, 2541 { .fni8 = tcg_gen_shri_i64, 2542 .fniv = tcg_gen_shri_vec, 2543 .fno = gen_helper_gvec_shr64i, 2544 .opt_opc = vecop_list, 2545 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2546 .vece = MO_64 }, 2547 }; 2548 2549 tcg_debug_assert(vece <= MO_64); 2550 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2551 if (shift == 0) { 2552 tcg_gen_gvec_mov(vece, dofs, 
aofs, oprsz, maxsz); 2553 } else { 2554 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2555 } 2556 } 2557 2558 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2559 { 2560 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2561 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2562 TCGv_i64 s = tcg_temp_new_i64(); 2563 2564 tcg_gen_shri_i64(d, a, c); 2565 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2566 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2567 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2568 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2569 tcg_temp_free_i64(s); 2570 } 2571 2572 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2573 { 2574 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2575 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2576 TCGv_i64 s = tcg_temp_new_i64(); 2577 2578 tcg_gen_shri_i64(d, a, c); 2579 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2580 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2581 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2582 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2583 tcg_temp_free_i64(s); 2584 } 2585 2586 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2587 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2588 { 2589 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2590 static const GVecGen2i g[4] = { 2591 { .fni8 = tcg_gen_vec_sar8i_i64, 2592 .fniv = tcg_gen_sari_vec, 2593 .fno = gen_helper_gvec_sar8i, 2594 .opt_opc = vecop_list, 2595 .vece = MO_8 }, 2596 { .fni8 = tcg_gen_vec_sar16i_i64, 2597 .fniv = tcg_gen_sari_vec, 2598 .fno = gen_helper_gvec_sar16i, 2599 .opt_opc = vecop_list, 2600 .vece = MO_16 }, 2601 { .fni4 = tcg_gen_sari_i32, 2602 .fniv = tcg_gen_sari_vec, 2603 .fno = gen_helper_gvec_sar32i, 2604 .opt_opc = vecop_list, 2605 .vece = MO_32 }, 2606 { .fni8 = tcg_gen_sari_i64, 2607 .fniv = tcg_gen_sari_vec, 2608 .fno = gen_helper_gvec_sar64i, 2609 .opt_opc = vecop_list, 2610 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2611 .vece = MO_64 }, 2612 }; 2613 2614 tcg_debug_assert(vece <= MO_64); 2615 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2616 if (shift == 0) { 2617 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2618 } else { 2619 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2620 } 2621 } 2622 2623 /* 2624 * Specialized generation vector shifts by a non-constant scalar. 
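 *
 * The expansion below tries, in order: the backend's shift-by-scalar
 * vector ops (s_list), the shift-by-vector ops (v_list) with the scalar
 * broadcast into a vector, a plain i32/i64 expansion when the element
 * size allows it, and finally an out-of-line helper that receives the
 * runtime shift count in the descriptor's data field.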
2625 */ 2626 2627 typedef struct { 2628 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2629 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2630 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2631 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2632 gen_helper_gvec_2 *fno[4]; 2633 TCGOpcode s_list[2]; 2634 TCGOpcode v_list[2]; 2635 } GVecGen2sh; 2636 2637 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2638 uint32_t oprsz, uint32_t tysz, TCGType type, 2639 TCGv_i32 shift, 2640 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2641 { 2642 TCGv_vec t0 = tcg_temp_new_vec(type); 2643 uint32_t i; 2644 2645 for (i = 0; i < oprsz; i += tysz) { 2646 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2647 fni(vece, t0, t0, shift); 2648 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2649 } 2650 tcg_temp_free_vec(t0); 2651 } 2652 2653 static void 2654 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2655 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2656 { 2657 TCGType type; 2658 uint32_t some; 2659 2660 check_size_align(oprsz, maxsz, dofs | aofs); 2661 check_overlap_2(dofs, aofs, maxsz); 2662 2663 /* If the backend has a scalar expansion, great. */ 2664 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2665 if (type) { 2666 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2667 switch (type) { 2668 case TCG_TYPE_V256: 2669 some = QEMU_ALIGN_DOWN(oprsz, 32); 2670 expand_2sh_vec(vece, dofs, aofs, some, 32, 2671 TCG_TYPE_V256, shift, g->fniv_s); 2672 if (some == oprsz) { 2673 break; 2674 } 2675 dofs += some; 2676 aofs += some; 2677 oprsz -= some; 2678 maxsz -= some; 2679 /* fallthru */ 2680 case TCG_TYPE_V128: 2681 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2682 TCG_TYPE_V128, shift, g->fniv_s); 2683 break; 2684 case TCG_TYPE_V64: 2685 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2686 TCG_TYPE_V64, shift, g->fniv_s); 2687 break; 2688 default: 2689 g_assert_not_reached(); 2690 } 2691 tcg_swap_vecop_list(hold_list); 2692 goto clear_tail; 2693 } 2694 2695 /* If the backend supports variable vector shifts, also cool. */ 2696 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2697 if (type) { 2698 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2699 TCGv_vec v_shift = tcg_temp_new_vec(type); 2700 2701 if (vece == MO_64) { 2702 TCGv_i64 sh64 = tcg_temp_new_i64(); 2703 tcg_gen_extu_i32_i64(sh64, shift); 2704 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2705 tcg_temp_free_i64(sh64); 2706 } else { 2707 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2708 } 2709 2710 switch (type) { 2711 case TCG_TYPE_V256: 2712 some = QEMU_ALIGN_DOWN(oprsz, 32); 2713 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2714 v_shift, false, g->fniv_v); 2715 if (some == oprsz) { 2716 break; 2717 } 2718 dofs += some; 2719 aofs += some; 2720 oprsz -= some; 2721 maxsz -= some; 2722 /* fallthru */ 2723 case TCG_TYPE_V128: 2724 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2725 v_shift, false, g->fniv_v); 2726 break; 2727 case TCG_TYPE_V64: 2728 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2729 v_shift, false, g->fniv_v); 2730 break; 2731 default: 2732 g_assert_not_reached(); 2733 } 2734 tcg_temp_free_vec(v_shift); 2735 tcg_swap_vecop_list(hold_list); 2736 goto clear_tail; 2737 } 2738 2739 /* Otherwise fall back to integral... 
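       expansion when the element is 32 or 64 bits wide, else to the
       out-of-line helper, with the runtime shift count folded into the
       descriptor's data field.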
 */
    if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        TCGv_i64 sh64 = tcg_temp_new_i64();
        tcg_gen_extu_i32_i64(sh64, shift);
        expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
        tcg_temp_free_i64(sh64);
    } else {
        TCGv_ptr a0 = tcg_temp_new_ptr();
        TCGv_ptr a1 = tcg_temp_new_ptr();
        TCGv_i32 desc = tcg_temp_new_i32();

        tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
        tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
        tcg_gen_addi_ptr(a0, cpu_env, dofs);
        tcg_gen_addi_ptr(a1, cpu_env, aofs);

        g->fno[vece](a0, a1, desc);

        tcg_temp_free_ptr(a0);
        tcg_temp_free_ptr(a1);
        tcg_temp_free_i32(desc);
        return;
    }

 clear_tail:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shl_i32,
        .fni8 = tcg_gen_shl_i64,
        .fniv_s = tcg_gen_shls_vec,
        .fniv_v = tcg_gen_shlv_vec,
        .fno = {
            gen_helper_gvec_shl8i,
            gen_helper_gvec_shl16i,
            gen_helper_gvec_shl32i,
            gen_helper_gvec_shl64i,
        },
        .s_list = { INDEX_op_shls_vec, 0 },
        .v_list = { INDEX_op_shlv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_shr_i32,
        .fni8 = tcg_gen_shr_i64,
        .fniv_s = tcg_gen_shrs_vec,
        .fniv_v = tcg_gen_shrv_vec,
        .fno = {
            gen_helper_gvec_shr8i,
            gen_helper_gvec_shr16i,
            gen_helper_gvec_shr32i,
            gen_helper_gvec_shr64i,
        },
        .s_list = { INDEX_op_shrs_vec, 0 },
        .v_list = { INDEX_op_shrv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2sh g = {
        .fni4 = tcg_gen_sar_i32,
        .fni8 = tcg_gen_sar_i64,
        .fniv_s = tcg_gen_sars_vec,
        .fniv_v = tcg_gen_sarv_vec,
        .fno = {
            gen_helper_gvec_sar8i,
            gen_helper_gvec_sar16i,
            gen_helper_gvec_sar32i,
            gen_helper_gvec_sar64i,
        },
        .s_list = { INDEX_op_sars_vec, 0 },
        .v_list = { INDEX_op_sarv_vec, 0 },
    };

    tcg_debug_assert(vece <= MO_64);
    do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
}

/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, that is not
 * convenient for vector shifts, so the modulo is applied here.
 * If the target naturally includes the modulo as part of the
 * operation, great!  If the target has some other behaviour for
 * out-of-range shifts, then it could not use this function anyway,
 * and would need to do its own expansion with custom functions.
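 *
 * For example, with MO_8 elements a per-element count of 9 in B shifts
 * by 9 % 8 = 1; the expansions below achieve this by masking the count
 * with (element bits - 1) before shifting.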
2846 */ 2847 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d, 2848 TCGv_vec a, TCGv_vec b) 2849 { 2850 TCGv_vec t = tcg_temp_new_vec_matching(d); 2851 2852 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2853 tcg_gen_and_vec(vece, t, t, b); 2854 tcg_gen_shlv_vec(vece, d, a, t); 2855 tcg_temp_free_vec(t); 2856 } 2857 2858 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2859 { 2860 TCGv_i32 t = tcg_temp_new_i32(); 2861 2862 tcg_gen_andi_i32(t, b, 31); 2863 tcg_gen_shl_i32(d, a, t); 2864 tcg_temp_free_i32(t); 2865 } 2866 2867 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2868 { 2869 TCGv_i64 t = tcg_temp_new_i64(); 2870 2871 tcg_gen_andi_i64(t, b, 63); 2872 tcg_gen_shl_i64(d, a, t); 2873 tcg_temp_free_i64(t); 2874 } 2875 2876 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs, 2877 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2878 { 2879 static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 }; 2880 static const GVecGen3 g[4] = { 2881 { .fniv = tcg_gen_shlv_mod_vec, 2882 .fno = gen_helper_gvec_shl8v, 2883 .opt_opc = vecop_list, 2884 .vece = MO_8 }, 2885 { .fniv = tcg_gen_shlv_mod_vec, 2886 .fno = gen_helper_gvec_shl16v, 2887 .opt_opc = vecop_list, 2888 .vece = MO_16 }, 2889 { .fni4 = tcg_gen_shl_mod_i32, 2890 .fniv = tcg_gen_shlv_mod_vec, 2891 .fno = gen_helper_gvec_shl32v, 2892 .opt_opc = vecop_list, 2893 .vece = MO_32 }, 2894 { .fni8 = tcg_gen_shl_mod_i64, 2895 .fniv = tcg_gen_shlv_mod_vec, 2896 .fno = gen_helper_gvec_shl64v, 2897 .opt_opc = vecop_list, 2898 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2899 .vece = MO_64 }, 2900 }; 2901 2902 tcg_debug_assert(vece <= MO_64); 2903 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2904 } 2905 2906 /* 2907 * Similarly for logical right shifts. 
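 * That is, D = A >> (B % element bits), zero-filling the vacated bits.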
2908 */ 2909 2910 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 2911 TCGv_vec a, TCGv_vec b) 2912 { 2913 TCGv_vec t = tcg_temp_new_vec_matching(d); 2914 2915 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2916 tcg_gen_and_vec(vece, t, t, b); 2917 tcg_gen_shrv_vec(vece, d, a, t); 2918 tcg_temp_free_vec(t); 2919 } 2920 2921 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2922 { 2923 TCGv_i32 t = tcg_temp_new_i32(); 2924 2925 tcg_gen_andi_i32(t, b, 31); 2926 tcg_gen_shr_i32(d, a, t); 2927 tcg_temp_free_i32(t); 2928 } 2929 2930 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2931 { 2932 TCGv_i64 t = tcg_temp_new_i64(); 2933 2934 tcg_gen_andi_i64(t, b, 63); 2935 tcg_gen_shr_i64(d, a, t); 2936 tcg_temp_free_i64(t); 2937 } 2938 2939 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 2940 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2941 { 2942 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 2943 static const GVecGen3 g[4] = { 2944 { .fniv = tcg_gen_shrv_mod_vec, 2945 .fno = gen_helper_gvec_shr8v, 2946 .opt_opc = vecop_list, 2947 .vece = MO_8 }, 2948 { .fniv = tcg_gen_shrv_mod_vec, 2949 .fno = gen_helper_gvec_shr16v, 2950 .opt_opc = vecop_list, 2951 .vece = MO_16 }, 2952 { .fni4 = tcg_gen_shr_mod_i32, 2953 .fniv = tcg_gen_shrv_mod_vec, 2954 .fno = gen_helper_gvec_shr32v, 2955 .opt_opc = vecop_list, 2956 .vece = MO_32 }, 2957 { .fni8 = tcg_gen_shr_mod_i64, 2958 .fniv = tcg_gen_shrv_mod_vec, 2959 .fno = gen_helper_gvec_shr64v, 2960 .opt_opc = vecop_list, 2961 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2962 .vece = MO_64 }, 2963 }; 2964 2965 tcg_debug_assert(vece <= MO_64); 2966 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2967 } 2968 2969 /* 2970 * Similarly for arithmetic right shifts. 
2971 */ 2972 2973 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 2974 TCGv_vec a, TCGv_vec b) 2975 { 2976 TCGv_vec t = tcg_temp_new_vec_matching(d); 2977 2978 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2979 tcg_gen_and_vec(vece, t, t, b); 2980 tcg_gen_sarv_vec(vece, d, a, t); 2981 tcg_temp_free_vec(t); 2982 } 2983 2984 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2985 { 2986 TCGv_i32 t = tcg_temp_new_i32(); 2987 2988 tcg_gen_andi_i32(t, b, 31); 2989 tcg_gen_sar_i32(d, a, t); 2990 tcg_temp_free_i32(t); 2991 } 2992 2993 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2994 { 2995 TCGv_i64 t = tcg_temp_new_i64(); 2996 2997 tcg_gen_andi_i64(t, b, 63); 2998 tcg_gen_sar_i64(d, a, t); 2999 tcg_temp_free_i64(t); 3000 } 3001 3002 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3003 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3004 { 3005 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3006 static const GVecGen3 g[4] = { 3007 { .fniv = tcg_gen_sarv_mod_vec, 3008 .fno = gen_helper_gvec_sar8v, 3009 .opt_opc = vecop_list, 3010 .vece = MO_8 }, 3011 { .fniv = tcg_gen_sarv_mod_vec, 3012 .fno = gen_helper_gvec_sar16v, 3013 .opt_opc = vecop_list, 3014 .vece = MO_16 }, 3015 { .fni4 = tcg_gen_sar_mod_i32, 3016 .fniv = tcg_gen_sarv_mod_vec, 3017 .fno = gen_helper_gvec_sar32v, 3018 .opt_opc = vecop_list, 3019 .vece = MO_32 }, 3020 { .fni8 = tcg_gen_sar_mod_i64, 3021 .fniv = tcg_gen_sarv_mod_vec, 3022 .fno = gen_helper_gvec_sar64v, 3023 .opt_opc = vecop_list, 3024 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3025 .vece = MO_64 }, 3026 }; 3027 3028 tcg_debug_assert(vece <= MO_64); 3029 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3030 } 3031 3032 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
*/ 3033 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3034 uint32_t oprsz, TCGCond cond) 3035 { 3036 TCGv_i32 t0 = tcg_temp_new_i32(); 3037 TCGv_i32 t1 = tcg_temp_new_i32(); 3038 uint32_t i; 3039 3040 for (i = 0; i < oprsz; i += 4) { 3041 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3042 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3043 tcg_gen_setcond_i32(cond, t0, t0, t1); 3044 tcg_gen_neg_i32(t0, t0); 3045 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3046 } 3047 tcg_temp_free_i32(t1); 3048 tcg_temp_free_i32(t0); 3049 } 3050 3051 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3052 uint32_t oprsz, TCGCond cond) 3053 { 3054 TCGv_i64 t0 = tcg_temp_new_i64(); 3055 TCGv_i64 t1 = tcg_temp_new_i64(); 3056 uint32_t i; 3057 3058 for (i = 0; i < oprsz; i += 8) { 3059 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3060 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3061 tcg_gen_setcond_i64(cond, t0, t0, t1); 3062 tcg_gen_neg_i64(t0, t0); 3063 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3064 } 3065 tcg_temp_free_i64(t1); 3066 tcg_temp_free_i64(t0); 3067 } 3068 3069 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3070 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3071 TCGType type, TCGCond cond) 3072 { 3073 TCGv_vec t0 = tcg_temp_new_vec(type); 3074 TCGv_vec t1 = tcg_temp_new_vec(type); 3075 uint32_t i; 3076 3077 for (i = 0; i < oprsz; i += tysz) { 3078 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3079 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3080 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3081 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3082 } 3083 tcg_temp_free_vec(t1); 3084 tcg_temp_free_vec(t0); 3085 } 3086 3087 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3088 uint32_t aofs, uint32_t bofs, 3089 uint32_t oprsz, uint32_t maxsz) 3090 { 3091 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3092 static gen_helper_gvec_3 * const eq_fn[4] = { 3093 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3094 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3095 }; 3096 static gen_helper_gvec_3 * const ne_fn[4] = { 3097 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3098 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3099 }; 3100 static gen_helper_gvec_3 * const lt_fn[4] = { 3101 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3102 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3103 }; 3104 static gen_helper_gvec_3 * const le_fn[4] = { 3105 gen_helper_gvec_le8, gen_helper_gvec_le16, 3106 gen_helper_gvec_le32, gen_helper_gvec_le64 3107 }; 3108 static gen_helper_gvec_3 * const ltu_fn[4] = { 3109 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3110 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3111 }; 3112 static gen_helper_gvec_3 * const leu_fn[4] = { 3113 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3114 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3115 }; 3116 static gen_helper_gvec_3 * const * const fns[16] = { 3117 [TCG_COND_EQ] = eq_fn, 3118 [TCG_COND_NE] = ne_fn, 3119 [TCG_COND_LT] = lt_fn, 3120 [TCG_COND_LE] = le_fn, 3121 [TCG_COND_LTU] = ltu_fn, 3122 [TCG_COND_LEU] = leu_fn, 3123 }; 3124 3125 const TCGOpcode *hold_list; 3126 TCGType type; 3127 uint32_t some; 3128 3129 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3130 check_overlap_3(dofs, aofs, bofs, maxsz); 3131 3132 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3133 do_dup(MO_8, dofs, oprsz, maxsz, 3134 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3135 return; 3136 } 3137 3138 /* 3139 * Implement inline with a vector type, if possible. 3140 * Prefer integer when 64-bit host and 64-bit comparison. 
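     * (When the integer path is taken, expand_cmp_i64 below emits one
     * setcond/neg pair per element.)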
3141 */ 3142 hold_list = tcg_swap_vecop_list(cmp_list); 3143 type = choose_vector_type(cmp_list, vece, oprsz, 3144 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3145 switch (type) { 3146 case TCG_TYPE_V256: 3147 /* Recall that ARM SVE allows vector sizes that are not a 3148 * power of 2, but always a multiple of 16. The intent is 3149 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3150 */ 3151 some = QEMU_ALIGN_DOWN(oprsz, 32); 3152 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3153 if (some == oprsz) { 3154 break; 3155 } 3156 dofs += some; 3157 aofs += some; 3158 bofs += some; 3159 oprsz -= some; 3160 maxsz -= some; 3161 /* fallthru */ 3162 case TCG_TYPE_V128: 3163 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3164 break; 3165 case TCG_TYPE_V64: 3166 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3167 break; 3168 3169 case 0: 3170 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3171 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3172 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3173 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3174 } else { 3175 gen_helper_gvec_3 * const *fn = fns[cond]; 3176 3177 if (fn == NULL) { 3178 uint32_t tmp; 3179 tmp = aofs, aofs = bofs, bofs = tmp; 3180 cond = tcg_swap_cond(cond); 3181 fn = fns[cond]; 3182 assert(fn != NULL); 3183 } 3184 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3185 oprsz = maxsz; 3186 } 3187 break; 3188 3189 default: 3190 g_assert_not_reached(); 3191 } 3192 tcg_swap_vecop_list(hold_list); 3193 3194 if (oprsz < maxsz) { 3195 expand_clr(dofs + oprsz, maxsz - oprsz); 3196 } 3197 } 3198 3199 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c) 3200 { 3201 TCGv_i64 t = tcg_temp_new_i64(); 3202 3203 tcg_gen_and_i64(t, b, a); 3204 tcg_gen_andc_i64(d, c, a); 3205 tcg_gen_or_i64(d, d, t); 3206 tcg_temp_free_i64(t); 3207 } 3208 3209 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs, 3210 uint32_t bofs, uint32_t cofs, 3211 uint32_t oprsz, uint32_t maxsz) 3212 { 3213 static const GVecGen4 g = { 3214 .fni8 = tcg_gen_bitsel_i64, 3215 .fniv = tcg_gen_bitsel_vec, 3216 .fno = gen_helper_gvec_bitsel, 3217 }; 3218 3219 tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g); 3220 } 3221
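
/*
 * Illustrative example: a front end whose guest vector registers live
 * in its CPU state structure would use the expanders above roughly as
 * follows for 16-byte registers; vreg_offset() is a hypothetical
 * stand-in for the front end's own offsetof() helper.
 *
 *     uint32_t dofs = vreg_offset(rd);
 *     uint32_t aofs = vreg_offset(rn);
 *     uint32_t bofs = vreg_offset(rm);
 *
 *     tcg_gen_gvec_usadd(MO_8, dofs, aofs, bofs, 16, 16);
 *     tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32, dofs, aofs, bofs, 16, 16);
 */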