/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif


/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
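
/*
 * Worked example of the encoding above: simd_desc(16, 16, 0) stores
 * (16 / 8) - 1 = 1 in both the OPRSZ and MAXSZ fields and 0 in the DATA
 * field.  A helper recovers the byte counts via simd_oprsz(desc) and
 * simd_maxsz(desc) from tcg-gvec-desc.h, which compute (field + 1) * 8 = 16
 * again, and the signed immediate via simd_data(desc).
 */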

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}
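
/*
 * Typical front-end use of these out-of-line helpers (illustrative sketch
 * only; the offsets and helper name below are placeholders, not part of
 * this file):
 *
 *     tcg_gen_gvec_3_ool(dest_off, src1_off, src2_off, 16, 16, 0,
 *                        gen_helper_example_op);
 *
 * Each offset is the byte offset of a vector register within CPUArchState,
 * as also passed to the inline expanders below.
 */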

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}

static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, TCGv_vec t_vec)
{
    uint32_t i = 0;

    switch (type) {
    case TCG_TYPE_V256:
        /*
         * Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        for (; i + 32 <= oprsz; i += 32) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
        }
        /* fallthru */
    case TCG_TYPE_V128:
        for (; i + 16 <= oprsz; i += 16) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
        }
        break;
    case TCG_TYPE_V64:
        for (; i < oprsz; i += 8) {
            tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
        }
        break;
    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
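
/*
 * Worked example for do_dup_store: with type == TCG_TYPE_V256 and
 * oprsz == 80, the loops above emit two 32-byte stores (i == 0 and 32),
 * then fall through and emit one 16-byte store (i == 64), which is the
 * "2x32 + 1x16" split described in the comment.
 */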
409 */ 410 for (; i + 32 <= oprsz; i += 32) { 411 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 412 } 413 /* fallthru */ 414 case TCG_TYPE_V128: 415 for (; i + 16 <= oprsz; i += 16) { 416 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 417 } 418 break; 419 case TCG_TYPE_V64: 420 for (; i < oprsz; i += 8) { 421 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 422 } 423 break; 424 default: 425 g_assert_not_reached(); 426 } 427 428 if (oprsz < maxsz) { 429 expand_clr(dofs + oprsz, maxsz - oprsz); 430 } 431 } 432 433 /* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. 434 * Only one of IN_32 or IN_64 may be set; 435 * IN_C is used if IN_32 and IN_64 are unset. 436 */ 437 static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, 438 uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, 439 uint64_t in_c) 440 { 441 TCGType type; 442 TCGv_i64 t_64; 443 TCGv_i32 t_32, t_desc; 444 TCGv_ptr t_ptr; 445 uint32_t i; 446 447 assert(vece <= (in_32 ? MO_32 : MO_64)); 448 assert(in_32 == NULL || in_64 == NULL); 449 450 /* If we're storing 0, expand oprsz to maxsz. */ 451 if (in_32 == NULL && in_64 == NULL) { 452 in_c = dup_const(vece, in_c); 453 if (in_c == 0) { 454 oprsz = maxsz; 455 } 456 } 457 458 /* Implement inline with a vector type, if possible. 459 * Prefer integer when 64-bit host and no variable dup. 460 */ 461 type = choose_vector_type(NULL, vece, oprsz, 462 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 463 && (in_64 == NULL || vece == MO_64))); 464 if (type != 0) { 465 TCGv_vec t_vec = tcg_temp_new_vec(type); 466 467 if (in_32) { 468 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 469 } else if (in_64) { 470 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 471 } else { 472 tcg_gen_dupi_vec(vece, t_vec, in_c); 473 } 474 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 475 tcg_temp_free_vec(t_vec); 476 return; 477 } 478 479 /* Otherwise, inline with an integer type, unless "large". */ 480 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 481 t_64 = NULL; 482 t_32 = NULL; 483 484 if (in_32) { 485 /* We are given a 32-bit variable input. For a 64-bit host, 486 use a 64-bit operation unless the 32-bit operation would 487 be simple enough. */ 488 if (TCG_TARGET_REG_BITS == 64 489 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 490 t_64 = tcg_temp_new_i64(); 491 tcg_gen_extu_i32_i64(t_64, in_32); 492 gen_dup_i64(vece, t_64, t_64); 493 } else { 494 t_32 = tcg_temp_new_i32(); 495 gen_dup_i32(vece, t_32, in_32); 496 } 497 } else if (in_64) { 498 /* We are given a 64-bit variable input. */ 499 t_64 = tcg_temp_new_i64(); 500 gen_dup_i64(vece, t_64, in_64); 501 } else { 502 /* We are given a constant input. */ 503 /* For 64-bit hosts, use 64-bit constants for "simple" constants 504 or when we'd need too many 32-bit stores, or when a 64-bit 505 constant is really required. */ 506 if (vece == MO_64 507 || (TCG_TARGET_REG_BITS == 64 508 && (in_c == 0 || in_c == -1 509 || !check_size_impl(oprsz, 4)))) { 510 t_64 = tcg_const_i64(in_c); 511 } else { 512 t_32 = tcg_const_i32(in_c); 513 } 514 } 515 516 /* Implement inline if we picked an implementation size above. */ 517 if (t_32) { 518 for (i = 0; i < oprsz; i += 4) { 519 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 520 } 521 tcg_temp_free_i32(t_32); 522 goto done; 523 } 524 if (t_64) { 525 for (i = 0; i < oprsz; i += 8) { 526 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 527 } 528 tcg_temp_free_i64(t_64); 529 goto done; 530 } 531 } 532 533 /* Otherwise implement out of line. 

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}
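
/*
 * As an illustration of the unrolling performed by the expanders above:
 * for oprsz == 16, expand_2i_i32 emits four load/op/store groups, one per
 * 32-bit lane, so dofs + 0, 4, 8 and 12 each receive fni(load(aofs + i), c),
 * after optionally loading the previous destination value when load_dest
 * is set.
 */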

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t oprsz, int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1, c);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}
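
/*
 * In the host-vector expanders above and below, TYSZ is the byte width of
 * TYPE (8 for TCG_TYPE_V64, 16 for TCG_TYPE_V128, 32 for TCG_TYPE_V256),
 * so each loop iteration processes one host vector register's worth of the
 * guest vector.  These expand_* helpers are driven by the tcg_gen_gvec_*
 * entry points later in this file, which pick the widest usable host vector
 * type via choose_vector_type, fall back to the fni8/fni4 integer
 * expansions, and finally to the fno out-of-line helper; any tail up to
 * maxsz is then cleared.
 */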

/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/*
 * Expand OPSZ bytes worth of three-vector operands and an immediate operand
 * using host vectors.
 */
static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                          TCGType type, int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
                                      int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1, c);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t2);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1013 */ 1014 some = QEMU_ALIGN_DOWN(oprsz, 32); 1015 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); 1016 if (some == oprsz) { 1017 break; 1018 } 1019 dofs += some; 1020 aofs += some; 1021 oprsz -= some; 1022 maxsz -= some; 1023 /* fallthru */ 1024 case TCG_TYPE_V128: 1025 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); 1026 break; 1027 case TCG_TYPE_V64: 1028 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); 1029 break; 1030 1031 case 0: 1032 if (g->fni8 && check_size_impl(oprsz, 8)) { 1033 expand_2_i64(dofs, aofs, oprsz, g->fni8); 1034 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1035 expand_2_i32(dofs, aofs, oprsz, g->fni4); 1036 } else { 1037 assert(g->fno != NULL); 1038 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 1039 oprsz = maxsz; 1040 } 1041 break; 1042 1043 default: 1044 g_assert_not_reached(); 1045 } 1046 tcg_swap_vecop_list(hold_list); 1047 1048 if (oprsz < maxsz) { 1049 expand_clr(dofs + oprsz, maxsz - oprsz); 1050 } 1051 } 1052 1053 /* Expand a vector operation with two vectors and an immediate. */ 1054 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1055 uint32_t maxsz, int64_t c, const GVecGen2i *g) 1056 { 1057 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1058 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1059 TCGType type; 1060 uint32_t some; 1061 1062 check_size_align(oprsz, maxsz, dofs | aofs); 1063 check_overlap_2(dofs, aofs, maxsz); 1064 1065 type = 0; 1066 if (g->fniv) { 1067 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1068 } 1069 switch (type) { 1070 case TCG_TYPE_V256: 1071 /* Recall that ARM SVE allows vector sizes that are not a 1072 * power of 2, but always a multiple of 16. The intent is 1073 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1074 */ 1075 some = QEMU_ALIGN_DOWN(oprsz, 32); 1076 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1077 c, g->load_dest, g->fniv); 1078 if (some == oprsz) { 1079 break; 1080 } 1081 dofs += some; 1082 aofs += some; 1083 oprsz -= some; 1084 maxsz -= some; 1085 /* fallthru */ 1086 case TCG_TYPE_V128: 1087 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1088 c, g->load_dest, g->fniv); 1089 break; 1090 case TCG_TYPE_V64: 1091 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1092 c, g->load_dest, g->fniv); 1093 break; 1094 1095 case 0: 1096 if (g->fni8 && check_size_impl(oprsz, 8)) { 1097 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1098 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1099 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1100 } else { 1101 if (g->fno) { 1102 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1103 } else { 1104 TCGv_i64 tcg_c = tcg_const_i64(c); 1105 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1106 maxsz, c, g->fnoi); 1107 tcg_temp_free_i64(tcg_c); 1108 } 1109 oprsz = maxsz; 1110 } 1111 break; 1112 1113 default: 1114 g_assert_not_reached(); 1115 } 1116 tcg_swap_vecop_list(hold_list); 1117 1118 if (oprsz < maxsz) { 1119 expand_clr(dofs + oprsz, maxsz - oprsz); 1120 } 1121 } 1122 1123 /* Expand a vector operation with two vectors and a scalar. 

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
        const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
        tcg_swap_vecop_list(hold_list);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1221 */ 1222 some = QEMU_ALIGN_DOWN(oprsz, 32); 1223 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1224 g->load_dest, g->fniv); 1225 if (some == oprsz) { 1226 break; 1227 } 1228 dofs += some; 1229 aofs += some; 1230 bofs += some; 1231 oprsz -= some; 1232 maxsz -= some; 1233 /* fallthru */ 1234 case TCG_TYPE_V128: 1235 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1236 g->load_dest, g->fniv); 1237 break; 1238 case TCG_TYPE_V64: 1239 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1240 g->load_dest, g->fniv); 1241 break; 1242 1243 case 0: 1244 if (g->fni8 && check_size_impl(oprsz, 8)) { 1245 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1246 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1247 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1248 } else { 1249 assert(g->fno != NULL); 1250 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1251 maxsz, g->data, g->fno); 1252 oprsz = maxsz; 1253 } 1254 break; 1255 1256 default: 1257 g_assert_not_reached(); 1258 } 1259 tcg_swap_vecop_list(hold_list); 1260 1261 if (oprsz < maxsz) { 1262 expand_clr(dofs + oprsz, maxsz - oprsz); 1263 } 1264 } 1265 1266 /* Expand a vector operation with three vectors and an immediate. */ 1267 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1268 uint32_t oprsz, uint32_t maxsz, int64_t c, 1269 const GVecGen3i *g) 1270 { 1271 const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty; 1272 const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list); 1273 TCGType type; 1274 uint32_t some; 1275 1276 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1277 check_overlap_3(dofs, aofs, bofs, maxsz); 1278 1279 type = 0; 1280 if (g->fniv) { 1281 type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64); 1282 } 1283 switch (type) { 1284 case TCG_TYPE_V256: 1285 /* 1286 * Recall that ARM SVE allows vector sizes that are not a 1287 * power of 2, but always a multiple of 16. The intent is 1288 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1289 */ 1290 some = QEMU_ALIGN_DOWN(oprsz, 32); 1291 expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1292 c, g->load_dest, g->fniv); 1293 if (some == oprsz) { 1294 break; 1295 } 1296 dofs += some; 1297 aofs += some; 1298 bofs += some; 1299 oprsz -= some; 1300 maxsz -= some; 1301 /* fallthru */ 1302 case TCG_TYPE_V128: 1303 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1304 c, g->load_dest, g->fniv); 1305 break; 1306 case TCG_TYPE_V64: 1307 expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1308 c, g->load_dest, g->fniv); 1309 break; 1310 1311 case 0: 1312 if (g->fni8 && check_size_impl(oprsz, 8)) { 1313 expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8); 1314 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1315 expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4); 1316 } else { 1317 assert(g->fno != NULL); 1318 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno); 1319 oprsz = maxsz; 1320 } 1321 break; 1322 1323 default: 1324 g_assert_not_reached(); 1325 } 1326 tcg_swap_vecop_list(hold_list); 1327 1328 if (oprsz < maxsz) { 1329 expand_clr(dofs + oprsz, maxsz - oprsz); 1330 } 1331 } 1332 1333 /* Expand a vector four-operand operation. */ 1334 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1335 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1336 { 1337 const TCGOpcode *this_list = g->opt_opc ? 
        : vecop_list_empty;
    const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            oprsz = maxsz;
        }
        break;

    default:
        g_assert_not_reached();
    }
    tcg_swap_vecop_list(hold_list);

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */
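
/*
 * The remainder of this file provides ready-made expanders (mov, dup, not,
 * add, sub, mul, saturating add/sub, min and so on), each built as a small
 * GVecGen* table dispatched through the generic expanders above.
 */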
1404 */ 1405 1406 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1407 { 1408 tcg_gen_mov_vec(a, b); 1409 } 1410 1411 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1412 uint32_t oprsz, uint32_t maxsz) 1413 { 1414 static const GVecGen2 g = { 1415 .fni8 = tcg_gen_mov_i64, 1416 .fniv = vec_mov2, 1417 .fno = gen_helper_gvec_mov, 1418 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1419 }; 1420 if (dofs != aofs) { 1421 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1422 } else { 1423 check_size_align(oprsz, maxsz, dofs); 1424 if (oprsz < maxsz) { 1425 expand_clr(dofs + oprsz, maxsz - oprsz); 1426 } 1427 } 1428 } 1429 1430 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1431 uint32_t maxsz, TCGv_i32 in) 1432 { 1433 check_size_align(oprsz, maxsz, dofs); 1434 tcg_debug_assert(vece <= MO_32); 1435 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1436 } 1437 1438 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1439 uint32_t maxsz, TCGv_i64 in) 1440 { 1441 check_size_align(oprsz, maxsz, dofs); 1442 tcg_debug_assert(vece <= MO_64); 1443 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1444 } 1445 1446 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1447 uint32_t oprsz, uint32_t maxsz) 1448 { 1449 if (vece <= MO_64) { 1450 TCGType type = choose_vector_type(0, vece, oprsz, 0); 1451 if (type != 0) { 1452 TCGv_vec t_vec = tcg_temp_new_vec(type); 1453 tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs); 1454 do_dup_store(type, dofs, oprsz, maxsz, t_vec); 1455 tcg_temp_free_vec(t_vec); 1456 return; 1457 } 1458 } 1459 if (vece <= MO_32) { 1460 TCGv_i32 in = tcg_temp_new_i32(); 1461 switch (vece) { 1462 case MO_8: 1463 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1464 break; 1465 case MO_16: 1466 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1467 break; 1468 case MO_32: 1469 tcg_gen_ld_i32(in, cpu_env, aofs); 1470 break; 1471 } 1472 tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in); 1473 tcg_temp_free_i32(in); 1474 } else if (vece == MO_64) { 1475 TCGv_i64 in = tcg_temp_new_i64(); 1476 tcg_gen_ld_i64(in, cpu_env, aofs); 1477 tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in); 1478 tcg_temp_free_i64(in); 1479 } else { 1480 /* 128-bit duplicate. */ 1481 /* ??? Dup to 256-bit vector. 
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
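/* For example, with 8-bit lanes the mask is 0x8080808080808080.  Clearing
   each lane's sign bit before the single 64-bit add keeps carries from
   crossing lane boundaries, and xor-ing in (a ^ b) & m afterwards restores
   the correct top bit of every lane.  */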
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opt_opc = vecop_list_add,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opt_opc = vecop_list_add,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opt_opc = vecop_list_add,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opt_opc = vecop_list_add,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
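
/*
 * Example of using the expanders above from a front end (sketch only; the
 * offsets are placeholders): to compute d[i] = a[i] + b[i] for each of
 * sixteen byte lanes, a translator would emit:
 *
 *     tcg_gen_gvec_add(MO_8, dofs, aofs, bofs, 16, 16);
 */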

static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opt_opc = vecop_list_sub,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opt_opc = vecop_list_sub,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opt_opc = vecop_list_sub,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opt_opc = vecop_list_sub,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
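
/*
 * Note that the MO_8 and MO_16 entries in the multiply tables below supply
 * only .fniv and .fno (there is no in-line i64/i32 expansion for
 * element-wise byte or halfword multiplies), so without host vector support
 * those element sizes fall back to the out-of-line helpers.
 */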

static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opt_opc = vecop_list_mul,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opt_opc = vecop_list_mul,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opt_opc = vecop_list_mul,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opt_opc = vecop_list_mul,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub8,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub16,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub32,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
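
/*
 * The i32/i64 fallbacks below implement unsigned saturation with a
 * compare-and-select: for usadd, d = a + b wraps iff d < a (unsigned), in
 * which case movcond replaces d with all-ones; for ussub, d = a - b
 * underflows iff a < b, in which case movcond replaces d with zero.
 */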
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1902 } 1903 1904 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1905 { 1906 TCGv_i32 max = tcg_const_i32(-1); 1907 tcg_gen_add_i32(d, a, b); 1908 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 1909 tcg_temp_free_i32(max); 1910 } 1911 1912 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1913 { 1914 TCGv_i64 max = tcg_const_i64(-1); 1915 tcg_gen_add_i64(d, a, b); 1916 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 1917 tcg_temp_free_i64(max); 1918 } 1919 1920 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1921 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1922 { 1923 static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 }; 1924 static const GVecGen3 g[4] = { 1925 { .fniv = tcg_gen_usadd_vec, 1926 .fno = gen_helper_gvec_usadd8, 1927 .opt_opc = vecop_list, 1928 .vece = MO_8 }, 1929 { .fniv = tcg_gen_usadd_vec, 1930 .fno = gen_helper_gvec_usadd16, 1931 .opt_opc = vecop_list, 1932 .vece = MO_16 }, 1933 { .fni4 = tcg_gen_usadd_i32, 1934 .fniv = tcg_gen_usadd_vec, 1935 .fno = gen_helper_gvec_usadd32, 1936 .opt_opc = vecop_list, 1937 .vece = MO_32 }, 1938 { .fni8 = tcg_gen_usadd_i64, 1939 .fniv = tcg_gen_usadd_vec, 1940 .fno = gen_helper_gvec_usadd64, 1941 .opt_opc = vecop_list, 1942 .vece = MO_64 } 1943 }; 1944 tcg_debug_assert(vece <= MO_64); 1945 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1946 } 1947 1948 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1949 { 1950 TCGv_i32 min = tcg_const_i32(0); 1951 tcg_gen_sub_i32(d, a, b); 1952 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 1953 tcg_temp_free_i32(min); 1954 } 1955 1956 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1957 { 1958 TCGv_i64 min = tcg_const_i64(0); 1959 tcg_gen_sub_i64(d, a, b); 1960 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 1961 tcg_temp_free_i64(min); 1962 } 1963 1964 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 1965 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1966 { 1967 static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 }; 1968 static const GVecGen3 g[4] = { 1969 { .fniv = tcg_gen_ussub_vec, 1970 .fno = gen_helper_gvec_ussub8, 1971 .opt_opc = vecop_list, 1972 .vece = MO_8 }, 1973 { .fniv = tcg_gen_ussub_vec, 1974 .fno = gen_helper_gvec_ussub16, 1975 .opt_opc = vecop_list, 1976 .vece = MO_16 }, 1977 { .fni4 = tcg_gen_ussub_i32, 1978 .fniv = tcg_gen_ussub_vec, 1979 .fno = gen_helper_gvec_ussub32, 1980 .opt_opc = vecop_list, 1981 .vece = MO_32 }, 1982 { .fni8 = tcg_gen_ussub_i64, 1983 .fniv = tcg_gen_ussub_vec, 1984 .fno = gen_helper_gvec_ussub64, 1985 .opt_opc = vecop_list, 1986 .vece = MO_64 } 1987 }; 1988 tcg_debug_assert(vece <= MO_64); 1989 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1990 } 1991 1992 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs, 1993 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1994 { 1995 static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 }; 1996 static const GVecGen3 g[4] = { 1997 { .fniv = tcg_gen_smin_vec, 1998 .fno = gen_helper_gvec_smin8, 1999 .opt_opc = vecop_list, 2000 .vece = MO_8 }, 2001 { .fniv = tcg_gen_smin_vec, 2002 .fno = gen_helper_gvec_smin16, 2003 .opt_opc = vecop_list, 2004 .vece = MO_16 }, 2005 { .fni4 = tcg_gen_smin_i32, 2006 .fniv = tcg_gen_smin_vec, 2007 .fno = gen_helper_gvec_smin32, 2008 .opt_opc = vecop_list, 2009 .vece = MO_32 }, 2010 { .fni8 = tcg_gen_smin_i64, 2011 .fniv = tcg_gen_smin_vec, 2012 
.fno = gen_helper_gvec_smin64, 2013 .opt_opc = vecop_list, 2014 .vece = MO_64 } 2015 }; 2016 tcg_debug_assert(vece <= MO_64); 2017 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2018 } 2019 2020 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs, 2021 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2022 { 2023 static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 }; 2024 static const GVecGen3 g[4] = { 2025 { .fniv = tcg_gen_umin_vec, 2026 .fno = gen_helper_gvec_umin8, 2027 .opt_opc = vecop_list, 2028 .vece = MO_8 }, 2029 { .fniv = tcg_gen_umin_vec, 2030 .fno = gen_helper_gvec_umin16, 2031 .opt_opc = vecop_list, 2032 .vece = MO_16 }, 2033 { .fni4 = tcg_gen_umin_i32, 2034 .fniv = tcg_gen_umin_vec, 2035 .fno = gen_helper_gvec_umin32, 2036 .opt_opc = vecop_list, 2037 .vece = MO_32 }, 2038 { .fni8 = tcg_gen_umin_i64, 2039 .fniv = tcg_gen_umin_vec, 2040 .fno = gen_helper_gvec_umin64, 2041 .opt_opc = vecop_list, 2042 .vece = MO_64 } 2043 }; 2044 tcg_debug_assert(vece <= MO_64); 2045 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2046 } 2047 2048 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs, 2049 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2050 { 2051 static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 }; 2052 static const GVecGen3 g[4] = { 2053 { .fniv = tcg_gen_smax_vec, 2054 .fno = gen_helper_gvec_smax8, 2055 .opt_opc = vecop_list, 2056 .vece = MO_8 }, 2057 { .fniv = tcg_gen_smax_vec, 2058 .fno = gen_helper_gvec_smax16, 2059 .opt_opc = vecop_list, 2060 .vece = MO_16 }, 2061 { .fni4 = tcg_gen_smax_i32, 2062 .fniv = tcg_gen_smax_vec, 2063 .fno = gen_helper_gvec_smax32, 2064 .opt_opc = vecop_list, 2065 .vece = MO_32 }, 2066 { .fni8 = tcg_gen_smax_i64, 2067 .fniv = tcg_gen_smax_vec, 2068 .fno = gen_helper_gvec_smax64, 2069 .opt_opc = vecop_list, 2070 .vece = MO_64 } 2071 }; 2072 tcg_debug_assert(vece <= MO_64); 2073 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2074 } 2075 2076 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs, 2077 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2078 { 2079 static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 }; 2080 static const GVecGen3 g[4] = { 2081 { .fniv = tcg_gen_umax_vec, 2082 .fno = gen_helper_gvec_umax8, 2083 .opt_opc = vecop_list, 2084 .vece = MO_8 }, 2085 { .fniv = tcg_gen_umax_vec, 2086 .fno = gen_helper_gvec_umax16, 2087 .opt_opc = vecop_list, 2088 .vece = MO_16 }, 2089 { .fni4 = tcg_gen_umax_i32, 2090 .fniv = tcg_gen_umax_vec, 2091 .fno = gen_helper_gvec_umax32, 2092 .opt_opc = vecop_list, 2093 .vece = MO_32 }, 2094 { .fni8 = tcg_gen_umax_i64, 2095 .fniv = tcg_gen_umax_vec, 2096 .fno = gen_helper_gvec_umax64, 2097 .opt_opc = vecop_list, 2098 .vece = MO_64 } 2099 }; 2100 tcg_debug_assert(vece <= MO_64); 2101 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2102 } 2103 2104 /* Perform a vector negation using normal negation and a mask. 2105 Compare gen_subv_mask above. 
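   Here M again has only the sign bit of each element set, e.g.
   dup_const(MO_8, 0x80) as used by tcg_gen_vec_neg8_i64 below.  Using M
   as the minuend stands in for negating zero with each element's msb
   forced on, so per-element borrows cannot ripple into the neighbouring
   element; the final xor then repairs the sign bit of each result.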
*/ 2106 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 2107 { 2108 TCGv_i64 t2 = tcg_temp_new_i64(); 2109 TCGv_i64 t3 = tcg_temp_new_i64(); 2110 2111 tcg_gen_andc_i64(t3, m, b); 2112 tcg_gen_andc_i64(t2, b, m); 2113 tcg_gen_sub_i64(d, m, t2); 2114 tcg_gen_xor_i64(d, d, t3); 2115 2116 tcg_temp_free_i64(t2); 2117 tcg_temp_free_i64(t3); 2118 } 2119 2120 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 2121 { 2122 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 2123 gen_negv_mask(d, b, m); 2124 tcg_temp_free_i64(m); 2125 } 2126 2127 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 2128 { 2129 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 2130 gen_negv_mask(d, b, m); 2131 tcg_temp_free_i64(m); 2132 } 2133 2134 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 2135 { 2136 TCGv_i64 t1 = tcg_temp_new_i64(); 2137 TCGv_i64 t2 = tcg_temp_new_i64(); 2138 2139 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 2140 tcg_gen_neg_i64(t2, b); 2141 tcg_gen_neg_i64(t1, t1); 2142 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 2143 2144 tcg_temp_free_i64(t1); 2145 tcg_temp_free_i64(t2); 2146 } 2147 2148 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 2149 uint32_t oprsz, uint32_t maxsz) 2150 { 2151 static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 }; 2152 static const GVecGen2 g[4] = { 2153 { .fni8 = tcg_gen_vec_neg8_i64, 2154 .fniv = tcg_gen_neg_vec, 2155 .fno = gen_helper_gvec_neg8, 2156 .opt_opc = vecop_list, 2157 .vece = MO_8 }, 2158 { .fni8 = tcg_gen_vec_neg16_i64, 2159 .fniv = tcg_gen_neg_vec, 2160 .fno = gen_helper_gvec_neg16, 2161 .opt_opc = vecop_list, 2162 .vece = MO_16 }, 2163 { .fni4 = tcg_gen_neg_i32, 2164 .fniv = tcg_gen_neg_vec, 2165 .fno = gen_helper_gvec_neg32, 2166 .opt_opc = vecop_list, 2167 .vece = MO_32 }, 2168 { .fni8 = tcg_gen_neg_i64, 2169 .fniv = tcg_gen_neg_vec, 2170 .fno = gen_helper_gvec_neg64, 2171 .opt_opc = vecop_list, 2172 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2173 .vece = MO_64 }, 2174 }; 2175 2176 tcg_debug_assert(vece <= MO_64); 2177 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2178 } 2179 2180 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece) 2181 { 2182 TCGv_i64 t = tcg_temp_new_i64(); 2183 int nbit = 8 << vece; 2184 2185 /* Create -1 for each negative element. */ 2186 tcg_gen_shri_i64(t, b, nbit - 1); 2187 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); 2188 tcg_gen_muli_i64(t, t, (1 << nbit) - 1); 2189 2190 /* 2191 * Invert (via xor -1) and add one to each negative element. 2192 * Because of the ordering the msb is cleared, 2193 * so we never have carry into the next element.
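 *
 * For example, with vece == MO_8 and adjacent elements 0x05 and 0xfe:
 * t is 0x00 and 0xff respectively, the xor gives 0x05 and 0x01, and
 * adding the masked-down 0x00 and 0x01 yields 0x05 and 0x02, the
 * per-element absolute values, without disturbing the neighbour.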
2194 */ 2195 tcg_gen_xor_i64(d, b, t); 2196 tcg_gen_andi_i64(t, t, dup_const(vece, 1)); tcg_gen_add_i64(d, d, t); 2197 2198 tcg_temp_free_i64(t); 2199 } 2200 2201 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b) 2202 { 2203 gen_absv_mask(d, b, MO_8); 2204 } 2205 2206 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b) 2207 { 2208 gen_absv_mask(d, b, MO_16); 2209 } 2210 2211 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs, 2212 uint32_t oprsz, uint32_t maxsz) 2213 { 2214 static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 }; 2215 static const GVecGen2 g[4] = { 2216 { .fni8 = tcg_gen_vec_abs8_i64, 2217 .fniv = tcg_gen_abs_vec, 2218 .fno = gen_helper_gvec_abs8, 2219 .opt_opc = vecop_list, 2220 .vece = MO_8 }, 2221 { .fni8 = tcg_gen_vec_abs16_i64, 2222 .fniv = tcg_gen_abs_vec, 2223 .fno = gen_helper_gvec_abs16, 2224 .opt_opc = vecop_list, 2225 .vece = MO_16 }, 2226 { .fni4 = tcg_gen_abs_i32, 2227 .fniv = tcg_gen_abs_vec, 2228 .fno = gen_helper_gvec_abs32, 2229 .opt_opc = vecop_list, 2230 .vece = MO_32 }, 2231 { .fni8 = tcg_gen_abs_i64, 2232 .fniv = tcg_gen_abs_vec, 2233 .fno = gen_helper_gvec_abs64, 2234 .opt_opc = vecop_list, 2235 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2236 .vece = MO_64 }, 2237 }; 2238 2239 tcg_debug_assert(vece <= MO_64); 2240 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 2241 } 2242 2243 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 2244 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2245 { 2246 static const GVecGen3 g = { 2247 .fni8 = tcg_gen_and_i64, 2248 .fniv = tcg_gen_and_vec, 2249 .fno = gen_helper_gvec_and, 2250 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2251 }; 2252 2253 if (aofs == bofs) { 2254 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2255 } else { 2256 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2257 } 2258 } 2259 2260 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2261 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2262 { 2263 static const GVecGen3 g = { 2264 .fni8 = tcg_gen_or_i64, 2265 .fniv = tcg_gen_or_vec, 2266 .fno = gen_helper_gvec_or, 2267 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2268 }; 2269 2270 if (aofs == bofs) { 2271 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2272 } else { 2273 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2274 } 2275 } 2276 2277 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2278 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2279 { 2280 static const GVecGen3 g = { 2281 .fni8 = tcg_gen_xor_i64, 2282 .fniv = tcg_gen_xor_vec, 2283 .fno = gen_helper_gvec_xor, 2284 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2285 }; 2286 2287 if (aofs == bofs) { 2288 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2289 } else { 2290 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2291 } 2292 } 2293 2294 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2295 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2296 { 2297 static const GVecGen3 g = { 2298 .fni8 = tcg_gen_andc_i64, 2299 .fniv = tcg_gen_andc_vec, 2300 .fno = gen_helper_gvec_andc, 2301 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2302 }; 2303 2304 if (aofs == bofs) { 2305 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2306 } else { 2307 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2308 } 2309 } 2310 2311 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2312 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2313 { 2314 static const GVecGen3 g = { 2315 .fni8 = tcg_gen_orc_i64, 2316 .fniv = tcg_gen_orc_vec, 2317 .fno = gen_helper_gvec_orc, 2318 .prefer_i64 = TCG_TARGET_REG_BITS ==
64, 2319 }; 2320 2321 if (aofs == bofs) { 2322 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2323 } else { 2324 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2325 } 2326 } 2327 2328 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2329 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2330 { 2331 static const GVecGen3 g = { 2332 .fni8 = tcg_gen_nand_i64, 2333 .fniv = tcg_gen_nand_vec, 2334 .fno = gen_helper_gvec_nand, 2335 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2336 }; 2337 2338 if (aofs == bofs) { 2339 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2340 } else { 2341 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2342 } 2343 } 2344 2345 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2346 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2347 { 2348 static const GVecGen3 g = { 2349 .fni8 = tcg_gen_nor_i64, 2350 .fniv = tcg_gen_nor_vec, 2351 .fno = gen_helper_gvec_nor, 2352 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2353 }; 2354 2355 if (aofs == bofs) { 2356 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2357 } else { 2358 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2359 } 2360 } 2361 2362 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2363 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2364 { 2365 static const GVecGen3 g = { 2366 .fni8 = tcg_gen_eqv_i64, 2367 .fniv = tcg_gen_eqv_vec, 2368 .fno = gen_helper_gvec_eqv, 2369 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2370 }; 2371 2372 if (aofs == bofs) { 2373 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2374 } else { 2375 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2376 } 2377 } 2378 2379 static const GVecGen2s gop_ands = { 2380 .fni8 = tcg_gen_and_i64, 2381 .fniv = tcg_gen_and_vec, 2382 .fno = gen_helper_gvec_ands, 2383 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2384 .vece = MO_64 2385 }; 2386 2387 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2388 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2389 { 2390 TCGv_i64 tmp = tcg_temp_new_i64(); 2391 gen_dup_i64(vece, tmp, c); 2392 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2393 tcg_temp_free_i64(tmp); 2394 } 2395 2396 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2397 int64_t c, uint32_t oprsz, uint32_t maxsz) 2398 { 2399 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2400 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2401 tcg_temp_free_i64(tmp); 2402 } 2403 2404 static const GVecGen2s gop_xors = { 2405 .fni8 = tcg_gen_xor_i64, 2406 .fniv = tcg_gen_xor_vec, 2407 .fno = gen_helper_gvec_xors, 2408 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2409 .vece = MO_64 2410 }; 2411 2412 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2413 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2414 { 2415 TCGv_i64 tmp = tcg_temp_new_i64(); 2416 gen_dup_i64(vece, tmp, c); 2417 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2418 tcg_temp_free_i64(tmp); 2419 } 2420 2421 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2422 int64_t c, uint32_t oprsz, uint32_t maxsz) 2423 { 2424 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2425 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2426 tcg_temp_free_i64(tmp); 2427 } 2428 2429 static const GVecGen2s gop_ors = { 2430 .fni8 = tcg_gen_or_i64, 2431 .fniv = tcg_gen_or_vec, 2432 .fno = gen_helper_gvec_ors, 2433 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2434 .vece = MO_64 2435 }; 2436 2437 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2438 TCGv_i64 c, 
uint32_t oprsz, uint32_t maxsz) 2439 { 2440 TCGv_i64 tmp = tcg_temp_new_i64(); 2441 gen_dup_i64(vece, tmp, c); 2442 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2443 tcg_temp_free_i64(tmp); 2444 } 2445 2446 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2447 int64_t c, uint32_t oprsz, uint32_t maxsz) 2448 { 2449 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2450 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2451 tcg_temp_free_i64(tmp); 2452 } 2453 2454 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2455 { 2456 uint64_t mask = dup_const(MO_8, 0xff << c); 2457 tcg_gen_shli_i64(d, a, c); 2458 tcg_gen_andi_i64(d, d, mask); 2459 } 2460 2461 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2462 { 2463 uint64_t mask = dup_const(MO_16, 0xffff << c); 2464 tcg_gen_shli_i64(d, a, c); 2465 tcg_gen_andi_i64(d, d, mask); 2466 } 2467 2468 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2469 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2470 { 2471 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 2472 static const GVecGen2i g[4] = { 2473 { .fni8 = tcg_gen_vec_shl8i_i64, 2474 .fniv = tcg_gen_shli_vec, 2475 .fno = gen_helper_gvec_shl8i, 2476 .opt_opc = vecop_list, 2477 .vece = MO_8 }, 2478 { .fni8 = tcg_gen_vec_shl16i_i64, 2479 .fniv = tcg_gen_shli_vec, 2480 .fno = gen_helper_gvec_shl16i, 2481 .opt_opc = vecop_list, 2482 .vece = MO_16 }, 2483 { .fni4 = tcg_gen_shli_i32, 2484 .fniv = tcg_gen_shli_vec, 2485 .fno = gen_helper_gvec_shl32i, 2486 .opt_opc = vecop_list, 2487 .vece = MO_32 }, 2488 { .fni8 = tcg_gen_shli_i64, 2489 .fniv = tcg_gen_shli_vec, 2490 .fno = gen_helper_gvec_shl64i, 2491 .opt_opc = vecop_list, 2492 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2493 .vece = MO_64 }, 2494 }; 2495 2496 tcg_debug_assert(vece <= MO_64); 2497 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2498 if (shift == 0) { 2499 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2500 } else { 2501 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2502 } 2503 } 2504 2505 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2506 { 2507 uint64_t mask = dup_const(MO_8, 0xff >> c); 2508 tcg_gen_shri_i64(d, a, c); 2509 tcg_gen_andi_i64(d, d, mask); 2510 } 2511 2512 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2513 { 2514 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2515 tcg_gen_shri_i64(d, a, c); 2516 tcg_gen_andi_i64(d, d, mask); 2517 } 2518 2519 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2520 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2521 { 2522 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 2523 static const GVecGen2i g[4] = { 2524 { .fni8 = tcg_gen_vec_shr8i_i64, 2525 .fniv = tcg_gen_shri_vec, 2526 .fno = gen_helper_gvec_shr8i, 2527 .opt_opc = vecop_list, 2528 .vece = MO_8 }, 2529 { .fni8 = tcg_gen_vec_shr16i_i64, 2530 .fniv = tcg_gen_shri_vec, 2531 .fno = gen_helper_gvec_shr16i, 2532 .opt_opc = vecop_list, 2533 .vece = MO_16 }, 2534 { .fni4 = tcg_gen_shri_i32, 2535 .fniv = tcg_gen_shri_vec, 2536 .fno = gen_helper_gvec_shr32i, 2537 .opt_opc = vecop_list, 2538 .vece = MO_32 }, 2539 { .fni8 = tcg_gen_shri_i64, 2540 .fniv = tcg_gen_shri_vec, 2541 .fno = gen_helper_gvec_shr64i, 2542 .opt_opc = vecop_list, 2543 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2544 .vece = MO_64 }, 2545 }; 2546 2547 tcg_debug_assert(vece <= MO_64); 2548 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2549 if (shift == 0) { 2550 tcg_gen_gvec_mov(vece, dofs, 
aofs, oprsz, maxsz); 2551 } else { 2552 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2553 } 2554 } 2555 2556 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2557 { 2558 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2559 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2560 TCGv_i64 s = tcg_temp_new_i64(); 2561 2562 tcg_gen_shri_i64(d, a, c); 2563 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2564 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2565 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2566 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2567 tcg_temp_free_i64(s); 2568 } 2569 2570 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2571 { 2572 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2573 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2574 TCGv_i64 s = tcg_temp_new_i64(); 2575 2576 tcg_gen_shri_i64(d, a, c); 2577 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2578 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2579 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2580 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2581 tcg_temp_free_i64(s); 2582 } 2583 2584 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2585 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2586 { 2587 static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 }; 2588 static const GVecGen2i g[4] = { 2589 { .fni8 = tcg_gen_vec_sar8i_i64, 2590 .fniv = tcg_gen_sari_vec, 2591 .fno = gen_helper_gvec_sar8i, 2592 .opt_opc = vecop_list, 2593 .vece = MO_8 }, 2594 { .fni8 = tcg_gen_vec_sar16i_i64, 2595 .fniv = tcg_gen_sari_vec, 2596 .fno = gen_helper_gvec_sar16i, 2597 .opt_opc = vecop_list, 2598 .vece = MO_16 }, 2599 { .fni4 = tcg_gen_sari_i32, 2600 .fniv = tcg_gen_sari_vec, 2601 .fno = gen_helper_gvec_sar32i, 2602 .opt_opc = vecop_list, 2603 .vece = MO_32 }, 2604 { .fni8 = tcg_gen_sari_i64, 2605 .fniv = tcg_gen_sari_vec, 2606 .fno = gen_helper_gvec_sar64i, 2607 .opt_opc = vecop_list, 2608 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2609 .vece = MO_64 }, 2610 }; 2611 2612 tcg_debug_assert(vece <= MO_64); 2613 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2614 if (shift == 0) { 2615 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2616 } else { 2617 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2618 } 2619 } 2620 2621 /* 2622 * Specialized generation vector shifts by a non-constant scalar. 
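 *
 * do_gvec_shifts below tries, in order: the backend's shift-by-scalar
 * vector ops (s_list), the backend's shift-by-vector ops (v_list) with
 * the scalar broadcast into a temporary vector, an inline i32/i64
 * expansion, and finally an out-of-line helper with the shift count
 * folded into the descriptor's data field.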
2623 */ 2624 2625 typedef struct { 2626 void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); 2627 void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); 2628 void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32); 2629 void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); 2630 gen_helper_gvec_2 *fno[4]; 2631 TCGOpcode s_list[2]; 2632 TCGOpcode v_list[2]; 2633 } GVecGen2sh; 2634 2635 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2636 uint32_t oprsz, uint32_t tysz, TCGType type, 2637 TCGv_i32 shift, 2638 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32)) 2639 { 2640 TCGv_vec t0 = tcg_temp_new_vec(type); 2641 uint32_t i; 2642 2643 for (i = 0; i < oprsz; i += tysz) { 2644 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2645 fni(vece, t0, t0, shift); 2646 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2647 } 2648 tcg_temp_free_vec(t0); 2649 } 2650 2651 static void 2652 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift, 2653 uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g) 2654 { 2655 TCGType type; 2656 uint32_t some; 2657 2658 check_size_align(oprsz, maxsz, dofs | aofs); 2659 check_overlap_2(dofs, aofs, maxsz); 2660 2661 /* If the backend has a scalar expansion, great. */ 2662 type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64); 2663 if (type) { 2664 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2665 switch (type) { 2666 case TCG_TYPE_V256: 2667 some = QEMU_ALIGN_DOWN(oprsz, 32); 2668 expand_2sh_vec(vece, dofs, aofs, some, 32, 2669 TCG_TYPE_V256, shift, g->fniv_s); 2670 if (some == oprsz) { 2671 break; 2672 } 2673 dofs += some; 2674 aofs += some; 2675 oprsz -= some; 2676 maxsz -= some; 2677 /* fallthru */ 2678 case TCG_TYPE_V128: 2679 expand_2sh_vec(vece, dofs, aofs, oprsz, 16, 2680 TCG_TYPE_V128, shift, g->fniv_s); 2681 break; 2682 case TCG_TYPE_V64: 2683 expand_2sh_vec(vece, dofs, aofs, oprsz, 8, 2684 TCG_TYPE_V64, shift, g->fniv_s); 2685 break; 2686 default: 2687 g_assert_not_reached(); 2688 } 2689 tcg_swap_vecop_list(hold_list); 2690 goto clear_tail; 2691 } 2692 2693 /* If the backend supports variable vector shifts, also cool. */ 2694 type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64); 2695 if (type) { 2696 const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL); 2697 TCGv_vec v_shift = tcg_temp_new_vec(type); 2698 2699 if (vece == MO_64) { 2700 TCGv_i64 sh64 = tcg_temp_new_i64(); 2701 tcg_gen_extu_i32_i64(sh64, shift); 2702 tcg_gen_dup_i64_vec(MO_64, v_shift, sh64); 2703 tcg_temp_free_i64(sh64); 2704 } else { 2705 tcg_gen_dup_i32_vec(vece, v_shift, shift); 2706 } 2707 2708 switch (type) { 2709 case TCG_TYPE_V256: 2710 some = QEMU_ALIGN_DOWN(oprsz, 32); 2711 expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256, 2712 v_shift, false, g->fniv_v); 2713 if (some == oprsz) { 2714 break; 2715 } 2716 dofs += some; 2717 aofs += some; 2718 oprsz -= some; 2719 maxsz -= some; 2720 /* fallthru */ 2721 case TCG_TYPE_V128: 2722 expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 2723 v_shift, false, g->fniv_v); 2724 break; 2725 case TCG_TYPE_V64: 2726 expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 2727 v_shift, false, g->fniv_v); 2728 break; 2729 default: 2730 g_assert_not_reached(); 2731 } 2732 tcg_temp_free_vec(v_shift); 2733 tcg_swap_vecop_list(hold_list); 2734 goto clear_tail; 2735 } 2736 2737 /* Otherwise fall back to integral... 
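   That is: use the fni4/fni8 callback for MO_32/MO_64, or else call the
   out-of-line immediate-shift helper with the run-time shift value
   passed in the descriptor's data field.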
*/ 2738 if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2739 expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4); 2740 } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2741 TCGv_i64 sh64 = tcg_temp_new_i64(); 2742 tcg_gen_extu_i32_i64(sh64, shift); 2743 expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8); 2744 tcg_temp_free_i64(sh64); 2745 } else { 2746 TCGv_ptr a0 = tcg_temp_new_ptr(); 2747 TCGv_ptr a1 = tcg_temp_new_ptr(); 2748 TCGv_i32 desc = tcg_temp_new_i32(); 2749 2750 tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT); 2751 tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0)); 2752 tcg_gen_addi_ptr(a0, cpu_env, dofs); 2753 tcg_gen_addi_ptr(a1, cpu_env, aofs); 2754 2755 g->fno[vece](a0, a1, desc); 2756 2757 tcg_temp_free_ptr(a0); 2758 tcg_temp_free_ptr(a1); 2759 tcg_temp_free_i32(desc); 2760 return; 2761 } 2762 2763 clear_tail: 2764 if (oprsz < maxsz) { 2765 expand_clr(dofs + oprsz, maxsz - oprsz); 2766 } 2767 } 2768 2769 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs, 2770 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2771 { 2772 static const GVecGen2sh g = { 2773 .fni4 = tcg_gen_shl_i32, 2774 .fni8 = tcg_gen_shl_i64, 2775 .fniv_s = tcg_gen_shls_vec, 2776 .fniv_v = tcg_gen_shlv_vec, 2777 .fno = { 2778 gen_helper_gvec_shl8i, 2779 gen_helper_gvec_shl16i, 2780 gen_helper_gvec_shl32i, 2781 gen_helper_gvec_shl64i, 2782 }, 2783 .s_list = { INDEX_op_shls_vec, 0 }, 2784 .v_list = { INDEX_op_shlv_vec, 0 }, 2785 }; 2786 2787 tcg_debug_assert(vece <= MO_64); 2788 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2789 } 2790 2791 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs, 2792 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2793 { 2794 static const GVecGen2sh g = { 2795 .fni4 = tcg_gen_shr_i32, 2796 .fni8 = tcg_gen_shr_i64, 2797 .fniv_s = tcg_gen_shrs_vec, 2798 .fniv_v = tcg_gen_shrv_vec, 2799 .fno = { 2800 gen_helper_gvec_shr8i, 2801 gen_helper_gvec_shr16i, 2802 gen_helper_gvec_shr32i, 2803 gen_helper_gvec_shr64i, 2804 }, 2805 .s_list = { INDEX_op_shrs_vec, 0 }, 2806 .v_list = { INDEX_op_shrv_vec, 0 }, 2807 }; 2808 2809 tcg_debug_assert(vece <= MO_64); 2810 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2811 } 2812 2813 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs, 2814 TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz) 2815 { 2816 static const GVecGen2sh g = { 2817 .fni4 = tcg_gen_sar_i32, 2818 .fni8 = tcg_gen_sar_i64, 2819 .fniv_s = tcg_gen_sars_vec, 2820 .fniv_v = tcg_gen_sarv_vec, 2821 .fno = { 2822 gen_helper_gvec_sar8i, 2823 gen_helper_gvec_sar16i, 2824 gen_helper_gvec_sar32i, 2825 gen_helper_gvec_sar64i, 2826 }, 2827 .s_list = { INDEX_op_sars_vec, 0 }, 2828 .v_list = { INDEX_op_sarv_vec, 0 }, 2829 }; 2830 2831 tcg_debug_assert(vece <= MO_64); 2832 do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g); 2833 } 2834 2835 /* 2836 * Expand D = A << (B % element bits) 2837 * 2838 * Unlike scalar shifts, where the target front end can easily fold 2839 * the modulo into the expansion, vector shifts need the masking done 2840 * explicitly here. If the target naturally includes the modulo as 2841 * part of the operation, great! If the target has some other 2842 * behaviour for out-of-range shifts, then it could not use this 2843 * function anyway, and would need to do its own expansion with custom functions.
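 *
 * For example, with vece == MO_8 a per-element count of 9 is reduced
 * to 9 & 7 == 1 before the shift, matching the i32/i64 variants below
 * that mask the count with 31 or 63.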
2844 */ 2845 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d, 2846 TCGv_vec a, TCGv_vec b) 2847 { 2848 TCGv_vec t = tcg_temp_new_vec_matching(d); 2849 2850 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2851 tcg_gen_and_vec(vece, t, t, b); 2852 tcg_gen_shlv_vec(vece, d, a, t); 2853 tcg_temp_free_vec(t); 2854 } 2855 2856 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2857 { 2858 TCGv_i32 t = tcg_temp_new_i32(); 2859 2860 tcg_gen_andi_i32(t, b, 31); 2861 tcg_gen_shl_i32(d, a, t); 2862 tcg_temp_free_i32(t); 2863 } 2864 2865 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2866 { 2867 TCGv_i64 t = tcg_temp_new_i64(); 2868 2869 tcg_gen_andi_i64(t, b, 63); 2870 tcg_gen_shl_i64(d, a, t); 2871 tcg_temp_free_i64(t); 2872 } 2873 2874 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs, 2875 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2876 { 2877 static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 }; 2878 static const GVecGen3 g[4] = { 2879 { .fniv = tcg_gen_shlv_mod_vec, 2880 .fno = gen_helper_gvec_shl8v, 2881 .opt_opc = vecop_list, 2882 .vece = MO_8 }, 2883 { .fniv = tcg_gen_shlv_mod_vec, 2884 .fno = gen_helper_gvec_shl16v, 2885 .opt_opc = vecop_list, 2886 .vece = MO_16 }, 2887 { .fni4 = tcg_gen_shl_mod_i32, 2888 .fniv = tcg_gen_shlv_mod_vec, 2889 .fno = gen_helper_gvec_shl32v, 2890 .opt_opc = vecop_list, 2891 .vece = MO_32 }, 2892 { .fni8 = tcg_gen_shl_mod_i64, 2893 .fniv = tcg_gen_shlv_mod_vec, 2894 .fno = gen_helper_gvec_shl64v, 2895 .opt_opc = vecop_list, 2896 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2897 .vece = MO_64 }, 2898 }; 2899 2900 tcg_debug_assert(vece <= MO_64); 2901 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2902 } 2903 2904 /* 2905 * Similarly for logical right shifts. 
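 * That is, D = A >> (B % element bits), using the same masking of the
 * shift count as above.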
2906 */ 2907 2908 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d, 2909 TCGv_vec a, TCGv_vec b) 2910 { 2911 TCGv_vec t = tcg_temp_new_vec_matching(d); 2912 2913 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2914 tcg_gen_and_vec(vece, t, t, b); 2915 tcg_gen_shrv_vec(vece, d, a, t); 2916 tcg_temp_free_vec(t); 2917 } 2918 2919 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2920 { 2921 TCGv_i32 t = tcg_temp_new_i32(); 2922 2923 tcg_gen_andi_i32(t, b, 31); 2924 tcg_gen_shr_i32(d, a, t); 2925 tcg_temp_free_i32(t); 2926 } 2927 2928 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2929 { 2930 TCGv_i64 t = tcg_temp_new_i64(); 2931 2932 tcg_gen_andi_i64(t, b, 63); 2933 tcg_gen_shr_i64(d, a, t); 2934 tcg_temp_free_i64(t); 2935 } 2936 2937 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs, 2938 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2939 { 2940 static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 }; 2941 static const GVecGen3 g[4] = { 2942 { .fniv = tcg_gen_shrv_mod_vec, 2943 .fno = gen_helper_gvec_shr8v, 2944 .opt_opc = vecop_list, 2945 .vece = MO_8 }, 2946 { .fniv = tcg_gen_shrv_mod_vec, 2947 .fno = gen_helper_gvec_shr16v, 2948 .opt_opc = vecop_list, 2949 .vece = MO_16 }, 2950 { .fni4 = tcg_gen_shr_mod_i32, 2951 .fniv = tcg_gen_shrv_mod_vec, 2952 .fno = gen_helper_gvec_shr32v, 2953 .opt_opc = vecop_list, 2954 .vece = MO_32 }, 2955 { .fni8 = tcg_gen_shr_mod_i64, 2956 .fniv = tcg_gen_shrv_mod_vec, 2957 .fno = gen_helper_gvec_shr64v, 2958 .opt_opc = vecop_list, 2959 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2960 .vece = MO_64 }, 2961 }; 2962 2963 tcg_debug_assert(vece <= MO_64); 2964 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 2965 } 2966 2967 /* 2968 * Similarly for arithmetic right shifts. 
2969 */ 2970 2971 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d, 2972 TCGv_vec a, TCGv_vec b) 2973 { 2974 TCGv_vec t = tcg_temp_new_vec_matching(d); 2975 2976 tcg_gen_dupi_vec(vece, t, (8 << vece) - 1); 2977 tcg_gen_and_vec(vece, t, t, b); 2978 tcg_gen_sarv_vec(vece, d, a, t); 2979 tcg_temp_free_vec(t); 2980 } 2981 2982 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 2983 { 2984 TCGv_i32 t = tcg_temp_new_i32(); 2985 2986 tcg_gen_andi_i32(t, b, 31); 2987 tcg_gen_sar_i32(d, a, t); 2988 tcg_temp_free_i32(t); 2989 } 2990 2991 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 2992 { 2993 TCGv_i64 t = tcg_temp_new_i64(); 2994 2995 tcg_gen_andi_i64(t, b, 63); 2996 tcg_gen_sar_i64(d, a, t); 2997 tcg_temp_free_i64(t); 2998 } 2999 3000 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs, 3001 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 3002 { 3003 static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 }; 3004 static const GVecGen3 g[4] = { 3005 { .fniv = tcg_gen_sarv_mod_vec, 3006 .fno = gen_helper_gvec_sar8v, 3007 .opt_opc = vecop_list, 3008 .vece = MO_8 }, 3009 { .fniv = tcg_gen_sarv_mod_vec, 3010 .fno = gen_helper_gvec_sar16v, 3011 .opt_opc = vecop_list, 3012 .vece = MO_16 }, 3013 { .fni4 = tcg_gen_sar_mod_i32, 3014 .fniv = tcg_gen_sarv_mod_vec, 3015 .fno = gen_helper_gvec_sar32v, 3016 .opt_opc = vecop_list, 3017 .vece = MO_32 }, 3018 { .fni8 = tcg_gen_sar_mod_i64, 3019 .fniv = tcg_gen_sarv_mod_vec, 3020 .fno = gen_helper_gvec_sar64v, 3021 .opt_opc = vecop_list, 3022 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 3023 .vece = MO_64 }, 3024 }; 3025 3026 tcg_debug_assert(vece <= MO_64); 3027 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 3028 } 3029 3030 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
*/ 3031 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3032 uint32_t oprsz, TCGCond cond) 3033 { 3034 TCGv_i32 t0 = tcg_temp_new_i32(); 3035 TCGv_i32 t1 = tcg_temp_new_i32(); 3036 uint32_t i; 3037 3038 for (i = 0; i < oprsz; i += 4) { 3039 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 3040 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 3041 tcg_gen_setcond_i32(cond, t0, t0, t1); 3042 tcg_gen_neg_i32(t0, t0); 3043 tcg_gen_st_i32(t0, cpu_env, dofs + i); 3044 } 3045 tcg_temp_free_i32(t1); 3046 tcg_temp_free_i32(t0); 3047 } 3048 3049 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 3050 uint32_t oprsz, TCGCond cond) 3051 { 3052 TCGv_i64 t0 = tcg_temp_new_i64(); 3053 TCGv_i64 t1 = tcg_temp_new_i64(); 3054 uint32_t i; 3055 3056 for (i = 0; i < oprsz; i += 8) { 3057 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 3058 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 3059 tcg_gen_setcond_i64(cond, t0, t0, t1); 3060 tcg_gen_neg_i64(t0, t0); 3061 tcg_gen_st_i64(t0, cpu_env, dofs + i); 3062 } 3063 tcg_temp_free_i64(t1); 3064 tcg_temp_free_i64(t0); 3065 } 3066 3067 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 3068 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 3069 TCGType type, TCGCond cond) 3070 { 3071 TCGv_vec t0 = tcg_temp_new_vec(type); 3072 TCGv_vec t1 = tcg_temp_new_vec(type); 3073 uint32_t i; 3074 3075 for (i = 0; i < oprsz; i += tysz) { 3076 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 3077 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 3078 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 3079 tcg_gen_st_vec(t0, cpu_env, dofs + i); 3080 } 3081 tcg_temp_free_vec(t1); 3082 tcg_temp_free_vec(t0); 3083 } 3084 3085 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 3086 uint32_t aofs, uint32_t bofs, 3087 uint32_t oprsz, uint32_t maxsz) 3088 { 3089 static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 }; 3090 static gen_helper_gvec_3 * const eq_fn[4] = { 3091 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 3092 gen_helper_gvec_eq32, gen_helper_gvec_eq64 3093 }; 3094 static gen_helper_gvec_3 * const ne_fn[4] = { 3095 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 3096 gen_helper_gvec_ne32, gen_helper_gvec_ne64 3097 }; 3098 static gen_helper_gvec_3 * const lt_fn[4] = { 3099 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 3100 gen_helper_gvec_lt32, gen_helper_gvec_lt64 3101 }; 3102 static gen_helper_gvec_3 * const le_fn[4] = { 3103 gen_helper_gvec_le8, gen_helper_gvec_le16, 3104 gen_helper_gvec_le32, gen_helper_gvec_le64 3105 }; 3106 static gen_helper_gvec_3 * const ltu_fn[4] = { 3107 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 3108 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 3109 }; 3110 static gen_helper_gvec_3 * const leu_fn[4] = { 3111 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 3112 gen_helper_gvec_leu32, gen_helper_gvec_leu64 3113 }; 3114 static gen_helper_gvec_3 * const * const fns[16] = { 3115 [TCG_COND_EQ] = eq_fn, 3116 [TCG_COND_NE] = ne_fn, 3117 [TCG_COND_LT] = lt_fn, 3118 [TCG_COND_LE] = le_fn, 3119 [TCG_COND_LTU] = ltu_fn, 3120 [TCG_COND_LEU] = leu_fn, 3121 }; 3122 3123 const TCGOpcode *hold_list; 3124 TCGType type; 3125 uint32_t some; 3126 3127 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 3128 check_overlap_3(dofs, aofs, bofs, maxsz); 3129 3130 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 3131 do_dup(MO_8, dofs, oprsz, maxsz, 3132 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 3133 return; 3134 } 3135 3136 /* 3137 * Implement inline with a vector type, if possible. 3138 * Prefer integer when 64-bit host and 64-bit comparison. 
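 *
 * Failing both, expand element-by-element with i64 or i32, and as a
 * last resort call an out-of-line helper; only the six conditions in
 * fns[] have helpers, so the operands and the condition are swapped
 * first when only the reversed comparison is available.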
3139 */ 3140 hold_list = tcg_swap_vecop_list(cmp_list); 3141 type = choose_vector_type(cmp_list, vece, oprsz, 3142 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 3143 switch (type) { 3144 case TCG_TYPE_V256: 3145 /* Recall that ARM SVE allows vector sizes that are not a 3146 * power of 2, but always a multiple of 16. The intent is 3147 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 3148 */ 3149 some = QEMU_ALIGN_DOWN(oprsz, 32); 3150 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 3151 if (some == oprsz) { 3152 break; 3153 } 3154 dofs += some; 3155 aofs += some; 3156 bofs += some; 3157 oprsz -= some; 3158 maxsz -= some; 3159 /* fallthru */ 3160 case TCG_TYPE_V128: 3161 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 3162 break; 3163 case TCG_TYPE_V64: 3164 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 3165 break; 3166 3167 case 0: 3168 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 3169 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 3170 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 3171 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 3172 } else { 3173 gen_helper_gvec_3 * const *fn = fns[cond]; 3174 3175 if (fn == NULL) { 3176 uint32_t tmp; 3177 tmp = aofs, aofs = bofs, bofs = tmp; 3178 cond = tcg_swap_cond(cond); 3179 fn = fns[cond]; 3180 assert(fn != NULL); 3181 } 3182 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 3183 oprsz = maxsz; 3184 } 3185 break; 3186 3187 default: 3188 g_assert_not_reached(); 3189 } 3190 tcg_swap_vecop_list(hold_list); 3191 3192 if (oprsz < maxsz) { 3193 expand_clr(dofs + oprsz, maxsz - oprsz); 3194 } 3195 } 3196
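/*
 * Illustrative usage sketch (dofs/aofs/bofs are assumed to be
 * 16-byte-aligned offsets of three 16-byte vector registers within
 * CPUState): a target front end could emit an element-wise byte
 * subtract and a per-element modulo shift with
 *
 *     tcg_gen_gvec_sub(MO_8, dofs, aofs, bofs, 16, 16);
 *     tcg_gen_gvec_shlv(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * Each call selects host vector ops, an inline i32/i64 expansion, or an
 * out-of-line helper according to the tables above.
 */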