/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

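/*
 * Worked example (field layout as defined in tcg-gvec-desc.h):
 * simd_desc(16, 32, 0) stores 16 / 8 - 1 = 1 in the OPRSZ field and
 * 32 / 8 - 1 = 3 in the MAXSZ field; an out-of-line helper recovers
 * the byte counts again with simd_oprsz(desc) and simd_maxsz(desc),
 * and the signed DATA value with simd_data(desc).
 */
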
/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

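/*
 * For example, dup_const(MO_16, 0x1234) == 0x1234123412341234ull.  The
 * parentheses around the function name above keep a same-named
 * function-like macro from expanding at the definition.  gen_dup_i32 and
 * gen_dup_i64 below perform the same replication on run-time TCG values,
 * using multiplication by 0x01...01 constants or a deposit of the value
 * into its own high half.
 */
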
/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(0, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
            /* fallthru */
        case TCG_TYPE_V128:
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
            break;
        case TCG_TYPE_V64:
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
            break;
        default:
            g_assert_not_reached();
        }

        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i32(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz, bool write_aofs,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_i64(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool write_aofs,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
        if (write_aofs) {
            tcg_gen_st_vec(t1, cpu_env, aofs + i);
        }
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

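/*
 * The tcg_gen_gvec_* expanders below all follow the same pattern: if the
 * backend can implement the operation on a host vector type (g->fniv plus
 * a usable g->opc), expand with that type, splitting an SVE-style odd size
 * into 32-byte chunks followed by a 16-byte tail; otherwise fall back to
 * the 64-bit (g->fni8) or 32-bit (g->fni4) integer expansion when the size
 * is small enough; otherwise call the out-of-line helper (g->fno).  In the
 * inline cases, any bytes between oprsz and maxsz are cleared afterwards.
 */
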
/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

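/*
 * tcg_gen_gvec_2i above handles a constant known at translation time:
 * its out-of-line fallback prefers g->fno, passing the constant in the
 * descriptor's DATA field, and only materializes it into a TCGv_i64 for
 * g->fnoi when no such helper is provided.  The 2s form below takes a
 * run-time TCGv_i64 scalar instead and broadcasts it with
 * tcg_gen_dup_i64_vec, gen_dup_i64 or gen_dup_i32 as appropriate.
 */
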
/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->write_aofs, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->write_aofs, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->write_aofs, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
                         g->write_aofs, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

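/*
 * Illustrative use from a target front end (hypothetical CPUMyState and
 * register layout, for exposition only): the offsets are byte offsets of
 * the in-memory vector registers within the CPU env, e.g.
 *
 *     tcg_gen_gvec_add(MO_32,
 *                      offsetof(CPUMyState, vreg[rd]),
 *                      offsetof(CPUMyState, vreg[rn]),
 *                      offsetof(CPUMyState, vreg[rm]),
 *                      16, 16);
 *
 * where oprsz is the number of bytes actually operated on and maxsz is
 * the full register size; any bytes between the two are zeroed.
 */
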
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

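/*
 * A note on the masked-add trick used by gen_addv_mask below: with M the
 * per-lane sign-bit mask, D = ((A & ~M) + (B & ~M)) ^ ((A ^ B) & M).
 * Clearing each lane's sign bit before the full 64-bit addition prevents
 * carries from crossing lane boundaries, so the masked sum's sign-bit
 * position holds only the carry into it; XORing in (A ^ B) & M then
 * restores the true per-lane sign bit (a ^ b ^ carry-in).
 */
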
/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd8,
          .opc = INDEX_op_ssadd_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd16,
          .opc = INDEX_op_ssadd_vec,
          .vece = MO_16 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd32,
          .opc = INDEX_op_ssadd_vec,
          .vece = MO_32 },
        { .fniv = tcg_gen_ssadd_vec,
          .fno = gen_helper_gvec_ssadd64,
          .opc = INDEX_op_ssadd_vec,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub8,
          .opc = INDEX_op_sssub_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub16,
          .opc = INDEX_op_sssub_vec,
          .vece = MO_16 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub32,
          .opc = INDEX_op_sssub_vec,
          .vece = MO_32 },
        { .fniv = tcg_gen_sssub_vec,
          .fno = gen_helper_gvec_sssub64,
          .opc = INDEX_op_sssub_vec,
          .vece = MO_64 },
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

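/*
 * The integer fallbacks below use the usual branch-free saturation tests:
 * after d = a + b, unsigned overflow happened iff d < a, in which case
 * movcond replaces d with all-ones; after d = a - b, unsigned underflow
 * happened iff a < b, in which case d is replaced with zero.
 */
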
static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_const_i32(-1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i32(max);
}

static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_const_i64(-1);
    tcg_gen_add_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i64(max);
}

void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd8,
          .opc = INDEX_op_usadd_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd16,
          .opc = INDEX_op_usadd_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_usadd_i32,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd32,
          .opc = INDEX_op_usadd_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_usadd_i64,
          .fniv = tcg_gen_usadd_vec,
          .fno = gen_helper_gvec_usadd64,
          .opc = INDEX_op_usadd_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_const_i32(0);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i32(min);
}

static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_const_i64(0);
    tcg_gen_sub_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i64(min);
}

void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub8,
          .opc = INDEX_op_ussub_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub16,
          .opc = INDEX_op_ussub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_ussub_i32,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub32,
          .opc = INDEX_op_ussub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_ussub_i64,
          .fniv = tcg_gen_ussub_vec,
          .fno = gen_helper_gvec_ussub64,
          .opc = INDEX_op_ussub_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin8,
          .opc = INDEX_op_smin_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin16,
          .opc = INDEX_op_smin_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smin_i32,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin32,
          .opc = INDEX_op_smin_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smin_i64,
          .fniv = tcg_gen_smin_vec,
          .fno = gen_helper_gvec_smin64,
          .opc = INDEX_op_smin_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin8,
          .opc = INDEX_op_umin_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin16,
          .opc = INDEX_op_umin_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umin_i32,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin32,
          .opc = INDEX_op_umin_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umin_i64,
          .fniv = tcg_gen_umin_vec,
          .fno = gen_helper_gvec_umin64,
          .opc = INDEX_op_umin_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax8,
          .opc = INDEX_op_smax_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax16,
          .opc = INDEX_op_smax_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_smax_i32,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax32,
          .opc = INDEX_op_smax_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_smax_i64,
          .fniv = tcg_gen_smax_vec,
          .fno = gen_helper_gvec_smax64,
          .opc = INDEX_op_smax_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax8,
          .opc = INDEX_op_umax_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax16,
          .opc = INDEX_op_umax_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_umax_i32,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax32,
          .opc = INDEX_op_umax_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_umax_i64,
          .fniv = tcg_gen_umax_vec,
          .fno = gen_helper_gvec_umax64,
          .opc = INDEX_op_umax_vec,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

*/ 1923 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 1924 { 1925 TCGv_i64 t2 = tcg_temp_new_i64(); 1926 TCGv_i64 t3 = tcg_temp_new_i64(); 1927 1928 tcg_gen_andc_i64(t3, m, b); 1929 tcg_gen_andc_i64(t2, b, m); 1930 tcg_gen_sub_i64(d, m, t2); 1931 tcg_gen_xor_i64(d, d, t3); 1932 1933 tcg_temp_free_i64(t2); 1934 tcg_temp_free_i64(t3); 1935 } 1936 1937 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 1938 { 1939 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1940 gen_negv_mask(d, b, m); 1941 tcg_temp_free_i64(m); 1942 } 1943 1944 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 1945 { 1946 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1947 gen_negv_mask(d, b, m); 1948 tcg_temp_free_i64(m); 1949 } 1950 1951 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 1952 { 1953 TCGv_i64 t1 = tcg_temp_new_i64(); 1954 TCGv_i64 t2 = tcg_temp_new_i64(); 1955 1956 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1957 tcg_gen_neg_i64(t2, b); 1958 tcg_gen_neg_i64(t1, t1); 1959 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1960 1961 tcg_temp_free_i64(t1); 1962 tcg_temp_free_i64(t2); 1963 } 1964 1965 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 1966 uint32_t oprsz, uint32_t maxsz) 1967 { 1968 static const GVecGen2 g[4] = { 1969 { .fni8 = tcg_gen_vec_neg8_i64, 1970 .fniv = tcg_gen_neg_vec, 1971 .fno = gen_helper_gvec_neg8, 1972 .opc = INDEX_op_neg_vec, 1973 .vece = MO_8 }, 1974 { .fni8 = tcg_gen_vec_neg16_i64, 1975 .fniv = tcg_gen_neg_vec, 1976 .fno = gen_helper_gvec_neg16, 1977 .opc = INDEX_op_neg_vec, 1978 .vece = MO_16 }, 1979 { .fni4 = tcg_gen_neg_i32, 1980 .fniv = tcg_gen_neg_vec, 1981 .fno = gen_helper_gvec_neg32, 1982 .opc = INDEX_op_neg_vec, 1983 .vece = MO_32 }, 1984 { .fni8 = tcg_gen_neg_i64, 1985 .fniv = tcg_gen_neg_vec, 1986 .fno = gen_helper_gvec_neg64, 1987 .opc = INDEX_op_neg_vec, 1988 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1989 .vece = MO_64 }, 1990 }; 1991 1992 tcg_debug_assert(vece <= MO_64); 1993 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 1994 } 1995 1996 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, 1997 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1998 { 1999 static const GVecGen3 g = { 2000 .fni8 = tcg_gen_and_i64, 2001 .fniv = tcg_gen_and_vec, 2002 .fno = gen_helper_gvec_and, 2003 .opc = INDEX_op_and_vec, 2004 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2005 }; 2006 2007 if (aofs == bofs) { 2008 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2009 } else { 2010 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2011 } 2012 } 2013 2014 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 2015 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2016 { 2017 static const GVecGen3 g = { 2018 .fni8 = tcg_gen_or_i64, 2019 .fniv = tcg_gen_or_vec, 2020 .fno = gen_helper_gvec_or, 2021 .opc = INDEX_op_or_vec, 2022 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2023 }; 2024 2025 if (aofs == bofs) { 2026 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2027 } else { 2028 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2029 } 2030 } 2031 2032 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 2033 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2034 { 2035 static const GVecGen3 g = { 2036 .fni8 = tcg_gen_xor_i64, 2037 .fniv = tcg_gen_xor_vec, 2038 .fno = gen_helper_gvec_xor, 2039 .opc = INDEX_op_xor_vec, 2040 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2041 }; 2042 2043 if (aofs == bofs) { 2044 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2045 } else { 2046 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2047 } 
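    /* With aofs == bofs each lane computes x ^ x == 0, so the expansion
       reduces to filling the destination with zeroes.  The and/or cases
       above likewise degenerate to a plain move, and the andc, orc, nand,
       nor and eqv expanders below special-case aliased inputs with the
       analogous identities.  */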
2048 } 2049 2050 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 2051 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2052 { 2053 static const GVecGen3 g = { 2054 .fni8 = tcg_gen_andc_i64, 2055 .fniv = tcg_gen_andc_vec, 2056 .fno = gen_helper_gvec_andc, 2057 .opc = INDEX_op_andc_vec, 2058 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2059 }; 2060 2061 if (aofs == bofs) { 2062 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0); 2063 } else { 2064 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2065 } 2066 } 2067 2068 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 2069 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2070 { 2071 static const GVecGen3 g = { 2072 .fni8 = tcg_gen_orc_i64, 2073 .fniv = tcg_gen_orc_vec, 2074 .fno = gen_helper_gvec_orc, 2075 .opc = INDEX_op_orc_vec, 2076 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2077 }; 2078 2079 if (aofs == bofs) { 2080 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2081 } else { 2082 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2083 } 2084 } 2085 2086 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs, 2087 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2088 { 2089 static const GVecGen3 g = { 2090 .fni8 = tcg_gen_nand_i64, 2091 .fniv = tcg_gen_nand_vec, 2092 .fno = gen_helper_gvec_nand, 2093 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2094 }; 2095 2096 if (aofs == bofs) { 2097 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2098 } else { 2099 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2100 } 2101 } 2102 2103 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs, 2104 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2105 { 2106 static const GVecGen3 g = { 2107 .fni8 = tcg_gen_nor_i64, 2108 .fniv = tcg_gen_nor_vec, 2109 .fno = gen_helper_gvec_nor, 2110 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2111 }; 2112 2113 if (aofs == bofs) { 2114 tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz); 2115 } else { 2116 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2117 } 2118 } 2119 2120 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs, 2121 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 2122 { 2123 static const GVecGen3 g = { 2124 .fni8 = tcg_gen_eqv_i64, 2125 .fniv = tcg_gen_eqv_vec, 2126 .fno = gen_helper_gvec_eqv, 2127 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2128 }; 2129 2130 if (aofs == bofs) { 2131 tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1); 2132 } else { 2133 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 2134 } 2135 } 2136 2137 static const GVecGen2s gop_ands = { 2138 .fni8 = tcg_gen_and_i64, 2139 .fniv = tcg_gen_and_vec, 2140 .fno = gen_helper_gvec_ands, 2141 .opc = INDEX_op_and_vec, 2142 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2143 .vece = MO_64 2144 }; 2145 2146 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 2147 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2148 { 2149 TCGv_i64 tmp = tcg_temp_new_i64(); 2150 gen_dup_i64(vece, tmp, c); 2151 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2152 tcg_temp_free_i64(tmp); 2153 } 2154 2155 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 2156 int64_t c, uint32_t oprsz, uint32_t maxsz) 2157 { 2158 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2159 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 2160 tcg_temp_free_i64(tmp); 2161 } 2162 2163 static const GVecGen2s gop_xors = { 2164 .fni8 = tcg_gen_xor_i64, 2165 .fniv = tcg_gen_xor_vec, 2166 .fno = gen_helper_gvec_xors, 2167 .opc = INDEX_op_xor_vec, 2168 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2169 .vece = 
MO_64 2170 }; 2171 2172 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 2173 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2174 { 2175 TCGv_i64 tmp = tcg_temp_new_i64(); 2176 gen_dup_i64(vece, tmp, c); 2177 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2178 tcg_temp_free_i64(tmp); 2179 } 2180 2181 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 2182 int64_t c, uint32_t oprsz, uint32_t maxsz) 2183 { 2184 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2185 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 2186 tcg_temp_free_i64(tmp); 2187 } 2188 2189 static const GVecGen2s gop_ors = { 2190 .fni8 = tcg_gen_or_i64, 2191 .fniv = tcg_gen_or_vec, 2192 .fno = gen_helper_gvec_ors, 2193 .opc = INDEX_op_or_vec, 2194 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2195 .vece = MO_64 2196 }; 2197 2198 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, 2199 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 2200 { 2201 TCGv_i64 tmp = tcg_temp_new_i64(); 2202 gen_dup_i64(vece, tmp, c); 2203 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2204 tcg_temp_free_i64(tmp); 2205 } 2206 2207 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, 2208 int64_t c, uint32_t oprsz, uint32_t maxsz) 2209 { 2210 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 2211 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); 2212 tcg_temp_free_i64(tmp); 2213 } 2214 2215 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2216 { 2217 uint64_t mask = dup_const(MO_8, 0xff << c); 2218 tcg_gen_shli_i64(d, a, c); 2219 tcg_gen_andi_i64(d, d, mask); 2220 } 2221 2222 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2223 { 2224 uint64_t mask = dup_const(MO_16, 0xffff << c); 2225 tcg_gen_shli_i64(d, a, c); 2226 tcg_gen_andi_i64(d, d, mask); 2227 } 2228 2229 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, 2230 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2231 { 2232 static const GVecGen2i g[4] = { 2233 { .fni8 = tcg_gen_vec_shl8i_i64, 2234 .fniv = tcg_gen_shli_vec, 2235 .fno = gen_helper_gvec_shl8i, 2236 .opc = INDEX_op_shli_vec, 2237 .vece = MO_8 }, 2238 { .fni8 = tcg_gen_vec_shl16i_i64, 2239 .fniv = tcg_gen_shli_vec, 2240 .fno = gen_helper_gvec_shl16i, 2241 .opc = INDEX_op_shli_vec, 2242 .vece = MO_16 }, 2243 { .fni4 = tcg_gen_shli_i32, 2244 .fniv = tcg_gen_shli_vec, 2245 .fno = gen_helper_gvec_shl32i, 2246 .opc = INDEX_op_shli_vec, 2247 .vece = MO_32 }, 2248 { .fni8 = tcg_gen_shli_i64, 2249 .fniv = tcg_gen_shli_vec, 2250 .fno = gen_helper_gvec_shl64i, 2251 .opc = INDEX_op_shli_vec, 2252 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2253 .vece = MO_64 }, 2254 }; 2255 2256 tcg_debug_assert(vece <= MO_64); 2257 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2258 if (shift == 0) { 2259 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2260 } else { 2261 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2262 } 2263 } 2264 2265 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2266 { 2267 uint64_t mask = dup_const(MO_8, 0xff >> c); 2268 tcg_gen_shri_i64(d, a, c); 2269 tcg_gen_andi_i64(d, d, mask); 2270 } 2271 2272 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2273 { 2274 uint64_t mask = dup_const(MO_16, 0xffff >> c); 2275 tcg_gen_shri_i64(d, a, c); 2276 tcg_gen_andi_i64(d, d, mask); 2277 } 2278 2279 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, 2280 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2281 { 2282 static const GVecGen2i g[4] = { 2283 { 
.fni8 = tcg_gen_vec_shr8i_i64, 2284 .fniv = tcg_gen_shri_vec, 2285 .fno = gen_helper_gvec_shr8i, 2286 .opc = INDEX_op_shri_vec, 2287 .vece = MO_8 }, 2288 { .fni8 = tcg_gen_vec_shr16i_i64, 2289 .fniv = tcg_gen_shri_vec, 2290 .fno = gen_helper_gvec_shr16i, 2291 .opc = INDEX_op_shri_vec, 2292 .vece = MO_16 }, 2293 { .fni4 = tcg_gen_shri_i32, 2294 .fniv = tcg_gen_shri_vec, 2295 .fno = gen_helper_gvec_shr32i, 2296 .opc = INDEX_op_shri_vec, 2297 .vece = MO_32 }, 2298 { .fni8 = tcg_gen_shri_i64, 2299 .fniv = tcg_gen_shri_vec, 2300 .fno = gen_helper_gvec_shr64i, 2301 .opc = INDEX_op_shri_vec, 2302 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2303 .vece = MO_64 }, 2304 }; 2305 2306 tcg_debug_assert(vece <= MO_64); 2307 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2308 if (shift == 0) { 2309 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2310 } else { 2311 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2312 } 2313 } 2314 2315 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2316 { 2317 uint64_t s_mask = dup_const(MO_8, 0x80 >> c); 2318 uint64_t c_mask = dup_const(MO_8, 0xff >> c); 2319 TCGv_i64 s = tcg_temp_new_i64(); 2320 2321 tcg_gen_shri_i64(d, a, c); 2322 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2323 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2324 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2325 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2326 tcg_temp_free_i64(s); 2327 } 2328 2329 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) 2330 { 2331 uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); 2332 uint64_t c_mask = dup_const(MO_16, 0xffff >> c); 2333 TCGv_i64 s = tcg_temp_new_i64(); 2334 2335 tcg_gen_shri_i64(d, a, c); 2336 tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ 2337 tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ 2338 tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ 2339 tcg_gen_or_i64(d, d, s); /* include sign extension */ 2340 tcg_temp_free_i64(s); 2341 } 2342 2343 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, 2344 int64_t shift, uint32_t oprsz, uint32_t maxsz) 2345 { 2346 static const GVecGen2i g[4] = { 2347 { .fni8 = tcg_gen_vec_sar8i_i64, 2348 .fniv = tcg_gen_sari_vec, 2349 .fno = gen_helper_gvec_sar8i, 2350 .opc = INDEX_op_sari_vec, 2351 .vece = MO_8 }, 2352 { .fni8 = tcg_gen_vec_sar16i_i64, 2353 .fniv = tcg_gen_sari_vec, 2354 .fno = gen_helper_gvec_sar16i, 2355 .opc = INDEX_op_sari_vec, 2356 .vece = MO_16 }, 2357 { .fni4 = tcg_gen_sari_i32, 2358 .fniv = tcg_gen_sari_vec, 2359 .fno = gen_helper_gvec_sar32i, 2360 .opc = INDEX_op_sari_vec, 2361 .vece = MO_32 }, 2362 { .fni8 = tcg_gen_sari_i64, 2363 .fniv = tcg_gen_sari_vec, 2364 .fno = gen_helper_gvec_sar64i, 2365 .opc = INDEX_op_sari_vec, 2366 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 2367 .vece = MO_64 }, 2368 }; 2369 2370 tcg_debug_assert(vece <= MO_64); 2371 tcg_debug_assert(shift >= 0 && shift < (8 << vece)); 2372 if (shift == 0) { 2373 tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); 2374 } else { 2375 tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); 2376 } 2377 } 2378 2379 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
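   Each comparison is performed with setcond, which yields 0 or 1 per
   element; the negation converts that into the 0 / -1 result that the
   vector compare convention requires.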
*/ 2380 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, 2381 uint32_t oprsz, TCGCond cond) 2382 { 2383 TCGv_i32 t0 = tcg_temp_new_i32(); 2384 TCGv_i32 t1 = tcg_temp_new_i32(); 2385 uint32_t i; 2386 2387 for (i = 0; i < oprsz; i += 4) { 2388 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 2389 tcg_gen_ld_i32(t1, cpu_env, bofs + i); 2390 tcg_gen_setcond_i32(cond, t0, t0, t1); 2391 tcg_gen_neg_i32(t0, t0); 2392 tcg_gen_st_i32(t0, cpu_env, dofs + i); 2393 } 2394 tcg_temp_free_i32(t1); 2395 tcg_temp_free_i32(t0); 2396 } 2397 2398 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, 2399 uint32_t oprsz, TCGCond cond) 2400 { 2401 TCGv_i64 t0 = tcg_temp_new_i64(); 2402 TCGv_i64 t1 = tcg_temp_new_i64(); 2403 uint32_t i; 2404 2405 for (i = 0; i < oprsz; i += 8) { 2406 tcg_gen_ld_i64(t0, cpu_env, aofs + i); 2407 tcg_gen_ld_i64(t1, cpu_env, bofs + i); 2408 tcg_gen_setcond_i64(cond, t0, t0, t1); 2409 tcg_gen_neg_i64(t0, t0); 2410 tcg_gen_st_i64(t0, cpu_env, dofs + i); 2411 } 2412 tcg_temp_free_i64(t1); 2413 tcg_temp_free_i64(t0); 2414 } 2415 2416 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 2417 uint32_t bofs, uint32_t oprsz, uint32_t tysz, 2418 TCGType type, TCGCond cond) 2419 { 2420 TCGv_vec t0 = tcg_temp_new_vec(type); 2421 TCGv_vec t1 = tcg_temp_new_vec(type); 2422 uint32_t i; 2423 2424 for (i = 0; i < oprsz; i += tysz) { 2425 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 2426 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 2427 tcg_gen_cmp_vec(cond, vece, t0, t0, t1); 2428 tcg_gen_st_vec(t0, cpu_env, dofs + i); 2429 } 2430 tcg_temp_free_vec(t1); 2431 tcg_temp_free_vec(t0); 2432 } 2433 2434 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, 2435 uint32_t aofs, uint32_t bofs, 2436 uint32_t oprsz, uint32_t maxsz) 2437 { 2438 static gen_helper_gvec_3 * const eq_fn[4] = { 2439 gen_helper_gvec_eq8, gen_helper_gvec_eq16, 2440 gen_helper_gvec_eq32, gen_helper_gvec_eq64 2441 }; 2442 static gen_helper_gvec_3 * const ne_fn[4] = { 2443 gen_helper_gvec_ne8, gen_helper_gvec_ne16, 2444 gen_helper_gvec_ne32, gen_helper_gvec_ne64 2445 }; 2446 static gen_helper_gvec_3 * const lt_fn[4] = { 2447 gen_helper_gvec_lt8, gen_helper_gvec_lt16, 2448 gen_helper_gvec_lt32, gen_helper_gvec_lt64 2449 }; 2450 static gen_helper_gvec_3 * const le_fn[4] = { 2451 gen_helper_gvec_le8, gen_helper_gvec_le16, 2452 gen_helper_gvec_le32, gen_helper_gvec_le64 2453 }; 2454 static gen_helper_gvec_3 * const ltu_fn[4] = { 2455 gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, 2456 gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 2457 }; 2458 static gen_helper_gvec_3 * const leu_fn[4] = { 2459 gen_helper_gvec_leu8, gen_helper_gvec_leu16, 2460 gen_helper_gvec_leu32, gen_helper_gvec_leu64 2461 }; 2462 static gen_helper_gvec_3 * const * const fns[16] = { 2463 [TCG_COND_EQ] = eq_fn, 2464 [TCG_COND_NE] = ne_fn, 2465 [TCG_COND_LT] = lt_fn, 2466 [TCG_COND_LE] = le_fn, 2467 [TCG_COND_LTU] = ltu_fn, 2468 [TCG_COND_LEU] = leu_fn, 2469 }; 2470 TCGType type; 2471 uint32_t some; 2472 2473 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 2474 check_overlap_3(dofs, aofs, bofs, maxsz); 2475 2476 if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { 2477 do_dup(MO_8, dofs, oprsz, maxsz, 2478 NULL, NULL, -(cond == TCG_COND_ALWAYS)); 2479 return; 2480 } 2481 2482 /* Implement inline with a vector type, if possible. 2483 * Prefer integer when 64-bit host and 64-bit comparison. 
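 * When no vector type is usable, fall back to an inline i64 or i32
 * element loop for MO_64 and MO_32, and to the out-of-line helpers
 * otherwise.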
2484 */ 2485 type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz, 2486 TCG_TARGET_REG_BITS == 64 && vece == MO_64); 2487 switch (type) { 2488 case TCG_TYPE_V256: 2489 /* Recall that ARM SVE allows vector sizes that are not a 2490 * power of 2, but always a multiple of 16. The intent is 2491 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 2492 */ 2493 some = QEMU_ALIGN_DOWN(oprsz, 32); 2494 expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); 2495 if (some == oprsz) { 2496 break; 2497 } 2498 dofs += some; 2499 aofs += some; 2500 bofs += some; 2501 oprsz -= some; 2502 maxsz -= some; 2503 /* fallthru */ 2504 case TCG_TYPE_V128: 2505 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); 2506 break; 2507 case TCG_TYPE_V64: 2508 expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); 2509 break; 2510 2511 case 0: 2512 if (vece == MO_64 && check_size_impl(oprsz, 8)) { 2513 expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); 2514 } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { 2515 expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); 2516 } else { 2517 gen_helper_gvec_3 * const *fn = fns[cond]; 2518 2519 if (fn == NULL) { 2520 uint32_t tmp; 2521 tmp = aofs, aofs = bofs, bofs = tmp; 2522 cond = tcg_swap_cond(cond); 2523 fn = fns[cond]; 2524 assert(fn != NULL); 2525 } 2526 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); 2527 return; 2528 } 2529 break; 2530 2531 default: 2532 g_assert_not_reached(); 2533 } 2534 2535 if (oprsz < maxsz) { 2536 expand_clr(dofs + oprsz, maxsz - oprsz); 2537 } 2538 } 2539
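/*
 * A minimal usage sketch, not part of the original file: it assumes a
 * hypothetical target whose CPU state struct "CPUFooState" keeps 16-byte
 * vector registers in an array "vreg[]"; the struct, field and function
 * names below are illustrative only.  A front end expands a packed
 * instruction by handing cpu_env-relative offsets (which must satisfy the
 * size/alignment rules checked by check_size_align) and the operation and
 * register sizes to one of the expanders above.
 */
#if 0   /* illustrative sketch only */
static void gen_example_vector_ops(int vd, int vn, int vm)
{
    uint32_t dofs = offsetof(CPUFooState, vreg[vd]);
    uint32_t aofs = offsetof(CPUFooState, vreg[vn]);
    uint32_t bofs = offsetof(CPUFooState, vreg[vm]);

    /* Unsigned saturating add of sixteen 8-bit lanes. */
    tcg_gen_gvec_usadd(MO_8, dofs, aofs, bofs, 16, 16);

    /* Set each 32-bit lane of vd to -1 where vn < vm (signed), else 0. */
    tcg_gen_gvec_cmp(TCG_COND_LT, MO_32, dofs, aofs, bofs, 16, 16);

    /* Shift each 16-bit lane of vn left by 3, writing vd. */
    tcg_gen_gvec_shli(MO_16, dofs, aofs, 3, 16, 16);
}
#endif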