1 /* 2 * Generic vector operation expansion 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "qemu-common.h" 22 #include "tcg.h" 23 #include "tcg-op.h" 24 #include "tcg-op-gvec.h" 25 #include "tcg-gvec-desc.h" 26 27 #define MAX_UNROLL 4 28 29 /* Verify vector size and alignment rules. OFS should be the OR of all 30 of the operand offsets so that we can check them all at once. */ 31 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) 32 { 33 uint32_t opr_align = oprsz >= 16 ? 15 : 7; 34 uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; 35 tcg_debug_assert(oprsz > 0); 36 tcg_debug_assert(oprsz <= maxsz); 37 tcg_debug_assert((oprsz & opr_align) == 0); 38 tcg_debug_assert((maxsz & max_align) == 0); 39 tcg_debug_assert((ofs & max_align) == 0); 40 } 41 42 /* Verify vector overlap rules for two operands. */ 43 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) 44 { 45 tcg_debug_assert(d == a || d + s <= a || a + s <= d); 46 } 47 48 /* Verify vector overlap rules for three operands. */ 49 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) 50 { 51 check_overlap_2(d, a, s); 52 check_overlap_2(d, b, s); 53 check_overlap_2(a, b, s); 54 } 55 56 /* Verify vector overlap rules for four operands. */ 57 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, 58 uint32_t c, uint32_t s) 59 { 60 check_overlap_2(d, a, s); 61 check_overlap_2(d, b, s); 62 check_overlap_2(d, c, s); 63 check_overlap_2(a, b, s); 64 check_overlap_2(a, c, s); 65 check_overlap_2(b, c, s); 66 } 67 68 /* Create a descriptor from components. */ 69 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) 70 { 71 uint32_t desc = 0; 72 73 assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS)); 74 assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS)); 75 assert(data == sextract32(data, 0, SIMD_DATA_BITS)); 76 77 oprsz = (oprsz / 8) - 1; 78 maxsz = (maxsz / 8) - 1; 79 desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); 80 desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); 81 desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); 82 83 return desc; 84 } 85 86 /* Generate a call to a gvec-style helper with two vector operands. */ 87 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, 88 uint32_t oprsz, uint32_t maxsz, int32_t data, 89 gen_helper_gvec_2 *fn) 90 { 91 TCGv_ptr a0, a1; 92 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 93 94 a0 = tcg_temp_new_ptr(); 95 a1 = tcg_temp_new_ptr(); 96 97 tcg_gen_addi_ptr(a0, cpu_env, dofs); 98 tcg_gen_addi_ptr(a1, cpu_env, aofs); 99 100 fn(a0, a1, desc); 101 102 tcg_temp_free_ptr(a0); 103 tcg_temp_free_ptr(a1); 104 tcg_temp_free_i32(desc); 105 } 106 107 /* Generate a call to a gvec-style helper with two vector operands 108 and one scalar operand. 
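   As an illustration (the helper name is hypothetical, not one defined
   by this file): a target that wants to combine every element of a
   vector with a run-time 64-bit scalar could emit

       tcg_gen_gvec_2i_ool(dofs, aofs, c, 16, 16, 0, gen_helper_foo);

   The helper receives env + dofs, env + aofs, the scalar C, and the
   32-bit descriptor built by simd_desc() above; for oprsz == maxsz == 16
   and data == 0 that descriptor carries (16 / 8) - 1 == 1 in both the
   OPRSZ and MAXSZ fields and 0 in DATA.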
 */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.
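   The pointer is forwarded to the helper untouched, so it can address
   whatever extra state the helper needs beyond the two vectors; a
   typical use (sketched here with a hypothetical helper name) is
   passing a pointer to per-CPU data such as a float_status block:

       tcg_gen_gvec_2_ptr(dofs, aofs, fpst, oprsz, maxsz, 0,
                          gen_helper_bar);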
*/ 212 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, 213 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 214 int32_t data, gen_helper_gvec_2_ptr *fn) 215 { 216 TCGv_ptr a0, a1; 217 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 218 219 a0 = tcg_temp_new_ptr(); 220 a1 = tcg_temp_new_ptr(); 221 222 tcg_gen_addi_ptr(a0, cpu_env, dofs); 223 tcg_gen_addi_ptr(a1, cpu_env, aofs); 224 225 fn(a0, a1, ptr, desc); 226 227 tcg_temp_free_ptr(a0); 228 tcg_temp_free_ptr(a1); 229 tcg_temp_free_i32(desc); 230 } 231 232 /* Generate a call to a gvec-style helper with three vector operands 233 and an extra pointer operand. */ 234 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 235 TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, 236 int32_t data, gen_helper_gvec_3_ptr *fn) 237 { 238 TCGv_ptr a0, a1, a2; 239 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 240 241 a0 = tcg_temp_new_ptr(); 242 a1 = tcg_temp_new_ptr(); 243 a2 = tcg_temp_new_ptr(); 244 245 tcg_gen_addi_ptr(a0, cpu_env, dofs); 246 tcg_gen_addi_ptr(a1, cpu_env, aofs); 247 tcg_gen_addi_ptr(a2, cpu_env, bofs); 248 249 fn(a0, a1, a2, ptr, desc); 250 251 tcg_temp_free_ptr(a0); 252 tcg_temp_free_ptr(a1); 253 tcg_temp_free_ptr(a2); 254 tcg_temp_free_i32(desc); 255 } 256 257 /* Generate a call to a gvec-style helper with four vector operands 258 and an extra pointer operand. */ 259 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, 260 uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, 261 uint32_t maxsz, int32_t data, 262 gen_helper_gvec_4_ptr *fn) 263 { 264 TCGv_ptr a0, a1, a2, a3; 265 TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); 266 267 a0 = tcg_temp_new_ptr(); 268 a1 = tcg_temp_new_ptr(); 269 a2 = tcg_temp_new_ptr(); 270 a3 = tcg_temp_new_ptr(); 271 272 tcg_gen_addi_ptr(a0, cpu_env, dofs); 273 tcg_gen_addi_ptr(a1, cpu_env, aofs); 274 tcg_gen_addi_ptr(a2, cpu_env, bofs); 275 tcg_gen_addi_ptr(a3, cpu_env, cofs); 276 277 fn(a0, a1, a2, a3, ptr, desc); 278 279 tcg_temp_free_ptr(a0); 280 tcg_temp_free_ptr(a1); 281 tcg_temp_free_ptr(a2); 282 tcg_temp_free_ptr(a3); 283 tcg_temp_free_i32(desc); 284 } 285 286 /* Return true if we want to implement something of OPRSZ bytes 287 in units of LNSZ. This limits the expansion of inline code. */ 288 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) 289 { 290 uint32_t lnct = oprsz / lnsz; 291 return lnct >= 1 && lnct <= MAX_UNROLL; 292 } 293 294 static void expand_clr(uint32_t dofs, uint32_t maxsz); 295 296 /* Duplicate C as per VECE. */ 297 uint64_t (dup_const)(unsigned vece, uint64_t c) 298 { 299 switch (vece) { 300 case MO_8: 301 return 0x0101010101010101ull * (uint8_t)c; 302 case MO_16: 303 return 0x0001000100010001ull * (uint16_t)c; 304 case MO_32: 305 return 0x0000000100000001ull * (uint32_t)c; 306 case MO_64: 307 return c; 308 default: 309 g_assert_not_reached(); 310 } 311 } 312 313 /* Duplicate IN into OUT as per VECE. 
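   For example, with VECE == MO_8 and IN == 0x2a the 32-bit form below
   zero-extends and multiplies by 0x01010101, giving OUT == 0x2a2a2a2a;
   with VECE == MO_16 a single deposit packs IN:IN into the two
   halfwords.  The 64-bit form does the same across eight, four or two
   lanes.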
 */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
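     * As a sketch of how this plays out, assuming a host with 256-bit
     * and 128-bit vectors: oprsz == 80 selects TCG_TYPE_V256 and the
     * store loop below issues two 32-byte stores plus one 16-byte
     * store.  With no vector support at all, oprsz == 16 on a 64-bit
     * host falls through to the integer path and becomes two 8-byte
     * stores of the replicated value.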
417 */ 418 type = choose_vector_type(0, vece, oprsz, 419 (TCG_TARGET_REG_BITS == 64 && in_32 == NULL 420 && (in_64 == NULL || vece == MO_64))); 421 if (type != 0) { 422 TCGv_vec t_vec = tcg_temp_new_vec(type); 423 424 if (in_32) { 425 tcg_gen_dup_i32_vec(vece, t_vec, in_32); 426 } else if (in_64) { 427 tcg_gen_dup_i64_vec(vece, t_vec, in_64); 428 } else { 429 switch (vece) { 430 case MO_8: 431 tcg_gen_dup8i_vec(t_vec, in_c); 432 break; 433 case MO_16: 434 tcg_gen_dup16i_vec(t_vec, in_c); 435 break; 436 case MO_32: 437 tcg_gen_dup32i_vec(t_vec, in_c); 438 break; 439 default: 440 tcg_gen_dup64i_vec(t_vec, in_c); 441 break; 442 } 443 } 444 445 i = 0; 446 switch (type) { 447 case TCG_TYPE_V256: 448 /* Recall that ARM SVE allows vector sizes that are not a 449 * power of 2, but always a multiple of 16. The intent is 450 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 451 */ 452 for (; i + 32 <= oprsz; i += 32) { 453 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); 454 } 455 /* fallthru */ 456 case TCG_TYPE_V128: 457 for (; i + 16 <= oprsz; i += 16) { 458 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); 459 } 460 break; 461 case TCG_TYPE_V64: 462 for (; i < oprsz; i += 8) { 463 tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); 464 } 465 break; 466 default: 467 g_assert_not_reached(); 468 } 469 470 tcg_temp_free_vec(t_vec); 471 goto done; 472 } 473 474 /* Otherwise, inline with an integer type, unless "large". */ 475 if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { 476 t_64 = NULL; 477 t_32 = NULL; 478 479 if (in_32) { 480 /* We are given a 32-bit variable input. For a 64-bit host, 481 use a 64-bit operation unless the 32-bit operation would 482 be simple enough. */ 483 if (TCG_TARGET_REG_BITS == 64 484 && (vece != MO_32 || !check_size_impl(oprsz, 4))) { 485 t_64 = tcg_temp_new_i64(); 486 tcg_gen_extu_i32_i64(t_64, in_32); 487 gen_dup_i64(vece, t_64, t_64); 488 } else { 489 t_32 = tcg_temp_new_i32(); 490 gen_dup_i32(vece, t_32, in_32); 491 } 492 } else if (in_64) { 493 /* We are given a 64-bit variable input. */ 494 t_64 = tcg_temp_new_i64(); 495 gen_dup_i64(vece, t_64, in_64); 496 } else { 497 /* We are given a constant input. */ 498 /* For 64-bit hosts, use 64-bit constants for "simple" constants 499 or when we'd need too many 32-bit stores, or when a 64-bit 500 constant is really required. */ 501 if (vece == MO_64 502 || (TCG_TARGET_REG_BITS == 64 503 && (in_c == 0 || in_c == -1 504 || !check_size_impl(oprsz, 4)))) { 505 t_64 = tcg_const_i64(in_c); 506 } else { 507 t_32 = tcg_const_i32(in_c); 508 } 509 } 510 511 /* Implement inline if we picked an implementation size above. */ 512 if (t_32) { 513 for (i = 0; i < oprsz; i += 4) { 514 tcg_gen_st_i32(t_32, cpu_env, dofs + i); 515 } 516 tcg_temp_free_i32(t_32); 517 goto done; 518 } 519 if (t_64) { 520 for (i = 0; i < oprsz; i += 8) { 521 tcg_gen_st_i64(t_64, cpu_env, dofs + i); 522 } 523 tcg_temp_free_i64(t_64); 524 goto done; 525 } 526 } 527 528 /* Otherwise implement out of line. 
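       For instance, a MO_16 constant duplicated over 256 bytes that
       reaches this point is emitted as a single call: the code below
       masks the constant to 16 bits, builds simd_desc(256, 256, 0),
       and hands both to gen_helper_gvec_dup16 together with env + dofs.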
*/ 529 t_ptr = tcg_temp_new_ptr(); 530 tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); 531 t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); 532 533 if (vece == MO_64) { 534 if (in_64) { 535 gen_helper_gvec_dup64(t_ptr, t_desc, in_64); 536 } else { 537 t_64 = tcg_const_i64(in_c); 538 gen_helper_gvec_dup64(t_ptr, t_desc, t_64); 539 tcg_temp_free_i64(t_64); 540 } 541 } else { 542 typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); 543 static dup_fn * const fns[3] = { 544 gen_helper_gvec_dup8, 545 gen_helper_gvec_dup16, 546 gen_helper_gvec_dup32 547 }; 548 549 if (in_32) { 550 fns[vece](t_ptr, t_desc, in_32); 551 } else { 552 t_32 = tcg_temp_new_i32(); 553 if (in_64) { 554 tcg_gen_extrl_i64_i32(t_32, in_64); 555 } else if (vece == MO_8) { 556 tcg_gen_movi_i32(t_32, in_c & 0xff); 557 } else if (vece == MO_16) { 558 tcg_gen_movi_i32(t_32, in_c & 0xffff); 559 } else { 560 tcg_gen_movi_i32(t_32, in_c); 561 } 562 fns[vece](t_ptr, t_desc, t_32); 563 tcg_temp_free_i32(t_32); 564 } 565 } 566 567 tcg_temp_free_ptr(t_ptr); 568 tcg_temp_free_i32(t_desc); 569 return; 570 571 done: 572 if (oprsz < maxsz) { 573 expand_clr(dofs + oprsz, maxsz - oprsz); 574 } 575 } 576 577 /* Likewise, but with zero. */ 578 static void expand_clr(uint32_t dofs, uint32_t maxsz) 579 { 580 do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); 581 } 582 583 /* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ 584 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 585 void (*fni)(TCGv_i32, TCGv_i32)) 586 { 587 TCGv_i32 t0 = tcg_temp_new_i32(); 588 uint32_t i; 589 590 for (i = 0; i < oprsz; i += 4) { 591 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 592 fni(t0, t0); 593 tcg_gen_st_i32(t0, cpu_env, dofs + i); 594 } 595 tcg_temp_free_i32(t0); 596 } 597 598 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 599 int32_t c, bool load_dest, 600 void (*fni)(TCGv_i32, TCGv_i32, int32_t)) 601 { 602 TCGv_i32 t0 = tcg_temp_new_i32(); 603 TCGv_i32 t1 = tcg_temp_new_i32(); 604 uint32_t i; 605 606 for (i = 0; i < oprsz; i += 4) { 607 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 608 if (load_dest) { 609 tcg_gen_ld_i32(t1, cpu_env, dofs + i); 610 } 611 fni(t1, t0, c); 612 tcg_gen_st_i32(t1, cpu_env, dofs + i); 613 } 614 tcg_temp_free_i32(t0); 615 tcg_temp_free_i32(t1); 616 } 617 618 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 619 TCGv_i32 c, bool scalar_first, 620 void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) 621 { 622 TCGv_i32 t0 = tcg_temp_new_i32(); 623 TCGv_i32 t1 = tcg_temp_new_i32(); 624 uint32_t i; 625 626 for (i = 0; i < oprsz; i += 4) { 627 tcg_gen_ld_i32(t0, cpu_env, aofs + i); 628 if (scalar_first) { 629 fni(t1, c, t0); 630 } else { 631 fni(t1, t0, c); 632 } 633 tcg_gen_st_i32(t1, cpu_env, dofs + i); 634 } 635 tcg_temp_free_i32(t0); 636 tcg_temp_free_i32(t1); 637 } 638 639 /* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
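   For oprsz == 16 this unrolls into four rounds of: load a word of A,
   load a word of B, apply FNI, store the resulting word of D.
   LOAD_DEST additionally reloads the old destination word first, for
   operations that accumulate into D instead of overwriting it.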
 */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using i64 elements.
 */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPSZ bytes worth of three-operand operations using host vectors.
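   TYSZ is the width in bytes of the chosen vector type, so e.g.
   TCG_TYPE_V128 with tysz == 16 covers oprsz == 32 in two
   load/load/op/store rounds; the callers below pass 32, 16 or 8 to
   match V256, V128 or V64.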
*/ 853 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 854 uint32_t bofs, uint32_t oprsz, 855 uint32_t tysz, TCGType type, bool load_dest, 856 void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) 857 { 858 TCGv_vec t0 = tcg_temp_new_vec(type); 859 TCGv_vec t1 = tcg_temp_new_vec(type); 860 TCGv_vec t2 = tcg_temp_new_vec(type); 861 uint32_t i; 862 863 for (i = 0; i < oprsz; i += tysz) { 864 tcg_gen_ld_vec(t0, cpu_env, aofs + i); 865 tcg_gen_ld_vec(t1, cpu_env, bofs + i); 866 if (load_dest) { 867 tcg_gen_ld_vec(t2, cpu_env, dofs + i); 868 } 869 fni(vece, t2, t0, t1); 870 tcg_gen_st_vec(t2, cpu_env, dofs + i); 871 } 872 tcg_temp_free_vec(t2); 873 tcg_temp_free_vec(t1); 874 tcg_temp_free_vec(t0); 875 } 876 877 /* Expand OPSZ bytes worth of four-operand operations using host vectors. */ 878 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, 879 uint32_t bofs, uint32_t cofs, uint32_t oprsz, 880 uint32_t tysz, TCGType type, 881 void (*fni)(unsigned, TCGv_vec, TCGv_vec, 882 TCGv_vec, TCGv_vec)) 883 { 884 TCGv_vec t0 = tcg_temp_new_vec(type); 885 TCGv_vec t1 = tcg_temp_new_vec(type); 886 TCGv_vec t2 = tcg_temp_new_vec(type); 887 TCGv_vec t3 = tcg_temp_new_vec(type); 888 uint32_t i; 889 890 for (i = 0; i < oprsz; i += tysz) { 891 tcg_gen_ld_vec(t1, cpu_env, aofs + i); 892 tcg_gen_ld_vec(t2, cpu_env, bofs + i); 893 tcg_gen_ld_vec(t3, cpu_env, cofs + i); 894 fni(vece, t0, t1, t2, t3); 895 tcg_gen_st_vec(t0, cpu_env, dofs + i); 896 } 897 tcg_temp_free_vec(t3); 898 tcg_temp_free_vec(t2); 899 tcg_temp_free_vec(t1); 900 tcg_temp_free_vec(t0); 901 } 902 903 /* Expand a vector two-operand operation. */ 904 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, 905 uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) 906 { 907 TCGType type; 908 uint32_t some; 909 910 check_size_align(oprsz, maxsz, dofs | aofs); 911 check_overlap_2(dofs, aofs, maxsz); 912 913 type = 0; 914 if (g->fniv) { 915 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64); 916 } 917 switch (type) { 918 case TCG_TYPE_V256: 919 /* Recall that ARM SVE allows vector sizes that are not a 920 * power of 2, but always a multiple of 16. The intent is 921 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 922 */ 923 some = QEMU_ALIGN_DOWN(oprsz, 32); 924 expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); 925 if (some == oprsz) { 926 break; 927 } 928 dofs += some; 929 aofs += some; 930 oprsz -= some; 931 maxsz -= some; 932 /* fallthru */ 933 case TCG_TYPE_V128: 934 expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); 935 break; 936 case TCG_TYPE_V64: 937 expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); 938 break; 939 940 case 0: 941 if (g->fni8 && check_size_impl(oprsz, 8)) { 942 expand_2_i64(dofs, aofs, oprsz, g->fni8); 943 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 944 expand_2_i32(dofs, aofs, oprsz, g->fni4); 945 } else { 946 assert(g->fno != NULL); 947 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); 948 return; 949 } 950 break; 951 952 default: 953 g_assert_not_reached(); 954 } 955 956 if (oprsz < maxsz) { 957 expand_clr(dofs + oprsz, maxsz - oprsz); 958 } 959 } 960 961 /* Expand a vector operation with two vectors and an immediate. 
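   The immediate C is handed to the per-element generators as a plain
   int64_t; compare tcg_gen_gvec_2s below, which takes the scalar in a
   TCGv_i64 instead.  The shift-by-immediate expanders further down
   (tcg_gen_gvec_shli and friends) are built on this entry point,
   passing the shift count as C.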
*/ 962 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 963 uint32_t maxsz, int64_t c, const GVecGen2i *g) 964 { 965 TCGType type; 966 uint32_t some; 967 968 check_size_align(oprsz, maxsz, dofs | aofs); 969 check_overlap_2(dofs, aofs, maxsz); 970 971 type = 0; 972 if (g->fniv) { 973 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64); 974 } 975 switch (type) { 976 case TCG_TYPE_V256: 977 /* Recall that ARM SVE allows vector sizes that are not a 978 * power of 2, but always a multiple of 16. The intent is 979 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 980 */ 981 some = QEMU_ALIGN_DOWN(oprsz, 32); 982 expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 983 c, g->load_dest, g->fniv); 984 if (some == oprsz) { 985 break; 986 } 987 dofs += some; 988 aofs += some; 989 oprsz -= some; 990 maxsz -= some; 991 /* fallthru */ 992 case TCG_TYPE_V128: 993 expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 994 c, g->load_dest, g->fniv); 995 break; 996 case TCG_TYPE_V64: 997 expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 998 c, g->load_dest, g->fniv); 999 break; 1000 1001 case 0: 1002 if (g->fni8 && check_size_impl(oprsz, 8)) { 1003 expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); 1004 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1005 expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); 1006 } else { 1007 if (g->fno) { 1008 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); 1009 } else { 1010 TCGv_i64 tcg_c = tcg_const_i64(c); 1011 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, 1012 maxsz, c, g->fnoi); 1013 tcg_temp_free_i64(tcg_c); 1014 } 1015 return; 1016 } 1017 break; 1018 1019 default: 1020 g_assert_not_reached(); 1021 } 1022 1023 if (oprsz < maxsz) { 1024 expand_clr(dofs + oprsz, maxsz - oprsz); 1025 } 1026 } 1027 1028 /* Expand a vector operation with two vectors and a scalar. */ 1029 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, 1030 uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) 1031 { 1032 TCGType type; 1033 1034 check_size_align(oprsz, maxsz, dofs | aofs); 1035 check_overlap_2(dofs, aofs, maxsz); 1036 1037 type = 0; 1038 if (g->fniv) { 1039 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64); 1040 } 1041 if (type != 0) { 1042 TCGv_vec t_vec = tcg_temp_new_vec(type); 1043 uint32_t some; 1044 1045 tcg_gen_dup_i64_vec(g->vece, t_vec, c); 1046 1047 switch (type) { 1048 case TCG_TYPE_V256: 1049 /* Recall that ARM SVE allows vector sizes that are not a 1050 * power of 2, but always a multiple of 16. The intent is 1051 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 
1052 */ 1053 some = QEMU_ALIGN_DOWN(oprsz, 32); 1054 expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, 1055 t_vec, g->scalar_first, g->fniv); 1056 if (some == oprsz) { 1057 break; 1058 } 1059 dofs += some; 1060 aofs += some; 1061 oprsz -= some; 1062 maxsz -= some; 1063 /* fallthru */ 1064 1065 case TCG_TYPE_V128: 1066 expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, 1067 t_vec, g->scalar_first, g->fniv); 1068 break; 1069 1070 case TCG_TYPE_V64: 1071 expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, 1072 t_vec, g->scalar_first, g->fniv); 1073 break; 1074 1075 default: 1076 g_assert_not_reached(); 1077 } 1078 tcg_temp_free_vec(t_vec); 1079 } else if (g->fni8 && check_size_impl(oprsz, 8)) { 1080 TCGv_i64 t64 = tcg_temp_new_i64(); 1081 1082 gen_dup_i64(g->vece, t64, c); 1083 expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); 1084 tcg_temp_free_i64(t64); 1085 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1086 TCGv_i32 t32 = tcg_temp_new_i32(); 1087 1088 tcg_gen_extrl_i64_i32(t32, c); 1089 gen_dup_i32(g->vece, t32, t32); 1090 expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); 1091 tcg_temp_free_i32(t32); 1092 } else { 1093 tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); 1094 return; 1095 } 1096 1097 if (oprsz < maxsz) { 1098 expand_clr(dofs + oprsz, maxsz - oprsz); 1099 } 1100 } 1101 1102 /* Expand a vector three-operand operation. */ 1103 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, 1104 uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) 1105 { 1106 TCGType type; 1107 uint32_t some; 1108 1109 check_size_align(oprsz, maxsz, dofs | aofs | bofs); 1110 check_overlap_3(dofs, aofs, bofs, maxsz); 1111 1112 type = 0; 1113 if (g->fniv) { 1114 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64); 1115 } 1116 switch (type) { 1117 case TCG_TYPE_V256: 1118 /* Recall that ARM SVE allows vector sizes that are not a 1119 * power of 2, but always a multiple of 16. The intent is 1120 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1121 */ 1122 some = QEMU_ALIGN_DOWN(oprsz, 32); 1123 expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, 1124 g->load_dest, g->fniv); 1125 if (some == oprsz) { 1126 break; 1127 } 1128 dofs += some; 1129 aofs += some; 1130 bofs += some; 1131 oprsz -= some; 1132 maxsz -= some; 1133 /* fallthru */ 1134 case TCG_TYPE_V128: 1135 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, 1136 g->load_dest, g->fniv); 1137 break; 1138 case TCG_TYPE_V64: 1139 expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, 1140 g->load_dest, g->fniv); 1141 break; 1142 1143 case 0: 1144 if (g->fni8 && check_size_impl(oprsz, 8)) { 1145 expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); 1146 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1147 expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); 1148 } else { 1149 assert(g->fno != NULL); 1150 tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, 1151 maxsz, g->data, g->fno); 1152 return; 1153 } 1154 break; 1155 1156 default: 1157 g_assert_not_reached(); 1158 } 1159 1160 if (oprsz < maxsz) { 1161 expand_clr(dofs + oprsz, maxsz - oprsz); 1162 } 1163 } 1164 1165 /* Expand a vector four-operand operation. 
*/ 1166 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, 1167 uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) 1168 { 1169 TCGType type; 1170 uint32_t some; 1171 1172 check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); 1173 check_overlap_4(dofs, aofs, bofs, cofs, maxsz); 1174 1175 type = 0; 1176 if (g->fniv) { 1177 type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64); 1178 } 1179 switch (type) { 1180 case TCG_TYPE_V256: 1181 /* Recall that ARM SVE allows vector sizes that are not a 1182 * power of 2, but always a multiple of 16. The intent is 1183 * that e.g. size == 80 would be expanded with 2x32 + 1x16. 1184 */ 1185 some = QEMU_ALIGN_DOWN(oprsz, 32); 1186 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, 1187 32, TCG_TYPE_V256, g->fniv); 1188 if (some == oprsz) { 1189 break; 1190 } 1191 dofs += some; 1192 aofs += some; 1193 bofs += some; 1194 cofs += some; 1195 oprsz -= some; 1196 maxsz -= some; 1197 /* fallthru */ 1198 case TCG_TYPE_V128: 1199 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1200 16, TCG_TYPE_V128, g->fniv); 1201 break; 1202 case TCG_TYPE_V64: 1203 expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, 1204 8, TCG_TYPE_V64, g->fniv); 1205 break; 1206 1207 case 0: 1208 if (g->fni8 && check_size_impl(oprsz, 8)) { 1209 expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8); 1210 } else if (g->fni4 && check_size_impl(oprsz, 4)) { 1211 expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4); 1212 } else { 1213 assert(g->fno != NULL); 1214 tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, 1215 oprsz, maxsz, g->data, g->fno); 1216 return; 1217 } 1218 break; 1219 1220 default: 1221 g_assert_not_reached(); 1222 } 1223 1224 if (oprsz < maxsz) { 1225 expand_clr(dofs + oprsz, maxsz - oprsz); 1226 } 1227 } 1228 1229 /* 1230 * Expand specific vector operations. 
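 *
 * Each operation is driven by one GVecGen descriptor per element size.
 * A minimal two-operand example, with hypothetical generator and
 * opcode names standing in for real ones, would look like:
 *
 *     static const GVecGen2 g = {
 *         .fni8 = gen_foo_i64,
 *         .fniv = gen_foo_vec,
 *         .fno = gen_helper_gvec_foo,
 *         .opc = INDEX_op_foo_vec,
 *         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
 *         .vece = MO_64,
 *     };
 *     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
 *
 * tcg_gen_gvec_2 above then picks .fniv, .fni8 or .fno depending on
 * what the host supports for the requested operand size.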
1231 */ 1232 1233 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) 1234 { 1235 tcg_gen_mov_vec(a, b); 1236 } 1237 1238 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, 1239 uint32_t oprsz, uint32_t maxsz) 1240 { 1241 static const GVecGen2 g = { 1242 .fni8 = tcg_gen_mov_i64, 1243 .fniv = vec_mov2, 1244 .fno = gen_helper_gvec_mov, 1245 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1246 }; 1247 if (dofs != aofs) { 1248 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1249 } else { 1250 check_size_align(oprsz, maxsz, dofs); 1251 if (oprsz < maxsz) { 1252 expand_clr(dofs + oprsz, maxsz - oprsz); 1253 } 1254 } 1255 } 1256 1257 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, 1258 uint32_t maxsz, TCGv_i32 in) 1259 { 1260 check_size_align(oprsz, maxsz, dofs); 1261 tcg_debug_assert(vece <= MO_32); 1262 do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); 1263 } 1264 1265 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, 1266 uint32_t maxsz, TCGv_i64 in) 1267 { 1268 check_size_align(oprsz, maxsz, dofs); 1269 tcg_debug_assert(vece <= MO_64); 1270 do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); 1271 } 1272 1273 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, 1274 uint32_t oprsz, uint32_t maxsz) 1275 { 1276 if (vece <= MO_32) { 1277 TCGv_i32 in = tcg_temp_new_i32(); 1278 switch (vece) { 1279 case MO_8: 1280 tcg_gen_ld8u_i32(in, cpu_env, aofs); 1281 break; 1282 case MO_16: 1283 tcg_gen_ld16u_i32(in, cpu_env, aofs); 1284 break; 1285 case MO_32: 1286 tcg_gen_ld_i32(in, cpu_env, aofs); 1287 break; 1288 } 1289 tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in); 1290 tcg_temp_free_i32(in); 1291 } else if (vece == MO_64) { 1292 TCGv_i64 in = tcg_temp_new_i64(); 1293 tcg_gen_ld_i64(in, cpu_env, aofs); 1294 tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in); 1295 tcg_temp_free_i64(in); 1296 } else { 1297 /* 128-bit duplicate. */ 1298 /* ??? Dup to 256-bit vector. 
*/ 1299 int i; 1300 1301 tcg_debug_assert(vece == 4); 1302 tcg_debug_assert(oprsz >= 16); 1303 if (TCG_TARGET_HAS_v128) { 1304 TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); 1305 1306 tcg_gen_ld_vec(in, cpu_env, aofs); 1307 for (i = 0; i < oprsz; i += 16) { 1308 tcg_gen_st_vec(in, cpu_env, dofs + i); 1309 } 1310 tcg_temp_free_vec(in); 1311 } else { 1312 TCGv_i64 in0 = tcg_temp_new_i64(); 1313 TCGv_i64 in1 = tcg_temp_new_i64(); 1314 1315 tcg_gen_ld_i64(in0, cpu_env, aofs); 1316 tcg_gen_ld_i64(in1, cpu_env, aofs + 8); 1317 for (i = 0; i < oprsz; i += 16) { 1318 tcg_gen_st_i64(in0, cpu_env, dofs + i); 1319 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); 1320 } 1321 tcg_temp_free_i64(in0); 1322 tcg_temp_free_i64(in1); 1323 } 1324 } 1325 } 1326 1327 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz, 1328 uint32_t maxsz, uint64_t x) 1329 { 1330 check_size_align(oprsz, maxsz, dofs); 1331 do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x); 1332 } 1333 1334 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz, 1335 uint32_t maxsz, uint32_t x) 1336 { 1337 check_size_align(oprsz, maxsz, dofs); 1338 do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x); 1339 } 1340 1341 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz, 1342 uint32_t maxsz, uint16_t x) 1343 { 1344 check_size_align(oprsz, maxsz, dofs); 1345 do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x); 1346 } 1347 1348 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz, 1349 uint32_t maxsz, uint8_t x) 1350 { 1351 check_size_align(oprsz, maxsz, dofs); 1352 do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x); 1353 } 1354 1355 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, 1356 uint32_t oprsz, uint32_t maxsz) 1357 { 1358 static const GVecGen2 g = { 1359 .fni8 = tcg_gen_not_i64, 1360 .fniv = tcg_gen_not_vec, 1361 .fno = gen_helper_gvec_not, 1362 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1363 }; 1364 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); 1365 } 1366 1367 /* Perform a vector addition using normal addition and a mask. The mask 1368 should be the sign bit of each lane. This 6-operation form is more 1369 efficient than separate additions when there are 4 or more lanes in 1370 the 64-bit operation. 
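   In other words d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m): clearing
   each lane's sign bit before the add keeps carries from crossing into
   the next lane, and the final xor rebuilds each lane's true top bit
   from the operands' top bits plus the carry already present in the
   partial sum.  A one-byte check with m == 0x80, a == 0xff, b == 0x01:
   the masked add gives 0x7f + 0x01 == 0x80, (a ^ b) & m == 0x80, and
   0x80 ^ 0x80 == 0x00, i.e. (0xff + 0x01) mod 0x100 with no carry
   leaking into the neighbouring byte.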
*/ 1371 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1372 { 1373 TCGv_i64 t1 = tcg_temp_new_i64(); 1374 TCGv_i64 t2 = tcg_temp_new_i64(); 1375 TCGv_i64 t3 = tcg_temp_new_i64(); 1376 1377 tcg_gen_andc_i64(t1, a, m); 1378 tcg_gen_andc_i64(t2, b, m); 1379 tcg_gen_xor_i64(t3, a, b); 1380 tcg_gen_add_i64(d, t1, t2); 1381 tcg_gen_and_i64(t3, t3, m); 1382 tcg_gen_xor_i64(d, d, t3); 1383 1384 tcg_temp_free_i64(t1); 1385 tcg_temp_free_i64(t2); 1386 tcg_temp_free_i64(t3); 1387 } 1388 1389 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1390 { 1391 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1392 gen_addv_mask(d, a, b, m); 1393 tcg_temp_free_i64(m); 1394 } 1395 1396 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1397 { 1398 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1399 gen_addv_mask(d, a, b, m); 1400 tcg_temp_free_i64(m); 1401 } 1402 1403 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1404 { 1405 TCGv_i64 t1 = tcg_temp_new_i64(); 1406 TCGv_i64 t2 = tcg_temp_new_i64(); 1407 1408 tcg_gen_andi_i64(t1, a, ~0xffffffffull); 1409 tcg_gen_add_i64(t2, a, b); 1410 tcg_gen_add_i64(t1, t1, b); 1411 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1412 1413 tcg_temp_free_i64(t1); 1414 tcg_temp_free_i64(t2); 1415 } 1416 1417 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, 1418 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1419 { 1420 static const GVecGen3 g[4] = { 1421 { .fni8 = tcg_gen_vec_add8_i64, 1422 .fniv = tcg_gen_add_vec, 1423 .fno = gen_helper_gvec_add8, 1424 .opc = INDEX_op_add_vec, 1425 .vece = MO_8 }, 1426 { .fni8 = tcg_gen_vec_add16_i64, 1427 .fniv = tcg_gen_add_vec, 1428 .fno = gen_helper_gvec_add16, 1429 .opc = INDEX_op_add_vec, 1430 .vece = MO_16 }, 1431 { .fni4 = tcg_gen_add_i32, 1432 .fniv = tcg_gen_add_vec, 1433 .fno = gen_helper_gvec_add32, 1434 .opc = INDEX_op_add_vec, 1435 .vece = MO_32 }, 1436 { .fni8 = tcg_gen_add_i64, 1437 .fniv = tcg_gen_add_vec, 1438 .fno = gen_helper_gvec_add64, 1439 .opc = INDEX_op_add_vec, 1440 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1441 .vece = MO_64 }, 1442 }; 1443 1444 tcg_debug_assert(vece <= MO_64); 1445 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1446 } 1447 1448 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, 1449 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1450 { 1451 static const GVecGen2s g[4] = { 1452 { .fni8 = tcg_gen_vec_add8_i64, 1453 .fniv = tcg_gen_add_vec, 1454 .fno = gen_helper_gvec_adds8, 1455 .opc = INDEX_op_add_vec, 1456 .vece = MO_8 }, 1457 { .fni8 = tcg_gen_vec_add16_i64, 1458 .fniv = tcg_gen_add_vec, 1459 .fno = gen_helper_gvec_adds16, 1460 .opc = INDEX_op_add_vec, 1461 .vece = MO_16 }, 1462 { .fni4 = tcg_gen_add_i32, 1463 .fniv = tcg_gen_add_vec, 1464 .fno = gen_helper_gvec_adds32, 1465 .opc = INDEX_op_add_vec, 1466 .vece = MO_32 }, 1467 { .fni8 = tcg_gen_add_i64, 1468 .fniv = tcg_gen_add_vec, 1469 .fno = gen_helper_gvec_adds64, 1470 .opc = INDEX_op_add_vec, 1471 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1472 .vece = MO_64 }, 1473 }; 1474 1475 tcg_debug_assert(vece <= MO_64); 1476 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1477 } 1478 1479 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, 1480 int64_t c, uint32_t oprsz, uint32_t maxsz) 1481 { 1482 TCGv_i64 tmp = tcg_const_i64(c); 1483 tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); 1484 tcg_temp_free_i64(tmp); 1485 } 1486 1487 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, 1488 TCGv_i64 c, 
uint32_t oprsz, uint32_t maxsz) 1489 { 1490 static const GVecGen2s g[4] = { 1491 { .fni8 = tcg_gen_vec_sub8_i64, 1492 .fniv = tcg_gen_sub_vec, 1493 .fno = gen_helper_gvec_subs8, 1494 .opc = INDEX_op_sub_vec, 1495 .vece = MO_8 }, 1496 { .fni8 = tcg_gen_vec_sub16_i64, 1497 .fniv = tcg_gen_sub_vec, 1498 .fno = gen_helper_gvec_subs16, 1499 .opc = INDEX_op_sub_vec, 1500 .vece = MO_16 }, 1501 { .fni4 = tcg_gen_sub_i32, 1502 .fniv = tcg_gen_sub_vec, 1503 .fno = gen_helper_gvec_subs32, 1504 .opc = INDEX_op_sub_vec, 1505 .vece = MO_32 }, 1506 { .fni8 = tcg_gen_sub_i64, 1507 .fniv = tcg_gen_sub_vec, 1508 .fno = gen_helper_gvec_subs64, 1509 .opc = INDEX_op_sub_vec, 1510 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1511 .vece = MO_64 }, 1512 }; 1513 1514 tcg_debug_assert(vece <= MO_64); 1515 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1516 } 1517 1518 /* Perform a vector subtraction using normal subtraction and a mask. 1519 Compare gen_addv_mask above. */ 1520 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) 1521 { 1522 TCGv_i64 t1 = tcg_temp_new_i64(); 1523 TCGv_i64 t2 = tcg_temp_new_i64(); 1524 TCGv_i64 t3 = tcg_temp_new_i64(); 1525 1526 tcg_gen_or_i64(t1, a, m); 1527 tcg_gen_andc_i64(t2, b, m); 1528 tcg_gen_eqv_i64(t3, a, b); 1529 tcg_gen_sub_i64(d, t1, t2); 1530 tcg_gen_and_i64(t3, t3, m); 1531 tcg_gen_xor_i64(d, d, t3); 1532 1533 tcg_temp_free_i64(t1); 1534 tcg_temp_free_i64(t2); 1535 tcg_temp_free_i64(t3); 1536 } 1537 1538 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1539 { 1540 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1541 gen_subv_mask(d, a, b, m); 1542 tcg_temp_free_i64(m); 1543 } 1544 1545 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1546 { 1547 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1548 gen_subv_mask(d, a, b, m); 1549 tcg_temp_free_i64(m); 1550 } 1551 1552 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1553 { 1554 TCGv_i64 t1 = tcg_temp_new_i64(); 1555 TCGv_i64 t2 = tcg_temp_new_i64(); 1556 1557 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1558 tcg_gen_sub_i64(t2, a, b); 1559 tcg_gen_sub_i64(t1, a, t1); 1560 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1561 1562 tcg_temp_free_i64(t1); 1563 tcg_temp_free_i64(t2); 1564 } 1565 1566 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, 1567 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1568 { 1569 static const GVecGen3 g[4] = { 1570 { .fni8 = tcg_gen_vec_sub8_i64, 1571 .fniv = tcg_gen_sub_vec, 1572 .fno = gen_helper_gvec_sub8, 1573 .opc = INDEX_op_sub_vec, 1574 .vece = MO_8 }, 1575 { .fni8 = tcg_gen_vec_sub16_i64, 1576 .fniv = tcg_gen_sub_vec, 1577 .fno = gen_helper_gvec_sub16, 1578 .opc = INDEX_op_sub_vec, 1579 .vece = MO_16 }, 1580 { .fni4 = tcg_gen_sub_i32, 1581 .fniv = tcg_gen_sub_vec, 1582 .fno = gen_helper_gvec_sub32, 1583 .opc = INDEX_op_sub_vec, 1584 .vece = MO_32 }, 1585 { .fni8 = tcg_gen_sub_i64, 1586 .fniv = tcg_gen_sub_vec, 1587 .fno = gen_helper_gvec_sub64, 1588 .opc = INDEX_op_sub_vec, 1589 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1590 .vece = MO_64 }, 1591 }; 1592 1593 tcg_debug_assert(vece <= MO_64); 1594 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1595 } 1596 1597 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, 1598 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1599 { 1600 static const GVecGen3 g[4] = { 1601 { .fniv = tcg_gen_mul_vec, 1602 .fno = gen_helper_gvec_mul8, 1603 .opc = INDEX_op_mul_vec, 1604 .vece = MO_8 }, 1605 { .fniv = tcg_gen_mul_vec, 1606 .fno = 
gen_helper_gvec_mul16, 1607 .opc = INDEX_op_mul_vec, 1608 .vece = MO_16 }, 1609 { .fni4 = tcg_gen_mul_i32, 1610 .fniv = tcg_gen_mul_vec, 1611 .fno = gen_helper_gvec_mul32, 1612 .opc = INDEX_op_mul_vec, 1613 .vece = MO_32 }, 1614 { .fni8 = tcg_gen_mul_i64, 1615 .fniv = tcg_gen_mul_vec, 1616 .fno = gen_helper_gvec_mul64, 1617 .opc = INDEX_op_mul_vec, 1618 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1619 .vece = MO_64 }, 1620 }; 1621 1622 tcg_debug_assert(vece <= MO_64); 1623 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1624 } 1625 1626 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, 1627 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1628 { 1629 static const GVecGen2s g[4] = { 1630 { .fniv = tcg_gen_mul_vec, 1631 .fno = gen_helper_gvec_muls8, 1632 .opc = INDEX_op_mul_vec, 1633 .vece = MO_8 }, 1634 { .fniv = tcg_gen_mul_vec, 1635 .fno = gen_helper_gvec_muls16, 1636 .opc = INDEX_op_mul_vec, 1637 .vece = MO_16 }, 1638 { .fni4 = tcg_gen_mul_i32, 1639 .fniv = tcg_gen_mul_vec, 1640 .fno = gen_helper_gvec_muls32, 1641 .opc = INDEX_op_mul_vec, 1642 .vece = MO_32 }, 1643 { .fni8 = tcg_gen_mul_i64, 1644 .fniv = tcg_gen_mul_vec, 1645 .fno = gen_helper_gvec_muls64, 1646 .opc = INDEX_op_mul_vec, 1647 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1648 .vece = MO_64 }, 1649 }; 1650 1651 tcg_debug_assert(vece <= MO_64); 1652 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); 1653 } 1654 1655 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, 1656 int64_t c, uint32_t oprsz, uint32_t maxsz) 1657 { 1658 TCGv_i64 tmp = tcg_const_i64(c); 1659 tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); 1660 tcg_temp_free_i64(tmp); 1661 } 1662 1663 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1664 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1665 { 1666 static const GVecGen3 g[4] = { 1667 { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 }, 1668 { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 }, 1669 { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 }, 1670 { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 } 1671 }; 1672 tcg_debug_assert(vece <= MO_64); 1673 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1674 } 1675 1676 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, 1677 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1678 { 1679 static const GVecGen3 g[4] = { 1680 { .fno = gen_helper_gvec_sssub8, .vece = MO_8 }, 1681 { .fno = gen_helper_gvec_sssub16, .vece = MO_16 }, 1682 { .fno = gen_helper_gvec_sssub32, .vece = MO_32 }, 1683 { .fno = gen_helper_gvec_sssub64, .vece = MO_64 } 1684 }; 1685 tcg_debug_assert(vece <= MO_64); 1686 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1687 } 1688 1689 static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1690 { 1691 TCGv_i32 max = tcg_const_i32(-1); 1692 tcg_gen_add_i32(d, a, b); 1693 tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); 1694 tcg_temp_free_i32(max); 1695 } 1696 1697 static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1698 { 1699 TCGv_i64 max = tcg_const_i64(-1); 1700 tcg_gen_add_i64(d, a, b); 1701 tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); 1702 tcg_temp_free_i64(max); 1703 } 1704 1705 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, 1706 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1707 { 1708 static const GVecGen3 g[4] = { 1709 { .fno = gen_helper_gvec_usadd8, .vece = MO_8 }, 1710 { .fno = gen_helper_gvec_usadd16, .vece = MO_16 }, 1711 { .fni4 = tcg_gen_vec_usadd32_i32, 1712 .fno = 
gen_helper_gvec_usadd32, 1713 .vece = MO_32 }, 1714 { .fni8 = tcg_gen_vec_usadd32_i64, 1715 .fno = gen_helper_gvec_usadd64, 1716 .vece = MO_64 } 1717 }; 1718 tcg_debug_assert(vece <= MO_64); 1719 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1720 } 1721 1722 static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1723 { 1724 TCGv_i32 min = tcg_const_i32(0); 1725 tcg_gen_sub_i32(d, a, b); 1726 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); 1727 tcg_temp_free_i32(min); 1728 } 1729 1730 static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1731 { 1732 TCGv_i64 min = tcg_const_i64(0); 1733 tcg_gen_sub_i64(d, a, b); 1734 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); 1735 tcg_temp_free_i64(min); 1736 } 1737 1738 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, 1739 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1740 { 1741 static const GVecGen3 g[4] = { 1742 { .fno = gen_helper_gvec_ussub8, .vece = MO_8 }, 1743 { .fno = gen_helper_gvec_ussub16, .vece = MO_16 }, 1744 { .fni4 = tcg_gen_vec_ussub32_i32, 1745 .fno = gen_helper_gvec_ussub32, 1746 .vece = MO_32 }, 1747 { .fni8 = tcg_gen_vec_ussub32_i64, 1748 .fno = gen_helper_gvec_ussub64, 1749 .vece = MO_64 } 1750 }; 1751 tcg_debug_assert(vece <= MO_64); 1752 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); 1753 } 1754 1755 /* Perform a vector negation using normal negation and a mask. 1756 Compare gen_subv_mask above. */ 1757 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) 1758 { 1759 TCGv_i64 t2 = tcg_temp_new_i64(); 1760 TCGv_i64 t3 = tcg_temp_new_i64(); 1761 1762 tcg_gen_andc_i64(t3, m, b); 1763 tcg_gen_andc_i64(t2, b, m); 1764 tcg_gen_sub_i64(d, m, t2); 1765 tcg_gen_xor_i64(d, d, t3); 1766 1767 tcg_temp_free_i64(t2); 1768 tcg_temp_free_i64(t3); 1769 } 1770 1771 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) 1772 { 1773 TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); 1774 gen_negv_mask(d, b, m); 1775 tcg_temp_free_i64(m); 1776 } 1777 1778 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) 1779 { 1780 TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); 1781 gen_negv_mask(d, b, m); 1782 tcg_temp_free_i64(m); 1783 } 1784 1785 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) 1786 { 1787 TCGv_i64 t1 = tcg_temp_new_i64(); 1788 TCGv_i64 t2 = tcg_temp_new_i64(); 1789 1790 tcg_gen_andi_i64(t1, b, ~0xffffffffull); 1791 tcg_gen_neg_i64(t2, b); 1792 tcg_gen_neg_i64(t1, t1); 1793 tcg_gen_deposit_i64(d, t1, t2, 0, 32); 1794 1795 tcg_temp_free_i64(t1); 1796 tcg_temp_free_i64(t2); 1797 } 1798 1799 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, 1800 uint32_t oprsz, uint32_t maxsz) 1801 { 1802 static const GVecGen2 g[4] = { 1803 { .fni8 = tcg_gen_vec_neg8_i64, 1804 .fniv = tcg_gen_neg_vec, 1805 .fno = gen_helper_gvec_neg8, 1806 .opc = INDEX_op_neg_vec, 1807 .vece = MO_8 }, 1808 { .fni8 = tcg_gen_vec_neg16_i64, 1809 .fniv = tcg_gen_neg_vec, 1810 .fno = gen_helper_gvec_neg16, 1811 .opc = INDEX_op_neg_vec, 1812 .vece = MO_16 }, 1813 { .fni4 = tcg_gen_neg_i32, 1814 .fniv = tcg_gen_neg_vec, 1815 .fno = gen_helper_gvec_neg32, 1816 .opc = INDEX_op_neg_vec, 1817 .vece = MO_32 }, 1818 { .fni8 = tcg_gen_neg_i64, 1819 .fniv = tcg_gen_neg_vec, 1820 .fno = gen_helper_gvec_neg64, 1821 .opc = INDEX_op_neg_vec, 1822 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1823 .vece = MO_64 }, 1824 }; 1825 1826 tcg_debug_assert(vece <= MO_64); 1827 tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); 1828 } 1829 1830 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, 
uint32_t aofs, 1831 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1832 { 1833 static const GVecGen3 g = { 1834 .fni8 = tcg_gen_and_i64, 1835 .fniv = tcg_gen_and_vec, 1836 .fno = gen_helper_gvec_and, 1837 .opc = INDEX_op_and_vec, 1838 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1839 }; 1840 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 1841 } 1842 1843 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, 1844 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1845 { 1846 static const GVecGen3 g = { 1847 .fni8 = tcg_gen_or_i64, 1848 .fniv = tcg_gen_or_vec, 1849 .fno = gen_helper_gvec_or, 1850 .opc = INDEX_op_or_vec, 1851 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1852 }; 1853 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 1854 } 1855 1856 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, 1857 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1858 { 1859 static const GVecGen3 g = { 1860 .fni8 = tcg_gen_xor_i64, 1861 .fniv = tcg_gen_xor_vec, 1862 .fno = gen_helper_gvec_xor, 1863 .opc = INDEX_op_xor_vec, 1864 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1865 }; 1866 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 1867 } 1868 1869 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, 1870 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1871 { 1872 static const GVecGen3 g = { 1873 .fni8 = tcg_gen_andc_i64, 1874 .fniv = tcg_gen_andc_vec, 1875 .fno = gen_helper_gvec_andc, 1876 .opc = INDEX_op_andc_vec, 1877 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1878 }; 1879 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 1880 } 1881 1882 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, 1883 uint32_t bofs, uint32_t oprsz, uint32_t maxsz) 1884 { 1885 static const GVecGen3 g = { 1886 .fni8 = tcg_gen_orc_i64, 1887 .fniv = tcg_gen_orc_vec, 1888 .fno = gen_helper_gvec_orc, 1889 .opc = INDEX_op_orc_vec, 1890 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1891 }; 1892 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); 1893 } 1894 1895 static const GVecGen2s gop_ands = { 1896 .fni8 = tcg_gen_and_i64, 1897 .fniv = tcg_gen_and_vec, 1898 .fno = gen_helper_gvec_ands, 1899 .opc = INDEX_op_and_vec, 1900 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1901 .vece = MO_64 1902 }; 1903 1904 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, 1905 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1906 { 1907 TCGv_i64 tmp = tcg_temp_new_i64(); 1908 gen_dup_i64(vece, tmp, c); 1909 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 1910 tcg_temp_free_i64(tmp); 1911 } 1912 1913 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, 1914 int64_t c, uint32_t oprsz, uint32_t maxsz) 1915 { 1916 TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); 1917 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); 1918 tcg_temp_free_i64(tmp); 1919 } 1920 1921 static const GVecGen2s gop_xors = { 1922 .fni8 = tcg_gen_xor_i64, 1923 .fniv = tcg_gen_xor_vec, 1924 .fno = gen_helper_gvec_xors, 1925 .opc = INDEX_op_xor_vec, 1926 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1927 .vece = MO_64 1928 }; 1929 1930 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, 1931 TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) 1932 { 1933 TCGv_i64 tmp = tcg_temp_new_i64(); 1934 gen_dup_i64(vece, tmp, c); 1935 tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); 1936 tcg_temp_free_i64(tmp); 1937 } 1938 1939 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, 1940 int64_t c, uint32_t oprsz, uint32_t maxsz) 1941 { 1942 TCGv_i64 tmp = 

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .opc = INDEX_op_or_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
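
/*
 * Illustrative sketch, not part of the build: what tcg_gen_vec_shl8i_i64
 * and tcg_gen_vec_shr8i_i64 compute, written as plain host arithmetic.
 * The whole 64-bit word is shifted once, then a replicated mask discards
 * the bits that crossed an 8-bit lane boundary.  The GVEC_EXAMPLES guard
 * and the example_* names are hypothetical.
 */
#ifdef GVEC_EXAMPLES
static uint64_t example_shl8_lanes(uint64_t a, unsigned c)
{
    /* dup_const(MO_8, 0xff << c): keep only bits that stayed in their lane. */
    uint64_t mask = 0x0101010101010101ull * ((0xffu << c) & 0xffu);
    return (a << c) & mask;
}

static uint64_t example_shr8_lanes(uint64_t a, unsigned c)
{
    /* dup_const(MO_8, 0xff >> c): drop bits pulled in from the lane above. */
    uint64_t mask = 0x0101010101010101ull * (0xffu >> c);
    return (a >> c) & mask;
}
#endif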

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
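
/*
 * Illustrative sketch, not part of the build: the sign-replication trick
 * emitted by tcg_gen_vec_sar8i_i64, written as host arithmetic.  After a
 * logical shift, each lane's shifted sign bit sits at position 7 - c;
 * multiplying the isolated bits by (2 << c) - 2 copies each one into the
 * c vacated bits above it, and the per-lane product never leaves its own
 * lane, so a single 64-bit multiply sign-extends every lane at once.  The
 * GVEC_EXAMPLES guard and the example_* name are hypothetical.
 */
#ifdef GVEC_EXAMPLES
static uint64_t example_sar8_lanes(uint64_t a, unsigned c)
{
    uint64_t s_mask = 0x0101010101010101ull * (0x80u >> c);
    uint64_t c_mask = 0x0101010101010101ull * (0xffu >> c);
    uint64_t d = a >> c;
    uint64_t s = (d & s_mask) * ((2 << c) - 2);  /* replicate isolated signs */

    return (d & c_mask) | s;
}

/* E.g. example_sar8_lanes(0x80ff7f0100000000ull, 3) == 0xf0ff0f0000000000ull:
   lanes -128, -1, 127, 1 shift to -16, -1, 15, 0.  */
#endif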

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}
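
/*
 * Illustrative sketch, not part of the build: the integer fallbacks above
 * model a vector comparison with setcond followed by neg, so that each
 * element becomes all-ones when the condition holds and all-zeros when it
 * does not, matching what tcg_gen_cmp_vec produces.  The GVEC_EXAMPLES
 * guard and the example_* name are hypothetical.
 */
#ifdef GVEC_EXAMPLES
static uint32_t example_cmp_mask32(uint32_t a, uint32_t b)
{
    /* setcond yields 0 or 1; negation turns 1 into 0xffffffff.  */
    return -(uint32_t)(a < b);
}
#endif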

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /*
     * Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
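
/*
 * Usage sketch, not part of the build: a front end implementing a packed
 * "compare equal bytes" operation on 16-byte registers would expand it as
 * below; each destination byte becomes -1 where the inputs match and 0
 * where they do not.  On the out-of-line fallback path, conditions with no
 * helper (GT, GE, GTU, GEU) are handled above by swapping the operands and
 * the condition.  The offsets and the GVEC_EXAMPLES guard are hypothetical.
 */
#ifdef GVEC_EXAMPLES
static void example_cmpeq_bytes(uint32_t dofs, uint32_t aofs, uint32_t bofs)
{
    /* dofs/aofs/bofs would be offsetof(CPUFooState, vreg[n]) in a target.  */
    tcg_gen_gvec_cmp(TCG_COND_EQ, MO_8, dofs, aofs, bofs, 16, 16);
}
#endif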