/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once. */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands. */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands. */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands. */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components. */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
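/*
 * Worked example (illustrative only): simd_desc(16, 16, 0) stores
 * (16 / 8) - 1 = 1 in both the OPRSZ and MAXSZ fields and 0 in the DATA
 * field.  Helpers recover the original byte counts from the descriptor;
 * the exact bit layout is defined by tcg-gvec-desc.h.
 */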
/* Generate a call to a gvec-style helper with two vector operands. */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand. */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands. */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands. */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands. */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}
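/*
 * Usage sketch from a target translator (gen_helper_foo is a
 * hypothetical out-of-line helper, not part of this file):
 *
 *     tcg_gen_gvec_3_ool(dofs, aofs, bofs, 16, 16, 0, gen_helper_foo);
 *
 * This emits three env-relative pointer computations plus a single call
 * whose descriptor encodes oprsz = maxsz = 16.
 */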
/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand. */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    if (oprsz % lnsz == 0) {
        uint32_t lnct = oprsz / lnsz;
        return lnct >= 1 && lnct <= MAX_UNROLL;
    }
    return false;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE. */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
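/*
 * For example, dup_const(MO_8, 0x05) == 0x0505050505050505ull and
 * dup_const(MO_16, 0xdead) == 0xdeaddeaddeaddeadull.
 */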
/* Duplicate IN into OUT as per VECE. */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  If OP is 0, assume that the real operation to be performed is
 * required by all backends.  Otherwise, make sure that OP can be performed
 * on elements of size VECE in the selected type.  Do not select V64 if
 * PREFER_I64 is true.  Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
                                  bool prefer_i64)
{
    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
        if (op == 0) {
            return TCG_TYPE_V256;
        }
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         * It is hard to imagine a case in which v256 is supported
         * but v128 is not, but check anyway.
         */
        if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
            && (size % 32 == 0
                || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
            return TCG_TYPE_V256;
        }
    }
    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
        return TCG_TYPE_V128;
    }
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
        return TCG_TYPE_V64;
    }
    return 0;
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(0, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
            /* fallthru */
        case TCG_TYPE_V128:
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
            break;
        case TCG_TYPE_V64:
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
            break;
        default:
            g_assert_not_reached();
        }

        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }
    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}
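/*
 * The expand_*_i32 helpers above (and the i64 and vec variants below)
 * simply loop over the operands in element-sized steps: load from env,
 * apply FNI, store the result back.  LOAD_DEST is for generators that
 * want the previous destination value available in their output operand
 * (e.g. accumulating operations); SCALAR_FIRST swaps the operand order
 * so that the scalar becomes the first source.
 */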
/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}
/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of two-vector operands and an immediate operand
   using host vectors. */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}
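/*
 * The *_vec expanders mirror the integer versions above, but step through
 * the operands TYSZ bytes at a time in a host vector register of the
 * given TCGType.
 */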
/* Expand OPSZ bytes worth of three-operand operations using host vectors. */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation. */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2_i64(dofs, aofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2_i32(dofs, aofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
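/*
 * Note the fallback order used here and by the expanders below: host
 * vector types (V256, then V128, then V64) when the backend can emit the
 * operation, then inline i64 or i32 code, and finally the out-of-line
 * helper.
 */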
/* Expand a vector operation with two vectors and an immediate. */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
        } else {
            if (g->fno) {
                tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
            } else {
                TCGv_i64 tcg_c = tcg_const_i64(c);
                tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                    maxsz, c, g->fnoi);
                tcg_temp_free_i64(tcg_c);
            }
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar. */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);
        uint32_t some;

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        switch (type) {
        case TCG_TYPE_V256:
            /* Recall that ARM SVE allows vector sizes that are not a
             * power of 2, but always a multiple of 16.  The intent is
             * that e.g. size == 80 would be expanded with 2x32 + 1x16.
             */
            some = QEMU_ALIGN_DOWN(oprsz, 32);
            expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                          t_vec, g->scalar_first, g->fniv);
            if (some == oprsz) {
                break;
            }
            dofs += some;
            aofs += some;
            oprsz -= some;
            maxsz -= some;
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation. */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
                               maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
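/*
 * The opc field of a GVecGen* descriptor names the vector opcode that
 * choose_vector_type must find supported (via tcg_can_emit_vec_op) before
 * the fniv generator is used; tcg_gen_gvec_add below is a typical example
 * of such a table.
 */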
/* Expand a vector four-operand operation. */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    type = 0;
    if (g->fniv) {
        type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
    }
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
        break;
    case TCG_TYPE_V64:
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
        break;

    case 0:
        if (g->fni8 && check_size_impl(oprsz, 8)) {
            expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
        } else if (g->fni4 && check_size_impl(oprsz, 4)) {
            expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
        } else {
            assert(g->fno != NULL);
            tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                               oprsz, maxsz, g->data, g->fno);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
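/*
 * Usage sketch: replicating a (hypothetical) target TCGv_i64 cpu_reg
 * across a 16-byte vector slot at byte offset dofs within CPUArchState:
 *
 *     tcg_gen_gvec_dup_i64(MO_64, dofs, 16, 16, cpu_reg);
 *
 * The tcg_gen_gvec_dupNi helpers below do the same for constants; note
 * that do_dup widens a constant of zero so that the full maxsz bytes are
 * cleared.
 */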
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation. */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}
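/*
 * Illustration (not part of the generated code): with M holding only the
 * per-lane sign bits, the sequence above computes
 *
 *     d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)
 *
 * Clearing the sign bits first guarantees that no carry can propagate
 * across a lane boundary, and the final xor reconstructs the correct
 * sign bit of each lane from a ^ b ^ carry-in.
 */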
void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
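/*
 * Usage sketch (offsets are illustrative): a 4-lane, 32-bit addition of
 * two 16-byte env-allocated vector registers could be expanded as
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 *
 * where dofs/aofs/bofs are byte offsets of the registers within
 * CPUArchState.
 */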
void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above. */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
        { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
        { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
        { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
        { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_const_i32(-1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i32(max);
}

static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_const_i64(-1);
    tcg_gen_add_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i64(max);
}

void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_usadd32_i32,
          .fno = gen_helper_gvec_usadd32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_usadd32_i64,
          .fno = gen_helper_gvec_usadd64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_const_i32(0);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i32(min);
}

static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_const_i64(0);
    tcg_gen_sub_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i64(min);
}

void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_ussub32_i32,
          .fno = gen_helper_gvec_ussub32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_ussub32_i64,
          .fno = gen_helper_gvec_ussub64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
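/*
 * Note: the usadd/ussub expanders above saturate with a movcond.  For
 * example, tcg_gen_vec_usadd32_i32 computes d = a + b and then selects
 * the all-ones maximum whenever d < a, i.e. whenever the unsigned
 * addition wrapped around; ussub likewise selects zero whenever a < b.
 */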
/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above. */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opc = INDEX_op_neg_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opc = INDEX_op_neg_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opc = INDEX_op_neg_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opc = INDEX_op_neg_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .opc = INDEX_op_and_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .opc = INDEX_op_xor_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .opc = INDEX_op_and_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .opc = INDEX_op_xor_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .opc = INDEX_op_or_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}
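
/* Shifts of sub-word elements are performed on the full 64-bit lane
 * and then masked: for a left shift by C, bits shifted out of one
 * element would land in its neighbour, so the result is ANDed with
 * dup_const(vece, MASK << C) to discard them.  E.g. for MO_8 and
 * C == 1 the mask is 0xfefefefefefefefe.
 */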

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
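
/* Arithmetic right shifts of sub-word elements start from a logical
 * shift and then rebuild the sign extension: the shifted-down sign bit
 * is isolated and multiplied by (2 << C) - 2, which smears it across
 * the C bit positions above it.  E.g. for MO_8 and C == 3, a set sign
 * bit becomes 0x10 after the shift; 0x10 * 14 == 0xe0 supplies bits
 * 7:5, and OR-ing that into the masked shift result completes the
 * extension without any carry into the neighbouring element.
 */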

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements. */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}
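
/* The integer expansions above negate the 0/1 result of setcond so
 * that a true comparison yields all-ones per element, matching what
 * cmp_vec produces directly.  Out-of-line helpers exist only for
 * EQ/NE/LT/LE/LTU/LEU; the remaining conditions are reached by
 * swapping the operands and the condition, e.g. GT on (A, B) becomes
 * LT on (B, A).
 */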

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };
    TCGType type;
    uint32_t some;

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and 64-bit comparison.
     */
    type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
                              TCG_TARGET_REG_BITS == 64 && vece == MO_64);
    switch (type) {
    case TCG_TYPE_V256:
        /* Recall that ARM SVE allows vector sizes that are not a
         * power of 2, but always a multiple of 16.  The intent is
         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
         */
        some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            break;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
        /* fallthru */
    case TCG_TYPE_V128:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
        break;
    case TCG_TYPE_V64:
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
        break;

    case 0:
        if (vece == MO_64 && check_size_impl(oprsz, 8)) {
            expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
        } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
            expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
        } else {
            gen_helper_gvec_3 * const *fn = fns[cond];

            if (fn == NULL) {
                uint32_t tmp;
                tmp = aofs, aofs = bofs, bofs = tmp;
                cond = tcg_swap_cond(cond);
                fn = fns[cond];
                assert(fn != NULL);
            }
            tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
            return;
        }
        break;

    default:
        g_assert_not_reached();
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
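
/* Illustrative use from a target front end (the offsets are
 * hypothetical env offsets of the destination and source registers):
 * expand a compare-equal of 32-bit lanes over 16 bytes of data, with
 * the tail up to a 32-byte maximum vector size cleared afterwards:
 *
 *     tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32, dofs, aofs, bofs, 16, 32);
 */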