/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
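
/*
 * For illustration only (kept out of the build): the out-of-line side of the
 * contract.  A gvec helper receives the 32-bit descriptor built by
 * simd_desc() above and unpacks it with the accessors from tcg-gvec-desc.h.
 * The helper name and the operation are made up; simd_oprsz(), simd_maxsz()
 * and simd_data() are the real accessors.
 */
#if 0
void HELPER(gvec_example_addi32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);   /* bytes covered by the operation */
    intptr_t maxsz = simd_maxsz(desc);   /* full width of the register */
    int32_t bias = simd_data(desc);      /* the DATA field from the expander */
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)((char *)d + i) = *(uint32_t *)((char *)a + i) + bias;
    }
    /* Clear the maxsz - oprsz tail of the register.  */
    for (; i < maxsz; i += sizeof(uint32_t)) {
        *(uint32_t *)((char *)d + i) = 0;
    }
}
#endif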

/* Generate a call to a gvec-style helper with two vector operands
   and one scalar operand.  */
void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                         uint32_t oprsz, uint32_t maxsz, int32_t data,
                         gen_helper_gvec_2i *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, c, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t lnct = oprsz / lnsz;
    return lnct >= 1 && lnct <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
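
/*
 * For illustration only (kept out of the build): what dup_const() yields for
 * each element size; the values follow directly from the multiplications
 * above.
 */
#if 0
static void example_dup_const(void)
{
    assert(dup_const(MO_8,  0xab)   == 0xabababababababababull);
    assert(dup_const(MO_16, 0x1234) == 0x1234123412341234ull);
    assert(dup_const(MO_32, 0xdeadbeef) == 0xdeadbeefdeadbeefull);
    assert(dup_const(MO_64, 0x0123456789abcdefull) == 0x0123456789abcdefull);
}
#endif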

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    type = 0;
    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
        type = TCG_TYPE_V256;
    } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
        type = TCG_TYPE_V128;
    } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
               /* Prefer integer when 64-bit host and no variable dup.  */
               && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                    && (in_64 == NULL || vece == MO_64))) {
        type = TCG_TYPE_V64;
    }

    /* Implement inline with a vector type, if possible.  */
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        if (TCG_TARGET_HAS_v256) {
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
        }
        if (TCG_TARGET_HAS_v128) {
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
        }
        if (TCG_TARGET_HAS_v64) {
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
        }
        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}
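
/*
 * For illustration only (kept out of the build): roughly what one of the
 * out-of-line dup helpers invoked above (e.g. gen_helper_gvec_dup32) does at
 * run time.  The loop bounds come from the descriptor; on this path the
 * helper takes care of zeroing the oprsz..maxsz tail, which is why do_dup()
 * returns without calling expand_clr().
 */
#if 0
void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)((char *)d + i) = c;
    }
    for (; i < maxsz; i += sizeof(uint32_t)) {
        *(uint32_t *)((char *)d + i) = 0;
    }
}
#endif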

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i32 c, bool scalar_first,
                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          TCGv_i64 c, bool scalar_first,
                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(t1, c, t0);
        } else {
            fni(t1, t0, c);
        }
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          TCGv_vec c, bool scalar_first,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (scalar_first) {
            fni(vece, t1, c, t0);
        } else {
            fni(vece, t1, t0, c);
        }
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
    /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
       operation, zeroing the balance of the register.  We can then
       use a max-sized store to implement the clearing without an extra
       store operation.  This is true for aarch64 and x86_64 hosts.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2_i64(dofs, aofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2_i32(dofs, aofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and an immediate.  */
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
    } else {
        if (g->fno) {
            tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
        } else {
            TCGv_i64 tcg_c = tcg_const_i64(c);
            tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi);
            tcg_temp_free_i64(tcg_c);
        }
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector operation with two vectors and a scalar.  */
void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
{
    TCGType type;

    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    type = 0;
    if (g->fniv) {
        if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
            type = TCG_TYPE_V256;
        } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
            type = TCG_TYPE_V128;
        } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
                   && check_size_impl(oprsz, 8)) {
            type = TCG_TYPE_V64;
        }
    }
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        tcg_gen_dup_i64_vec(g->vece, t_vec, c);

        /* Recall that ARM SVE allows vector sizes that are not a power of 2.
           Expand with successively smaller host vector sizes.  The intent is
           that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
        switch (type) {
        case TCG_TYPE_V256:
            {
                uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
                expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                              t_vec, g->scalar_first, g->fniv);
                if (some == oprsz) {
                    break;
                }
                dofs += some;
                aofs += some;
                oprsz -= some;
                maxsz -= some;
            }
            /* fallthru */

        case TCG_TYPE_V128:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                          t_vec, g->scalar_first, g->fniv);
            break;

        case TCG_TYPE_V64:
            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                          t_vec, g->scalar_first, g->fniv);
            break;

        default:
            g_assert_not_reached();
        }
        tcg_temp_free_vec(t_vec);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        TCGv_i64 t64 = tcg_temp_new_i64();

        gen_dup_i64(g->vece, t64, c);
        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
        tcg_temp_free_i64(t64);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        TCGv_i32 t32 = tcg_temp_new_i32();

        tcg_gen_extrl_i64_i32(t32, c);
        gen_dup_i32(g->vece, t32, t32);
        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
        tcg_temp_free_i32(t32);
    } else {
        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
        return;
    }

    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                           oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}
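
/*
 * For illustration only (kept out of the build): how a target front end
 * typically drives the expanders above from its translator.  The CPU state
 * type "CPUExampleState" and its "vreg" array are hypothetical; the gvec
 * calls themselves are the real API.
 */
#if 0
static void gen_example_vector_mov(int rd, int rn)
{
    /* Copy a 16-byte vector register and clear the rest of a
       32-byte-wide register (oprsz < maxsz).  */
    tcg_gen_gvec_mov(MO_8,
                     offsetof(CPUExampleState, vreg[rd]),
                     offsetof(CPUExampleState, vreg[rn]),
                     16, 32);
}

static void gen_example_vector_dup(int rd, TCGv_i32 gpr)
{
    /* Broadcast a general register into every 32-bit element.  */
    tcg_gen_gvec_dup_i32(MO_32, offsetof(CPUExampleState, vreg[rd]),
                         16, 16, gpr);
}
#endif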

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_adds64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}
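
/*
 * For illustration only (kept out of the build): a host-side model of the
 * masked-add trick used by gen_addv_mask() above, checked against a
 * byte-by-byte reference.  Clearing each lane's sign bit before the wide
 * addition keeps carries from crossing lane boundaries; the final xor puts
 * the correct sign bit of each lane back.
 */
#if 0
static uint64_t addv_mask_model(uint64_t a, uint64_t b, uint64_t m)
{
    uint64_t t1 = a & ~m;          /* a with the per-lane sign bits cleared */
    uint64_t t2 = b & ~m;          /* likewise for b */
    uint64_t t3 = (a ^ b) & m;     /* sign bits to be restored by xor */
    return (t1 + t2) ^ t3;         /* the wide add cannot carry across lanes */
}

static uint64_t add8_reference(uint64_t a, uint64_t b)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 64; i += 8) {
        uint8_t lane = (uint8_t)(a >> i) + (uint8_t)(b >> i);
        r |= (uint64_t)lane << i;
    }
    return r;
}

static void example_addv_mask(void)
{
    uint64_t a = 0x7f80ff01a55a1234ull;
    uint64_t b = 0x0101ff80015a43ffull;
    uint64_t m = dup_const(MO_8, 0x80);   /* the mask used by add8 above */

    assert(addv_mask_model(a, b, m) == add8_reference(a, b));
}
#endif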

void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_subs64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_mul64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2s g[4] = {
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls8,
          .opc = INDEX_op_mul_vec,
          .vece = MO_8 },
        { .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls16,
          .opc = INDEX_op_mul_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_mul_i32,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls32,
          .opc = INDEX_op_mul_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_mul_i64,
          .fniv = tcg_gen_mul_vec,
          .fno = gen_helper_gvec_muls64,
          .opc = INDEX_op_mul_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
}

void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(c);
    tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
        { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
        { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
        { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
        { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 max = tcg_const_i32(-1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i32(max);
}

static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 max = tcg_const_i64(-1);
    tcg_gen_add_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
    tcg_temp_free_i64(max);
}

void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
        { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_usadd32_i32,
          .fno = gen_helper_gvec_usadd32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_usadd32_i64,
          .fno = gen_helper_gvec_usadd64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 min = tcg_const_i32(0);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i32(min);
}

static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 min = tcg_const_i64(0);
    tcg_gen_sub_i64(d, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
    tcg_temp_free_i64(min);
}

void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
        { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
        { .fni4 = tcg_gen_vec_ussub32_i32,
          .fno = gen_helper_gvec_ussub32,
          .vece = MO_32 },
        { .fni8 = tcg_gen_vec_ussub32_i64,
          .fno = gen_helper_gvec_ussub64,
          .vece = MO_64 }
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opc = INDEX_op_neg_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opc = INDEX_op_neg_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opc = INDEX_op_neg_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opc = INDEX_op_neg_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}
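
/*
 * For illustration only (kept out of the build): a plain-C model of the
 * clamping done by the movcond sequences in tcg_gen_vec_usadd32_i32 and
 * tcg_gen_vec_ussub32_i32 above.  On unsigned overflow the sum saturates to
 * all-ones; on unsigned underflow the difference saturates to zero.
 */
#if 0
static uint32_t usadd32_model(uint32_t a, uint32_t b)
{
    uint32_t d = a + b;
    return d < a ? UINT32_MAX : d;    /* d < a  <=>  the addition wrapped */
}

static uint32_t ussub32_model(uint32_t a, uint32_t b)
{
    uint32_t d = a - b;
    return a < b ? 0 : d;             /* a < b  <=>  the subtraction wrapped */
}
#endif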

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .opc = INDEX_op_and_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .opc = INDEX_op_xor_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .opc = INDEX_op_or_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
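
/*
 * For illustration only (kept out of the build): a host-side model of the
 * packed shift used by tcg_gen_vec_shl8i_i64 above.  The whole 64-bit word
 * is shifted once, then the mask dup_const(MO_8, 0xff << c) discards the
 * bits that leaked across byte boundaries, leaving each byte shifted
 * independently.
 */
#if 0
static uint64_t shl8i_model(uint64_t a, unsigned c)    /* 0 <= c < 8 */
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    return (a << c) & mask;
}

static void example_shl8i(void)
{
    /* Each byte of the input shifted left by one, independently.  */
    assert(shl8i_model(0x80ff018023456789ull, 1) == 0x00fe0200468ace12ull);
}
#endif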

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

static const GVecGen2s gop_ands = {
    .fni8 = tcg_gen_and_i64,
    .fniv = tcg_gen_and_vec,
    .fno = gen_helper_gvec_ands,
    .opc = INDEX_op_and_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_xors = {
    .fni8 = tcg_gen_xor_i64,
    .fniv = tcg_gen_xor_vec,
    .fno = gen_helper_gvec_xors,
    .opc = INDEX_op_xor_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
    tcg_temp_free_i64(tmp);
}

static const GVecGen2s gop_ors = {
    .fni8 = tcg_gen_or_i64,
    .fniv = tcg_gen_or_vec,
    .fno = gen_helper_gvec_ors,
    .opc = INDEX_op_or_vec,
    .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    .vece = MO_64
};

void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();
    gen_dup_i64(vece, tmp, c);
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}

void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                      int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
    tcg_temp_free_i64(tmp);
}
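
/*
 * The shift-by-immediate expansions below operate on whole 64-bit chunks
 * and then mask away any bits that were shifted across an element
 * boundary.  For example, tcg_gen_vec_shl8i_i64 with c == 1 shifts the
 * full i64 left by one and ands the result with dup_const(MO_8, 0xfe),
 * discarding the bit each byte received from its less significant
 * neighbour.
 */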

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
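
/*
 * The comparison expansions below produce, for each element, all ones
 * when the condition holds and all zeros when it does not: the integer
 * fallbacks use setcond followed by neg, matching the semantics of the
 * cmp_vec opcode and of the out-of-line helpers.
 */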

/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
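
    /*
     * The cascade below tries host vector types from largest to smallest
     * (V256, then V128, then V64), falls back to integer code when the
     * element size allows it (i64 for MO_64, i32 for MO_32), and otherwise
     * calls an out-of-line helper, swapping the operands and the condition
     * first if only the swapped form has a helper table entry.
     */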

    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
    } else if (TCG_TARGET_HAS_v64
               && check_size_impl(oprsz, 8)
               && (TCG_TARGET_REG_BITS == 32 || vece != MO_64)
               && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
    } else {
        gen_helper_gvec_3 * const *fn = fns[cond];

        if (fn == NULL) {
            uint32_t tmp;
            tmp = aofs, aofs = bofs, bofs = tmp;
            cond = tcg_swap_cond(cond);
            fn = fns[cond];
            assert(fn != NULL);
        }
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
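
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * target front end with a hypothetical vector register file laid out as
 * byte arrays in CPUFooState might expand a 16-byte integer add and an
 * equality comparison like so:
 *
 *     uint32_t dofs = offsetof(CPUFooState, vreg[rd]);
 *     uint32_t aofs = offsetof(CPUFooState, vreg[rn]);
 *     uint32_t bofs = offsetof(CPUFooState, vreg[rm]);
 *
 *     tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);
 *     tcg_gen_gvec_cmp(TCG_COND_EQ, MO_32, dofs, aofs, bofs, 16, 16);
 *
 * CPUFooState, vreg and the register indices are assumptions made for
 * the example; only the expander interfaces themselves are real.
 */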