/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL 4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
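/*
 * Illustration: a helper invoked through tcg_gen_gvec_2_ool receives the
 * 32-bit descriptor built by simd_desc() and would typically unpack it with
 * the accessors from tcg-gvec-desc.h.  The helper name below is only a
 * placeholder; the point is the decode pattern.
 *
 *     void HELPER(gvec_example)(void *d, void *a, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);  -- bytes to operate on
 *         intptr_t maxsz = simd_maxsz(desc);  -- bytes to clear up to
 *         int32_t data = simd_data(desc);     -- operation-specific value
 *         ...
 *     }
 */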
/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}
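/*
 * The PTR argument of the *_ptr variants is passed through to the helper
 * unchanged.  Target code typically points it at cpu_env or at a block of
 * per-operation state (for example floating-point status), so one helper
 * can be reused with different state without encoding it in the descriptor.
 */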
/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t lnct = oprsz / lnsz;
    return lnct >= 1 && lnct <= MAX_UNROLL;
}

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
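/*
 * For example, dup_const and the gen_dup_* expanders replicate one element
 * across a 64-bit word:
 *
 *     dup_const(MO_8,  0xab)   == 0xababababababababull
 *     dup_const(MO_16, 0x1234) == 0x1234123412341234ull
 *     dup_const(MO_32, c)      == ((uint64_t)c << 32) | c
 *
 * The multiplications by the 0x01...01 constants in gen_dup_i32 and
 * gen_dup_i64 perform the same replication for a run-time value.
 */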
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    type = 0;
    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
        type = TCG_TYPE_V256;
    } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
        type = TCG_TYPE_V128;
    } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
               /* Prefer integer when 64-bit host and no variable dup.  */
               && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                    && (in_64 == NULL || vece == MO_64))) {
        type = TCG_TYPE_V64;
    }

    /* Implement inline with a vector type, if possible.  */
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        if (TCG_TARGET_HAS_v256) {
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
        }
        if (TCG_TARGET_HAS_v128) {
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
        }
        if (TCG_TARGET_HAS_v64) {
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
        }
        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of two-vector operands and an immediate operand
   using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
    /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
       operation, zeroing the balance of the register.  We can then
       use a max-sized store to implement the clearing without an extra
       store operation.
       This is true for aarch64 and x86_64 hosts.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2_i64(dofs, aofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2_i32(dofs, aofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
    } else {
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                           oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
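/*
 * Usage sketch: a front end that wants to splat a 32-bit value held in a
 * TCGv_i32 "val" across a 16-byte vector register located at env offset
 * "dofs" would emit
 *
 *     tcg_gen_gvec_dup_i32(MO_32, dofs, 16, 16, val);
 *
 * The offset and sizes are illustrative only; oprsz and maxsz must satisfy
 * the rules enforced by check_size_align above.
 */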
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
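/*
 * Worked example for gen_addv_mask above: with M holding the sign bit of
 * every lane, the expansion computes
 *
 *     d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m)
 *
 * Clearing each lane's top bit before the full-width add guarantees that no
 * carry can cross a lane boundary; the final XOR folds the original top
 * bits back in.  For one 8-bit lane with a = 0xff and b = 0x01:
 * (0x7f + 0x01) = 0x80 and (0xff ^ 0x01) & 0x80 = 0x80, so d = 0x00 and no
 * carry leaks into the neighbouring lane, exactly as a per-byte add requires.
 */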
/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opc = INDEX_op_neg_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opc = INDEX_op_neg_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opc = INDEX_op_neg_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opc = INDEX_op_neg_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz,
                        shift, &g[vece]);
    }
}

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);       /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);       /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);              /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}
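/*
 * Worked example for the fni8 expansions of tcg_gen_gvec_sari above: after
 * the logical shift, each lane's sign bit sits c positions below the top of
 * the lane, and multiplying the isolated bit by (2 << c) - 2, i.e. by
 * 2 + 4 + ... + 2**c, copies it into the c positions above it, which is
 * exactly the missing sign extension.  For one byte lane with a = 0x90
 * and c = 3:
 *
 *     d = 0x90 >> 3                  = 0x12
 *     s = d & (0x80 >> 3)            = 0x10
 *     s * ((2 << 3) - 2)             = 0x10 * 14 = 0xe0
 *     (d & (0xff >> 3)) | 0xe0       = 0xf2, i.e. 0x90 shifted arithmetically
 */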