/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                            \
    void NAME(unsigned vece, uint32_t d, uint32_t m,                    \
              uint32_t opr_sz, uint32_t max_sz)                         \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.  With rounding, this produces
     * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero.  With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero.  With rounding, this produces a
         * copy of the most significant bit.
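         * E.g. for a 16-bit element, URSHR #16 computes (x + 0x8000) >> 16,
         * which is just bit 15 of x.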
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

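/*
 * For 32-bit and 64-bit elements the shift-and-insert can be done with a
 * single deposit: the low (esize - shift) bits of the destination are
 * replaced by the shifted-down source, and the high bits are preserved.
 */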
static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);

    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
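    /* An immediate of 0 inserts the entire element: a plain move of RM. */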
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST : test is "if (X & Y != 0)". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    /*
     * The choice of LT (signed) and GEU (unsigned) are biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
    if (vece == MO_8) {
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
        tcg_gen_andc_vec(vece, lval, lval, lsh);
        tcg_gen_andc_vec(vece, rval, rval, rsh);
    } else {
        tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
        tcg_gen_and_vec(vece, lval, lval, lsh);
        tcg_gen_and_vec(vece, rval, rval, rsh);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
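     * The shift count is the signed low byte of SHIFT: positive counts
     * shift left, negative counts shift (arithmetically) right.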
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /* Bound rsh so out of bound right shift gets -1. */
    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, max);
    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, max);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift. */
    tcg_gen_andc_vec(vece, lval, lval, tmp);

    /* Select between left and right shift. */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
    } else {
        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b,
                        tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_sub_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_and_i64(t1, t1, t2);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

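/*
 * Halving add: (A + B) >> 1 is computed without a wider intermediate as
 * (A >> 1) + (B >> 1) + (A & B & 1).  NEON/AArch64 provide [US]HADD only
 * for 8, 16 and 32-bit elements, hence no MO_64 entry below.
 */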
void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Unsigned halving add: as above, but with logical shifts, so
 * (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1) per element.
 */
static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Signed halving subtract.  Per element, (a - b) >> 1 is computed
 * without intermediate overflow as (a >> 1) - (b >> 1) - (~a & b & 1).
 */
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Unsigned halving subtract: as above, but with logical shifts, so
 * (a - b) >> 1 == (a >> 1) - (b >> 1) - (~a & b & 1) per element.
 */
static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Signed rounding halving add.  Per element, (a + b + 1) >> 1 is
 * computed without intermediate overflow as
 * (a >> 1) + (b >> 1) + ((a | b) & 1).
 */
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Unsigned rounding halving add: as above, but with logical shifts, so
 * (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1) per element.
 */
static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}