/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                            \
    void NAME(unsigned vece, uint32_t d, uint32_t m,                    \
              uint32_t opr_sz, uint32_t max_sz)                         \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0
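/*
 * Added note on the shift-right-accumulate expanders below: SSRA and
 * USRA compute d[i] += a[i] >> shift, with an arithmetic shift for
 * SSRA and a logical shift for USRA.  For example, with MO_8 and
 * shift == 2, a byte of 0xf0 (-16) contributes 0xfc (-4) for SSRA but
 * 0x3c (60) for USRA.
 */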
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}
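/*
 * Added note on the shift == esize case handled above: for SSRA the
 * shift is clamped to esize - 1, since an arithmetic shift by
 * esize - 1 already yields all sign bits, which is the architectural
 * result; for USRA the addend is zero, so the operation reduces to a
 * move of the destination onto itself whose only effect is to clear
 * the part of the vector register beyond the operation size (the
 * "tail").
 */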
/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_temp_new_vec_matching(d);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_dupi_vec(vece, ones, 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}
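/*
 * Added worked example for the rounding helpers above: SRSHR computes
 * (x + (1 << (sh - 1))) >> sh.  With sh == 3 and x == 0x17 (23), the
 * rounding bit (x >> 2) & 1 is 1 and the arithmetic shift gives 2, so
 * the result is 3, matching (23 + 4) >> 3.
 */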
void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.  With rounding, this produces
     * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero.  With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_temp_new_vec_matching(d);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_dupi_vec(vece, ones, 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero.  With rounding, this produces a
         * copy of the most significant bit.
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}
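/*
 * Added note on the shift-and-insert expansions below: SRI keeps the
 * top 'shift' bits of each destination element and replaces the rest
 * with the shifted-down source, e.g. for MO_8 with shift == 3 each
 * byte becomes (d & 0xe0) | ((a >> 3) & 0x1f).  SLI further below is
 * the mirror image, keeping the low 'shift' bits and inserting the
 * shifted-up source; for 32- and 64-bit elements a single deposit
 * operation performs the mask-and-merge.
 */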
static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_temp_new_vec_matching(d);

    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_temp_new_vec_matching(d);

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST: test is "if ((X & Y) != 0)". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec msk, max;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        msk = tcg_temp_new_vec_matching(dst);
        tcg_gen_dupi_vec(vece, msk, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    max = tcg_temp_new_vec_matching(dst);
    tcg_gen_dupi_vec(vece, max, 8 << vece);

    /*
     * The choice of LT (signed) and GEU (unsigned) is biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    if (vece == MO_8) {
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
        tcg_gen_andc_vec(vece, lval, lval, lsh);
        tcg_gen_andc_vec(vece, rval, rval, rsh);
    } else {
        tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
        tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
        tcg_gen_and_vec(vece, lval, lval, lsh);
        tcg_gen_and_vec(vece, rval, rval, rsh);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, tmp);
        tcg_gen_and_vec(vece, rsh, rsh, tmp);
    }

    /* Bound rsh so out of bound right shift gets -1. */
    tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, tmp);
    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift. */
    tcg_gen_andc_vec(vece, lval, lval, tmp);

    /* Select between left and right shift. */
    if (vece == MO_8) {
        tcg_gen_dupi_vec(vece, tmp, 0);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
    } else {
        tcg_gen_dupi_vec(vece, tmp, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
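/*
 * Added commentary: the USHL/SSHL expansions above implement the
 * element-wise variable shift, where the count is the signed low byte
 * of each element of the shift operand; a negative count shifts right,
 * and out-of-range counts give 0 (USHL, and SSHL left shifts) or a
 * sign fill (SSHL right shifts).  The saturating shifts below, by
 * contrast, are always expanded out of line via tcg_gen_gvec_3_ptr
 * with tcg_env, since the helpers must update the saturation flag
 * (vfp.qc) in CPUARMState.
 */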
void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}
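/*
 * Added commentary on the saturating add/sub expanders below: each
 * helper computes the raw result alongside the saturated result and
 * ORs the XOR of the two into qc.  Any difference between them means
 * saturation occurred, so qc becomes (and stays) nonzero.  For
 * example, in gen_uqadd_bhs with esz == MO_8, 250 + 10 gives
 * tmp == 260 and res == umin(260, 255) == 255; 260 ^ 255 is nonzero,
 * which sets QC.
 */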
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}
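/*
 * Added note: the 64-bit *_d helpers in this group cannot use the
 * widened min/max trick of the *_bhs helpers, because no wider type is
 * available; instead they detect carry (result < a), borrow (a < b) or
 * signed overflow explicitly and pick the saturated value with a
 * movcond.
 */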
static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_sub_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_and_i64(t1, t1, t2);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}
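/*
 * Added note on the 64-bit overflow tests above: gen_sqadd_d flags
 * overflow via (t0 ^ a) & ~(a ^ b), i.e. the operands agree in sign
 * but the result does not, while gen_sqsub_d uses (t0 ^ a) & (a ^ b),
 * i.e. the operands differ in sign and the result's sign differs from
 * a's.  In both cases the saturated value (a >> 63) ^ INT64_MAX yields
 * INT64_MAX for non-negative a and INT64_MIN for negative a.
 */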
static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
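/*
 * Added note on the absolute-difference expanders below: SABD/UABD
 * compute |a - b|.  The vector form uses max(a, b) - min(a, b); the
 * scalar form computes both a - b and b - a and selects the
 * non-negative one with a movcond, avoiding a separate abs step.
 */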
static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
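/*
 * Added note: the pairwise operations below (ADDP, SMAXP/SMINP,
 * UMAXP/UMINP) combine adjacent elements drawn from the concatenation
 * of the two inputs, which does not map onto the element-wise gvec
 * expansion used elsewhere in this file, so they are always
 * implemented with out-of-line helpers.
 */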
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
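/*
 * Added note on the halving-add expanders below: they rely on the
 * identity
 *   (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1)
 * which avoids needing a carry bit wider than the element; the shift
 * is arithmetic for SHADD and logical for UHADD, and the (a & b & 1)
 * term restores the carry lost when both low bits are set.
 */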
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
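/*
 * Halving subtract: compute (a - b) >> 1 per element.  The correction
 * term comes from the low bits of the operands:
 *     (a - b) >> 1 == (a >> 1) - (b >> 1) - (~a & b & 1)
 * As with the halving adds, SHSUB uses arithmetic shifts and UHSUB
 * logical shifts; the rest of the expansion is the same.
 */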
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
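/*
 * Rounding halving add: compute (a + b + 1) >> 1 per element.  The
 * rounding bit folds into the same shape as the plain halving add:
 *     (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1)
 * SRHADD uses arithmetic shifts for the signed elements.
 */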
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
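/*
 * URHADD: the same rounding halving add expansion as SRHADD above,
 * but with logical shifts for the unsigned element values.
 */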
static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}