/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                \
    void NAME(unsigned vece, uint32_t d, uint32_t m,        \
              uint32_t opr_sz, uint32_t max_sz)             \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Signed shift out of range results in all-sign-bits */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs,
shift, opr_sz, max_sz); 97 } 98 99 void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 100 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 101 { 102 /* Unsigned shift out of range results in all-zero-bits */ 103 if (shift >= (8 << vece)) { 104 tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); 105 } else { 106 tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz); 107 } 108 } 109 110 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 111 { 112 tcg_gen_vec_sar8i_i64(a, a, shift); 113 tcg_gen_vec_add8_i64(d, d, a); 114 } 115 116 static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 117 { 118 tcg_gen_vec_sar16i_i64(a, a, shift); 119 tcg_gen_vec_add16_i64(d, d, a); 120 } 121 122 static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) 123 { 124 tcg_gen_sari_i32(a, a, shift); 125 tcg_gen_add_i32(d, d, a); 126 } 127 128 static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 129 { 130 tcg_gen_sari_i64(a, a, shift); 131 tcg_gen_add_i64(d, d, a); 132 } 133 134 static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 135 { 136 tcg_gen_sari_vec(vece, a, a, sh); 137 tcg_gen_add_vec(vece, d, d, a); 138 } 139 140 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 141 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 142 { 143 static const TCGOpcode vecop_list[] = { 144 INDEX_op_sari_vec, INDEX_op_add_vec, 0 145 }; 146 static const GVecGen2i ops[4] = { 147 { .fni8 = gen_ssra8_i64, 148 .fniv = gen_ssra_vec, 149 .fno = gen_helper_gvec_ssra_b, 150 .load_dest = true, 151 .opt_opc = vecop_list, 152 .vece = MO_8 }, 153 { .fni8 = gen_ssra16_i64, 154 .fniv = gen_ssra_vec, 155 .fno = gen_helper_gvec_ssra_h, 156 .load_dest = true, 157 .opt_opc = vecop_list, 158 .vece = MO_16 }, 159 { .fni4 = gen_ssra32_i32, 160 .fniv = gen_ssra_vec, 161 .fno = gen_helper_gvec_ssra_s, 162 .load_dest = true, 163 .opt_opc = vecop_list, 164 .vece = MO_32 }, 165 { .fni8 = gen_ssra64_i64, 166 .fniv = gen_ssra_vec, 167 .fno = gen_helper_gvec_ssra_d, 168 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 169 .opt_opc = vecop_list, 170 .load_dest = true, 171 .vece = MO_64 }, 172 }; 173 174 /* tszimm encoding produces immediates in the range [1..esize]. */ 175 tcg_debug_assert(shift > 0); 176 tcg_debug_assert(shift <= (8 << vece)); 177 178 /* 179 * Shifts larger than the element size are architecturally valid. 180 * Signed results in all sign bits. 
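     * Clamping to esize - 1 below yields exactly that: e.g. for MO_8,
     * an arithmetic shift right by 7 leaves each element as 0 or -1.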
181 */ 182 shift = MIN(shift, (8 << vece) - 1); 183 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 184 } 185 186 static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 187 { 188 tcg_gen_vec_shr8i_i64(a, a, shift); 189 tcg_gen_vec_add8_i64(d, d, a); 190 } 191 192 static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 193 { 194 tcg_gen_vec_shr16i_i64(a, a, shift); 195 tcg_gen_vec_add16_i64(d, d, a); 196 } 197 198 static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) 199 { 200 tcg_gen_shri_i32(a, a, shift); 201 tcg_gen_add_i32(d, d, a); 202 } 203 204 static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 205 { 206 tcg_gen_shri_i64(a, a, shift); 207 tcg_gen_add_i64(d, d, a); 208 } 209 210 static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 211 { 212 tcg_gen_shri_vec(vece, a, a, sh); 213 tcg_gen_add_vec(vece, d, d, a); 214 } 215 216 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 217 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 218 { 219 static const TCGOpcode vecop_list[] = { 220 INDEX_op_shri_vec, INDEX_op_add_vec, 0 221 }; 222 static const GVecGen2i ops[4] = { 223 { .fni8 = gen_usra8_i64, 224 .fniv = gen_usra_vec, 225 .fno = gen_helper_gvec_usra_b, 226 .load_dest = true, 227 .opt_opc = vecop_list, 228 .vece = MO_8, }, 229 { .fni8 = gen_usra16_i64, 230 .fniv = gen_usra_vec, 231 .fno = gen_helper_gvec_usra_h, 232 .load_dest = true, 233 .opt_opc = vecop_list, 234 .vece = MO_16, }, 235 { .fni4 = gen_usra32_i32, 236 .fniv = gen_usra_vec, 237 .fno = gen_helper_gvec_usra_s, 238 .load_dest = true, 239 .opt_opc = vecop_list, 240 .vece = MO_32, }, 241 { .fni8 = gen_usra64_i64, 242 .fniv = gen_usra_vec, 243 .fno = gen_helper_gvec_usra_d, 244 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 245 .load_dest = true, 246 .opt_opc = vecop_list, 247 .vece = MO_64, }, 248 }; 249 250 /* tszimm encoding produces immediates in the range [1..esize]. */ 251 tcg_debug_assert(shift > 0); 252 tcg_debug_assert(shift <= (8 << vece)); 253 254 /* 255 * Shifts larger than the element size are architecturally valid. 256 * Unsigned results in all zeros as input to accumulate: nop. 257 */ 258 if (shift < (8 << vece)) { 259 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 260 } else { 261 /* Nop, but we do need to clear the tail. */ 262 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); 263 } 264 } 265 266 /* 267 * Shift one less than the requested amount, and the low bit is 268 * the rounding bit. For the 8 and 16-bit operations, because we 269 * mask the low bit, we can perform a normal integer shift instead 270 * of a vector shift. 
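 * E.g. a rounding shift right by N computes (x >> N) + ((x >> (N - 1)) & 1),
 * which equals the architectural ((x + (1 << (N - 1))) >> N).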
271 */ 272 static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 273 { 274 TCGv_i64 t = tcg_temp_new_i64(); 275 276 tcg_gen_shri_i64(t, a, sh - 1); 277 tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); 278 tcg_gen_vec_sar8i_i64(d, a, sh); 279 tcg_gen_vec_add8_i64(d, d, t); 280 } 281 282 static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 283 { 284 TCGv_i64 t = tcg_temp_new_i64(); 285 286 tcg_gen_shri_i64(t, a, sh - 1); 287 tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); 288 tcg_gen_vec_sar16i_i64(d, a, sh); 289 tcg_gen_vec_add16_i64(d, d, t); 290 } 291 292 void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) 293 { 294 TCGv_i32 t; 295 296 /* Handle shift by the input size for the benefit of trans_SRSHR_ri */ 297 if (sh == 32) { 298 tcg_gen_movi_i32(d, 0); 299 return; 300 } 301 t = tcg_temp_new_i32(); 302 tcg_gen_extract_i32(t, a, sh - 1, 1); 303 tcg_gen_sari_i32(d, a, sh); 304 tcg_gen_add_i32(d, d, t); 305 } 306 307 void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 308 { 309 TCGv_i64 t = tcg_temp_new_i64(); 310 311 tcg_gen_extract_i64(t, a, sh - 1, 1); 312 tcg_gen_sari_i64(d, a, sh); 313 tcg_gen_add_i64(d, d, t); 314 } 315 316 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 317 { 318 TCGv_vec t = tcg_temp_new_vec_matching(d); 319 TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1); 320 321 tcg_gen_shri_vec(vece, t, a, sh - 1); 322 tcg_gen_and_vec(vece, t, t, ones); 323 tcg_gen_sari_vec(vece, d, a, sh); 324 tcg_gen_add_vec(vece, d, d, t); 325 } 326 327 void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 328 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 329 { 330 static const TCGOpcode vecop_list[] = { 331 INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 332 }; 333 static const GVecGen2i ops[4] = { 334 { .fni8 = gen_srshr8_i64, 335 .fniv = gen_srshr_vec, 336 .fno = gen_helper_gvec_srshr_b, 337 .opt_opc = vecop_list, 338 .vece = MO_8 }, 339 { .fni8 = gen_srshr16_i64, 340 .fniv = gen_srshr_vec, 341 .fno = gen_helper_gvec_srshr_h, 342 .opt_opc = vecop_list, 343 .vece = MO_16 }, 344 { .fni4 = gen_srshr32_i32, 345 .fniv = gen_srshr_vec, 346 .fno = gen_helper_gvec_srshr_s, 347 .opt_opc = vecop_list, 348 .vece = MO_32 }, 349 { .fni8 = gen_srshr64_i64, 350 .fniv = gen_srshr_vec, 351 .fno = gen_helper_gvec_srshr_d, 352 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 353 .opt_opc = vecop_list, 354 .vece = MO_64 }, 355 }; 356 357 /* tszimm encoding produces immediates in the range [1..esize] */ 358 tcg_debug_assert(shift > 0); 359 tcg_debug_assert(shift <= (8 << vece)); 360 361 if (shift == (8 << vece)) { 362 /* 363 * Shifts larger than the element size are architecturally valid. 364 * Signed results in all sign bits. With rounding, this produces 365 * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. 366 * I.e. always zero. 
367 */ 368 tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0); 369 } else { 370 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 371 } 372 } 373 374 static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 375 { 376 TCGv_i64 t = tcg_temp_new_i64(); 377 378 gen_srshr8_i64(t, a, sh); 379 tcg_gen_vec_add8_i64(d, d, t); 380 } 381 382 static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 383 { 384 TCGv_i64 t = tcg_temp_new_i64(); 385 386 gen_srshr16_i64(t, a, sh); 387 tcg_gen_vec_add16_i64(d, d, t); 388 } 389 390 static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) 391 { 392 TCGv_i32 t = tcg_temp_new_i32(); 393 394 gen_srshr32_i32(t, a, sh); 395 tcg_gen_add_i32(d, d, t); 396 } 397 398 static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 399 { 400 TCGv_i64 t = tcg_temp_new_i64(); 401 402 gen_srshr64_i64(t, a, sh); 403 tcg_gen_add_i64(d, d, t); 404 } 405 406 static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 407 { 408 TCGv_vec t = tcg_temp_new_vec_matching(d); 409 410 gen_srshr_vec(vece, t, a, sh); 411 tcg_gen_add_vec(vece, d, d, t); 412 } 413 414 void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 415 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 416 { 417 static const TCGOpcode vecop_list[] = { 418 INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0 419 }; 420 static const GVecGen2i ops[4] = { 421 { .fni8 = gen_srsra8_i64, 422 .fniv = gen_srsra_vec, 423 .fno = gen_helper_gvec_srsra_b, 424 .opt_opc = vecop_list, 425 .load_dest = true, 426 .vece = MO_8 }, 427 { .fni8 = gen_srsra16_i64, 428 .fniv = gen_srsra_vec, 429 .fno = gen_helper_gvec_srsra_h, 430 .opt_opc = vecop_list, 431 .load_dest = true, 432 .vece = MO_16 }, 433 { .fni4 = gen_srsra32_i32, 434 .fniv = gen_srsra_vec, 435 .fno = gen_helper_gvec_srsra_s, 436 .opt_opc = vecop_list, 437 .load_dest = true, 438 .vece = MO_32 }, 439 { .fni8 = gen_srsra64_i64, 440 .fniv = gen_srsra_vec, 441 .fno = gen_helper_gvec_srsra_d, 442 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 443 .opt_opc = vecop_list, 444 .load_dest = true, 445 .vece = MO_64 }, 446 }; 447 448 /* tszimm encoding produces immediates in the range [1..esize] */ 449 tcg_debug_assert(shift > 0); 450 tcg_debug_assert(shift <= (8 << vece)); 451 452 /* 453 * Shifts larger than the element size are architecturally valid. 454 * Signed results in all sign bits. With rounding, this produces 455 * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0. 456 * I.e. always zero. With accumulation, this leaves D unchanged. 457 */ 458 if (shift == (8 << vece)) { 459 /* Nop, but we do need to clear the tail. 
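         * (Moving rd onto itself leaves the body unchanged but still
         * zeroes the bytes between opr_sz and max_sz.)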
*/ 460 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); 461 } else { 462 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 463 } 464 } 465 466 static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 467 { 468 TCGv_i64 t = tcg_temp_new_i64(); 469 470 tcg_gen_shri_i64(t, a, sh - 1); 471 tcg_gen_andi_i64(t, t, dup_const(MO_8, 1)); 472 tcg_gen_vec_shr8i_i64(d, a, sh); 473 tcg_gen_vec_add8_i64(d, d, t); 474 } 475 476 static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 477 { 478 TCGv_i64 t = tcg_temp_new_i64(); 479 480 tcg_gen_shri_i64(t, a, sh - 1); 481 tcg_gen_andi_i64(t, t, dup_const(MO_16, 1)); 482 tcg_gen_vec_shr16i_i64(d, a, sh); 483 tcg_gen_vec_add16_i64(d, d, t); 484 } 485 486 void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) 487 { 488 TCGv_i32 t; 489 490 /* Handle shift by the input size for the benefit of trans_URSHR_ri */ 491 if (sh == 32) { 492 tcg_gen_extract_i32(d, a, sh - 1, 1); 493 return; 494 } 495 t = tcg_temp_new_i32(); 496 tcg_gen_extract_i32(t, a, sh - 1, 1); 497 tcg_gen_shri_i32(d, a, sh); 498 tcg_gen_add_i32(d, d, t); 499 } 500 501 void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 502 { 503 TCGv_i64 t = tcg_temp_new_i64(); 504 505 tcg_gen_extract_i64(t, a, sh - 1, 1); 506 tcg_gen_shri_i64(d, a, sh); 507 tcg_gen_add_i64(d, d, t); 508 } 509 510 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift) 511 { 512 TCGv_vec t = tcg_temp_new_vec_matching(d); 513 TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1); 514 515 tcg_gen_shri_vec(vece, t, a, shift - 1); 516 tcg_gen_and_vec(vece, t, t, ones); 517 tcg_gen_shri_vec(vece, d, a, shift); 518 tcg_gen_add_vec(vece, d, d, t); 519 } 520 521 void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 522 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 523 { 524 static const TCGOpcode vecop_list[] = { 525 INDEX_op_shri_vec, INDEX_op_add_vec, 0 526 }; 527 static const GVecGen2i ops[4] = { 528 { .fni8 = gen_urshr8_i64, 529 .fniv = gen_urshr_vec, 530 .fno = gen_helper_gvec_urshr_b, 531 .opt_opc = vecop_list, 532 .vece = MO_8 }, 533 { .fni8 = gen_urshr16_i64, 534 .fniv = gen_urshr_vec, 535 .fno = gen_helper_gvec_urshr_h, 536 .opt_opc = vecop_list, 537 .vece = MO_16 }, 538 { .fni4 = gen_urshr32_i32, 539 .fniv = gen_urshr_vec, 540 .fno = gen_helper_gvec_urshr_s, 541 .opt_opc = vecop_list, 542 .vece = MO_32 }, 543 { .fni8 = gen_urshr64_i64, 544 .fniv = gen_urshr_vec, 545 .fno = gen_helper_gvec_urshr_d, 546 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 547 .opt_opc = vecop_list, 548 .vece = MO_64 }, 549 }; 550 551 /* tszimm encoding produces immediates in the range [1..esize] */ 552 tcg_debug_assert(shift > 0); 553 tcg_debug_assert(shift <= (8 << vece)); 554 555 if (shift == (8 << vece)) { 556 /* 557 * Shifts larger than the element size are architecturally valid. 558 * Unsigned results in zero. With rounding, this produces a 559 * copy of the most significant bit. 
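         * That is, (x + (1 << (esize - 1))) >> esize == x >> (esize - 1),
         * which is the single unsigned shift emitted below.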
560 */ 561 tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz); 562 } else { 563 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 564 } 565 } 566 567 static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 568 { 569 TCGv_i64 t = tcg_temp_new_i64(); 570 571 if (sh == 8) { 572 tcg_gen_vec_shr8i_i64(t, a, 7); 573 } else { 574 gen_urshr8_i64(t, a, sh); 575 } 576 tcg_gen_vec_add8_i64(d, d, t); 577 } 578 579 static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 580 { 581 TCGv_i64 t = tcg_temp_new_i64(); 582 583 if (sh == 16) { 584 tcg_gen_vec_shr16i_i64(t, a, 15); 585 } else { 586 gen_urshr16_i64(t, a, sh); 587 } 588 tcg_gen_vec_add16_i64(d, d, t); 589 } 590 591 static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh) 592 { 593 TCGv_i32 t = tcg_temp_new_i32(); 594 595 if (sh == 32) { 596 tcg_gen_shri_i32(t, a, 31); 597 } else { 598 gen_urshr32_i32(t, a, sh); 599 } 600 tcg_gen_add_i32(d, d, t); 601 } 602 603 static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh) 604 { 605 TCGv_i64 t = tcg_temp_new_i64(); 606 607 if (sh == 64) { 608 tcg_gen_shri_i64(t, a, 63); 609 } else { 610 gen_urshr64_i64(t, a, sh); 611 } 612 tcg_gen_add_i64(d, d, t); 613 } 614 615 static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 616 { 617 TCGv_vec t = tcg_temp_new_vec_matching(d); 618 619 if (sh == (8 << vece)) { 620 tcg_gen_shri_vec(vece, t, a, sh - 1); 621 } else { 622 gen_urshr_vec(vece, t, a, sh); 623 } 624 tcg_gen_add_vec(vece, d, d, t); 625 } 626 627 void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 628 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 629 { 630 static const TCGOpcode vecop_list[] = { 631 INDEX_op_shri_vec, INDEX_op_add_vec, 0 632 }; 633 static const GVecGen2i ops[4] = { 634 { .fni8 = gen_ursra8_i64, 635 .fniv = gen_ursra_vec, 636 .fno = gen_helper_gvec_ursra_b, 637 .opt_opc = vecop_list, 638 .load_dest = true, 639 .vece = MO_8 }, 640 { .fni8 = gen_ursra16_i64, 641 .fniv = gen_ursra_vec, 642 .fno = gen_helper_gvec_ursra_h, 643 .opt_opc = vecop_list, 644 .load_dest = true, 645 .vece = MO_16 }, 646 { .fni4 = gen_ursra32_i32, 647 .fniv = gen_ursra_vec, 648 .fno = gen_helper_gvec_ursra_s, 649 .opt_opc = vecop_list, 650 .load_dest = true, 651 .vece = MO_32 }, 652 { .fni8 = gen_ursra64_i64, 653 .fniv = gen_ursra_vec, 654 .fno = gen_helper_gvec_ursra_d, 655 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 656 .opt_opc = vecop_list, 657 .load_dest = true, 658 .vece = MO_64 }, 659 }; 660 661 /* tszimm encoding produces immediates in the range [1..esize] */ 662 tcg_debug_assert(shift > 0); 663 tcg_debug_assert(shift <= (8 << vece)); 664 665 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 666 } 667 668 static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 669 { 670 uint64_t mask = dup_const(MO_8, 0xff >> shift); 671 TCGv_i64 t = tcg_temp_new_i64(); 672 673 tcg_gen_shri_i64(t, a, shift); 674 tcg_gen_andi_i64(t, t, mask); 675 tcg_gen_andi_i64(d, d, ~mask); 676 tcg_gen_or_i64(d, d, t); 677 } 678 679 static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 680 { 681 uint64_t mask = dup_const(MO_16, 0xffff >> shift); 682 TCGv_i64 t = tcg_temp_new_i64(); 683 684 tcg_gen_shri_i64(t, a, shift); 685 tcg_gen_andi_i64(t, t, mask); 686 tcg_gen_andi_i64(d, d, ~mask); 687 tcg_gen_or_i64(d, d, t); 688 } 689 690 static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) 691 { 692 tcg_gen_shri_i32(a, a, shift); 693 tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); 694 } 695 696 
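/*
 * For 32-bit and 64-bit elements, the deposit keeps the top 'shift' bits
 * of the destination and inserts the shifted source into the low bits.
 */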
static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 697 { 698 tcg_gen_shri_i64(a, a, shift); 699 tcg_gen_deposit_i64(d, d, a, 0, 64 - shift); 700 } 701 702 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 703 { 704 TCGv_vec t = tcg_temp_new_vec_matching(d); 705 int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh); 706 TCGv_vec m = tcg_constant_vec_matching(d, vece, mi); 707 708 tcg_gen_shri_vec(vece, t, a, sh); 709 tcg_gen_and_vec(vece, d, d, m); 710 tcg_gen_or_vec(vece, d, d, t); 711 } 712 713 void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 714 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 715 { 716 static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 }; 717 const GVecGen2i ops[4] = { 718 { .fni8 = gen_shr8_ins_i64, 719 .fniv = gen_shr_ins_vec, 720 .fno = gen_helper_gvec_sri_b, 721 .load_dest = true, 722 .opt_opc = vecop_list, 723 .vece = MO_8 }, 724 { .fni8 = gen_shr16_ins_i64, 725 .fniv = gen_shr_ins_vec, 726 .fno = gen_helper_gvec_sri_h, 727 .load_dest = true, 728 .opt_opc = vecop_list, 729 .vece = MO_16 }, 730 { .fni4 = gen_shr32_ins_i32, 731 .fniv = gen_shr_ins_vec, 732 .fno = gen_helper_gvec_sri_s, 733 .load_dest = true, 734 .opt_opc = vecop_list, 735 .vece = MO_32 }, 736 { .fni8 = gen_shr64_ins_i64, 737 .fniv = gen_shr_ins_vec, 738 .fno = gen_helper_gvec_sri_d, 739 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 740 .load_dest = true, 741 .opt_opc = vecop_list, 742 .vece = MO_64 }, 743 }; 744 745 /* tszimm encoding produces immediates in the range [1..esize]. */ 746 tcg_debug_assert(shift > 0); 747 tcg_debug_assert(shift <= (8 << vece)); 748 749 /* Shift of esize leaves destination unchanged. */ 750 if (shift < (8 << vece)) { 751 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 752 } else { 753 /* Nop, but we do need to clear the tail. 
*/ 754 tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz); 755 } 756 } 757 758 static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 759 { 760 uint64_t mask = dup_const(MO_8, 0xff << shift); 761 TCGv_i64 t = tcg_temp_new_i64(); 762 763 tcg_gen_shli_i64(t, a, shift); 764 tcg_gen_andi_i64(t, t, mask); 765 tcg_gen_andi_i64(d, d, ~mask); 766 tcg_gen_or_i64(d, d, t); 767 } 768 769 static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 770 { 771 uint64_t mask = dup_const(MO_16, 0xffff << shift); 772 TCGv_i64 t = tcg_temp_new_i64(); 773 774 tcg_gen_shli_i64(t, a, shift); 775 tcg_gen_andi_i64(t, t, mask); 776 tcg_gen_andi_i64(d, d, ~mask); 777 tcg_gen_or_i64(d, d, t); 778 } 779 780 static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) 781 { 782 tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); 783 } 784 785 static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) 786 { 787 tcg_gen_deposit_i64(d, d, a, shift, 64 - shift); 788 } 789 790 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) 791 { 792 TCGv_vec t = tcg_temp_new_vec_matching(d); 793 TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh)); 794 795 tcg_gen_shli_vec(vece, t, a, sh); 796 tcg_gen_and_vec(vece, d, d, m); 797 tcg_gen_or_vec(vece, d, d, t); 798 } 799 800 void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs, 801 int64_t shift, uint32_t opr_sz, uint32_t max_sz) 802 { 803 static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 }; 804 const GVecGen2i ops[4] = { 805 { .fni8 = gen_shl8_ins_i64, 806 .fniv = gen_shl_ins_vec, 807 .fno = gen_helper_gvec_sli_b, 808 .load_dest = true, 809 .opt_opc = vecop_list, 810 .vece = MO_8 }, 811 { .fni8 = gen_shl16_ins_i64, 812 .fniv = gen_shl_ins_vec, 813 .fno = gen_helper_gvec_sli_h, 814 .load_dest = true, 815 .opt_opc = vecop_list, 816 .vece = MO_16 }, 817 { .fni4 = gen_shl32_ins_i32, 818 .fniv = gen_shl_ins_vec, 819 .fno = gen_helper_gvec_sli_s, 820 .load_dest = true, 821 .opt_opc = vecop_list, 822 .vece = MO_32 }, 823 { .fni8 = gen_shl64_ins_i64, 824 .fniv = gen_shl_ins_vec, 825 .fno = gen_helper_gvec_sli_d, 826 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 827 .load_dest = true, 828 .opt_opc = vecop_list, 829 .vece = MO_64 }, 830 }; 831 832 /* tszimm encoding produces immediates in the range [0..esize-1]. 
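     * A zero shift keeps no destination bits, so SLI #0 degenerates to
     * a plain copy of the source, special-cased below.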
*/ 833 tcg_debug_assert(shift >= 0); 834 tcg_debug_assert(shift < (8 << vece)); 835 836 if (shift == 0) { 837 tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz); 838 } else { 839 tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]); 840 } 841 } 842 843 static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 844 { 845 gen_helper_neon_mul_u8(a, a, b); 846 gen_helper_neon_add_u8(d, d, a); 847 } 848 849 static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 850 { 851 gen_helper_neon_mul_u8(a, a, b); 852 gen_helper_neon_sub_u8(d, d, a); 853 } 854 855 static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 856 { 857 gen_helper_neon_mul_u16(a, a, b); 858 gen_helper_neon_add_u16(d, d, a); 859 } 860 861 static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 862 { 863 gen_helper_neon_mul_u16(a, a, b); 864 gen_helper_neon_sub_u16(d, d, a); 865 } 866 867 static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 868 { 869 tcg_gen_mul_i32(a, a, b); 870 tcg_gen_add_i32(d, d, a); 871 } 872 873 static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 874 { 875 tcg_gen_mul_i32(a, a, b); 876 tcg_gen_sub_i32(d, d, a); 877 } 878 879 static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 880 { 881 tcg_gen_mul_i64(a, a, b); 882 tcg_gen_add_i64(d, d, a); 883 } 884 885 static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 886 { 887 tcg_gen_mul_i64(a, a, b); 888 tcg_gen_sub_i64(d, d, a); 889 } 890 891 static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 892 { 893 tcg_gen_mul_vec(vece, a, a, b); 894 tcg_gen_add_vec(vece, d, d, a); 895 } 896 897 static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 898 { 899 tcg_gen_mul_vec(vece, a, a, b); 900 tcg_gen_sub_vec(vece, d, d, a); 901 } 902 903 /* Note that while NEON does not support VMLA and VMLS as 64-bit ops, 904 * these tables are shared with AArch64 which does support them. 
905 */ 906 void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 907 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 908 { 909 static const TCGOpcode vecop_list[] = { 910 INDEX_op_mul_vec, INDEX_op_add_vec, 0 911 }; 912 static const GVecGen3 ops[4] = { 913 { .fni4 = gen_mla8_i32, 914 .fniv = gen_mla_vec, 915 .load_dest = true, 916 .opt_opc = vecop_list, 917 .vece = MO_8 }, 918 { .fni4 = gen_mla16_i32, 919 .fniv = gen_mla_vec, 920 .load_dest = true, 921 .opt_opc = vecop_list, 922 .vece = MO_16 }, 923 { .fni4 = gen_mla32_i32, 924 .fniv = gen_mla_vec, 925 .load_dest = true, 926 .opt_opc = vecop_list, 927 .vece = MO_32 }, 928 { .fni8 = gen_mla64_i64, 929 .fniv = gen_mla_vec, 930 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 931 .load_dest = true, 932 .opt_opc = vecop_list, 933 .vece = MO_64 }, 934 }; 935 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 936 } 937 938 void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 939 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 940 { 941 static const TCGOpcode vecop_list[] = { 942 INDEX_op_mul_vec, INDEX_op_sub_vec, 0 943 }; 944 static const GVecGen3 ops[4] = { 945 { .fni4 = gen_mls8_i32, 946 .fniv = gen_mls_vec, 947 .load_dest = true, 948 .opt_opc = vecop_list, 949 .vece = MO_8 }, 950 { .fni4 = gen_mls16_i32, 951 .fniv = gen_mls_vec, 952 .load_dest = true, 953 .opt_opc = vecop_list, 954 .vece = MO_16 }, 955 { .fni4 = gen_mls32_i32, 956 .fniv = gen_mls_vec, 957 .load_dest = true, 958 .opt_opc = vecop_list, 959 .vece = MO_32 }, 960 { .fni8 = gen_mls64_i64, 961 .fniv = gen_mls_vec, 962 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 963 .load_dest = true, 964 .opt_opc = vecop_list, 965 .vece = MO_64 }, 966 }; 967 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 968 } 969 970 /* CMTST : test is "if (X & Y != 0)". */ 971 static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 972 { 973 tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b); 974 } 975 976 void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 977 { 978 tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b); 979 } 980 981 static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 982 { 983 tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b); 984 } 985 986 void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 987 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 988 { 989 static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 }; 990 static const GVecGen3 ops[4] = { 991 { .fni4 = gen_helper_neon_tst_u8, 992 .fniv = gen_cmtst_vec, 993 .opt_opc = vecop_list, 994 .vece = MO_8 }, 995 { .fni4 = gen_helper_neon_tst_u16, 996 .fniv = gen_cmtst_vec, 997 .opt_opc = vecop_list, 998 .vece = MO_16 }, 999 { .fni4 = gen_cmtst_i32, 1000 .fniv = gen_cmtst_vec, 1001 .opt_opc = vecop_list, 1002 .vece = MO_32 }, 1003 { .fni8 = gen_cmtst_i64, 1004 .fniv = gen_cmtst_vec, 1005 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1006 .opt_opc = vecop_list, 1007 .vece = MO_64 }, 1008 }; 1009 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1010 } 1011 1012 void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) 1013 { 1014 TCGv_i32 lval = tcg_temp_new_i32(); 1015 TCGv_i32 rval = tcg_temp_new_i32(); 1016 TCGv_i32 lsh = tcg_temp_new_i32(); 1017 TCGv_i32 rsh = tcg_temp_new_i32(); 1018 TCGv_i32 zero = tcg_constant_i32(0); 1019 TCGv_i32 max = tcg_constant_i32(32); 1020 1021 /* 1022 * Rely on the TCG guarantee that out of range shifts produce 1023 * unspecified results, not undefined behaviour (i.e. no trap). 
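     * (An x86 host, for instance, masks the shift count to the operand
     * width, so lval/rval may be garbage for counts >= 32.)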
1024 * Discard out-of-range results after the fact. 1025 */ 1026 tcg_gen_ext8s_i32(lsh, shift); 1027 tcg_gen_neg_i32(rsh, lsh); 1028 tcg_gen_shl_i32(lval, src, lsh); 1029 tcg_gen_shr_i32(rval, src, rsh); 1030 tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero); 1031 tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst); 1032 } 1033 1034 void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) 1035 { 1036 TCGv_i64 lval = tcg_temp_new_i64(); 1037 TCGv_i64 rval = tcg_temp_new_i64(); 1038 TCGv_i64 lsh = tcg_temp_new_i64(); 1039 TCGv_i64 rsh = tcg_temp_new_i64(); 1040 TCGv_i64 zero = tcg_constant_i64(0); 1041 TCGv_i64 max = tcg_constant_i64(64); 1042 1043 /* 1044 * Rely on the TCG guarantee that out of range shifts produce 1045 * unspecified results, not undefined behaviour (i.e. no trap). 1046 * Discard out-of-range results after the fact. 1047 */ 1048 tcg_gen_ext8s_i64(lsh, shift); 1049 tcg_gen_neg_i64(rsh, lsh); 1050 tcg_gen_shl_i64(lval, src, lsh); 1051 tcg_gen_shr_i64(rval, src, rsh); 1052 tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero); 1053 tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst); 1054 } 1055 1056 static void gen_ushl_vec(unsigned vece, TCGv_vec dst, 1057 TCGv_vec src, TCGv_vec shift) 1058 { 1059 TCGv_vec lval = tcg_temp_new_vec_matching(dst); 1060 TCGv_vec rval = tcg_temp_new_vec_matching(dst); 1061 TCGv_vec lsh = tcg_temp_new_vec_matching(dst); 1062 TCGv_vec rsh = tcg_temp_new_vec_matching(dst); 1063 TCGv_vec max, zero; 1064 1065 tcg_gen_neg_vec(vece, rsh, shift); 1066 if (vece == MO_8) { 1067 tcg_gen_mov_vec(lsh, shift); 1068 } else { 1069 TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff); 1070 tcg_gen_and_vec(vece, lsh, shift, msk); 1071 tcg_gen_and_vec(vece, rsh, rsh, msk); 1072 } 1073 1074 /* 1075 * Rely on the TCG guarantee that out of range shifts produce 1076 * unspecified results, not undefined behaviour (i.e. no trap). 1077 * Discard out-of-range results after the fact. 1078 */ 1079 tcg_gen_shlv_vec(vece, lval, src, lsh); 1080 tcg_gen_shrv_vec(vece, rval, src, rsh); 1081 1082 /* 1083 * The choice of GE (signed) and GEU (unsigned) are biased toward 1084 * the instructions of the x86_64 host. For MO_8, the whole byte 1085 * is significant so we must use an unsigned compare; otherwise we 1086 * have already masked to a byte and so a signed compare works. 1087 * Other tcg hosts have a full set of comparisons and do not care. 
1088 */ 1089 zero = tcg_constant_vec_matching(dst, vece, 0); 1090 max = tcg_constant_vec_matching(dst, vece, 8 << vece); 1091 if (vece == MO_8) { 1092 tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval); 1093 tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval); 1094 } else { 1095 tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval); 1096 tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval); 1097 } 1098 tcg_gen_or_vec(vece, dst, lval, rval); 1099 } 1100 1101 void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1102 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1103 { 1104 static const TCGOpcode vecop_list[] = { 1105 INDEX_op_neg_vec, INDEX_op_shlv_vec, 1106 INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0 1107 }; 1108 static const GVecGen3 ops[4] = { 1109 { .fniv = gen_ushl_vec, 1110 .fno = gen_helper_gvec_ushl_b, 1111 .opt_opc = vecop_list, 1112 .vece = MO_8 }, 1113 { .fniv = gen_ushl_vec, 1114 .fno = gen_helper_gvec_ushl_h, 1115 .opt_opc = vecop_list, 1116 .vece = MO_16 }, 1117 { .fni4 = gen_ushl_i32, 1118 .fniv = gen_ushl_vec, 1119 .opt_opc = vecop_list, 1120 .vece = MO_32 }, 1121 { .fni8 = gen_ushl_i64, 1122 .fniv = gen_ushl_vec, 1123 .opt_opc = vecop_list, 1124 .vece = MO_64 }, 1125 }; 1126 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1127 } 1128 1129 void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift) 1130 { 1131 TCGv_i32 lval = tcg_temp_new_i32(); 1132 TCGv_i32 rval = tcg_temp_new_i32(); 1133 TCGv_i32 lsh = tcg_temp_new_i32(); 1134 TCGv_i32 rsh = tcg_temp_new_i32(); 1135 TCGv_i32 zero = tcg_constant_i32(0); 1136 TCGv_i32 max = tcg_constant_i32(31); 1137 1138 /* 1139 * Rely on the TCG guarantee that out of range shifts produce 1140 * unspecified results, not undefined behaviour (i.e. no trap). 1141 * Discard out-of-range results after the fact. 1142 */ 1143 tcg_gen_ext8s_i32(lsh, shift); 1144 tcg_gen_neg_i32(rsh, lsh); 1145 tcg_gen_shl_i32(lval, src, lsh); 1146 tcg_gen_umin_i32(rsh, rsh, max); 1147 tcg_gen_sar_i32(rval, src, rsh); 1148 tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero); 1149 tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval); 1150 } 1151 1152 void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift) 1153 { 1154 TCGv_i64 lval = tcg_temp_new_i64(); 1155 TCGv_i64 rval = tcg_temp_new_i64(); 1156 TCGv_i64 lsh = tcg_temp_new_i64(); 1157 TCGv_i64 rsh = tcg_temp_new_i64(); 1158 TCGv_i64 zero = tcg_constant_i64(0); 1159 TCGv_i64 max = tcg_constant_i64(63); 1160 1161 /* 1162 * Rely on the TCG guarantee that out of range shifts produce 1163 * unspecified results, not undefined behaviour (i.e. no trap). 1164 * Discard out-of-range results after the fact. 1165 */ 1166 tcg_gen_ext8s_i64(lsh, shift); 1167 tcg_gen_neg_i64(rsh, lsh); 1168 tcg_gen_shl_i64(lval, src, lsh); 1169 tcg_gen_umin_i64(rsh, rsh, max); 1170 tcg_gen_sar_i64(rval, src, rsh); 1171 tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero); 1172 tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval); 1173 } 1174 1175 static void gen_sshl_vec(unsigned vece, TCGv_vec dst, 1176 TCGv_vec src, TCGv_vec shift) 1177 { 1178 TCGv_vec lval = tcg_temp_new_vec_matching(dst); 1179 TCGv_vec rval = tcg_temp_new_vec_matching(dst); 1180 TCGv_vec lsh = tcg_temp_new_vec_matching(dst); 1181 TCGv_vec rsh = tcg_temp_new_vec_matching(dst); 1182 TCGv_vec max, zero; 1183 1184 /* 1185 * Rely on the TCG guarantee that out of range shifts produce 1186 * unspecified results, not undefined behaviour (i.e. 
no trap). 1187 * Discard out-of-range results after the fact. 1188 */ 1189 tcg_gen_neg_vec(vece, rsh, shift); 1190 if (vece == MO_8) { 1191 tcg_gen_mov_vec(lsh, shift); 1192 } else { 1193 TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff); 1194 tcg_gen_and_vec(vece, lsh, shift, msk); 1195 tcg_gen_and_vec(vece, rsh, rsh, msk); 1196 } 1197 1198 /* Bound rsh so out of bound right shift gets -1. */ 1199 max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1); 1200 tcg_gen_umin_vec(vece, rsh, rsh, max); 1201 1202 tcg_gen_shlv_vec(vece, lval, src, lsh); 1203 tcg_gen_sarv_vec(vece, rval, src, rsh); 1204 1205 /* Select in-bound left shift. */ 1206 zero = tcg_constant_vec_matching(dst, vece, 0); 1207 tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval); 1208 1209 /* Select between left and right shift. */ 1210 if (vece == MO_8) { 1211 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval); 1212 } else { 1213 TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80); 1214 tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval); 1215 } 1216 } 1217 1218 void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1219 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1220 { 1221 static const TCGOpcode vecop_list[] = { 1222 INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec, 1223 INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0 1224 }; 1225 static const GVecGen3 ops[4] = { 1226 { .fniv = gen_sshl_vec, 1227 .fno = gen_helper_gvec_sshl_b, 1228 .opt_opc = vecop_list, 1229 .vece = MO_8 }, 1230 { .fniv = gen_sshl_vec, 1231 .fno = gen_helper_gvec_sshl_h, 1232 .opt_opc = vecop_list, 1233 .vece = MO_16 }, 1234 { .fni4 = gen_sshl_i32, 1235 .fniv = gen_sshl_vec, 1236 .opt_opc = vecop_list, 1237 .vece = MO_32 }, 1238 { .fni8 = gen_sshl_i64, 1239 .fniv = gen_sshl_vec, 1240 .opt_opc = vecop_list, 1241 .vece = MO_64 }, 1242 }; 1243 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1244 } 1245 1246 void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1247 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1248 { 1249 static gen_helper_gvec_3 * const fns[] = { 1250 gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h, 1251 gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d, 1252 }; 1253 tcg_debug_assert(vece <= MO_64); 1254 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]); 1255 } 1256 1257 void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1258 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1259 { 1260 static gen_helper_gvec_3 * const fns[] = { 1261 gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h, 1262 gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d, 1263 }; 1264 tcg_debug_assert(vece <= MO_64); 1265 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]); 1266 } 1267 1268 void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1269 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1270 { 1271 static gen_helper_gvec_3_ptr * const fns[] = { 1272 gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h, 1273 gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d, 1274 }; 1275 tcg_debug_assert(vece <= MO_64); 1276 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env, 1277 opr_sz, max_sz, 0, fns[vece]); 1278 } 1279 1280 void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1281 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1282 { 1283 static gen_helper_gvec_3_ptr * const fns[] = { 1284 gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h, 1285 gen_helper_neon_uqshl_s, 
gen_helper_neon_uqshl_d, 1286 }; 1287 tcg_debug_assert(vece <= MO_64); 1288 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env, 1289 opr_sz, max_sz, 0, fns[vece]); 1290 } 1291 1292 void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1293 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1294 { 1295 static gen_helper_gvec_3_ptr * const fns[] = { 1296 gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h, 1297 gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d, 1298 }; 1299 tcg_debug_assert(vece <= MO_64); 1300 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env, 1301 opr_sz, max_sz, 0, fns[vece]); 1302 } 1303 1304 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1305 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1306 { 1307 static gen_helper_gvec_3_ptr * const fns[] = { 1308 gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h, 1309 gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d, 1310 }; 1311 tcg_debug_assert(vece <= MO_64); 1312 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env, 1313 opr_sz, max_sz, 0, fns[vece]); 1314 } 1315 1316 void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1317 int64_t c, uint32_t opr_sz, uint32_t max_sz) 1318 { 1319 static gen_helper_gvec_2_ptr * const fns[] = { 1320 gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h, 1321 gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d, 1322 }; 1323 tcg_debug_assert(vece <= MO_64); 1324 tcg_debug_assert(c >= 0 && c <= (8 << vece)); 1325 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]); 1326 } 1327 1328 void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1329 int64_t c, uint32_t opr_sz, uint32_t max_sz) 1330 { 1331 static gen_helper_gvec_2_ptr * const fns[] = { 1332 gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h, 1333 gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d, 1334 }; 1335 tcg_debug_assert(vece <= MO_64); 1336 tcg_debug_assert(c >= 0 && c <= (8 << vece)); 1337 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]); 1338 } 1339 1340 void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1341 int64_t c, uint32_t opr_sz, uint32_t max_sz) 1342 { 1343 static gen_helper_gvec_2_ptr * const fns[] = { 1344 gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h, 1345 gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d, 1346 }; 1347 tcg_debug_assert(vece <= MO_64); 1348 tcg_debug_assert(c >= 0 && c <= (8 << vece)); 1349 tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]); 1350 } 1351 1352 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz) 1353 { 1354 uint64_t max = MAKE_64BIT_MASK(0, 8 << esz); 1355 TCGv_i64 tmp = tcg_temp_new_i64(); 1356 1357 tcg_gen_add_i64(tmp, a, b); 1358 tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max)); 1359 tcg_gen_xor_i64(tmp, tmp, res); 1360 tcg_gen_or_i64(qc, qc, tmp); 1361 } 1362 1363 void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b) 1364 { 1365 TCGv_i64 t = tcg_temp_new_i64(); 1366 1367 tcg_gen_add_i64(t, a, b); 1368 tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a, 1369 tcg_constant_i64(UINT64_MAX), t); 1370 tcg_gen_xor_i64(t, t, res); 1371 tcg_gen_or_i64(qc, qc, t); 1372 } 1373 1374 static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc, 1375 TCGv_vec a, TCGv_vec b) 1376 { 1377 TCGv_vec x = tcg_temp_new_vec_matching(t); 1378 tcg_gen_add_vec(vece, x, a, b); 1379 tcg_gen_usadd_vec(vece, t, a, b); 1380 tcg_gen_xor_vec(vece, x, x, t); 1381 tcg_gen_or_vec(vece, qc, qc, x); 1382 } 1383 1384 
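/*
 * Saturation is detected by computing both the wrapping and the saturating
 * result and ORing the XOR of the two into QC: any difference between the
 * two means the operation saturated.
 */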
void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1385 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1386 { 1387 static const TCGOpcode vecop_list[] = { 1388 INDEX_op_usadd_vec, INDEX_op_add_vec, 0 1389 }; 1390 static const GVecGen4 ops[4] = { 1391 { .fniv = gen_uqadd_vec, 1392 .fno = gen_helper_gvec_uqadd_b, 1393 .write_aofs = true, 1394 .opt_opc = vecop_list, 1395 .vece = MO_8 }, 1396 { .fniv = gen_uqadd_vec, 1397 .fno = gen_helper_gvec_uqadd_h, 1398 .write_aofs = true, 1399 .opt_opc = vecop_list, 1400 .vece = MO_16 }, 1401 { .fniv = gen_uqadd_vec, 1402 .fno = gen_helper_gvec_uqadd_s, 1403 .write_aofs = true, 1404 .opt_opc = vecop_list, 1405 .vece = MO_32 }, 1406 { .fniv = gen_uqadd_vec, 1407 .fni8 = gen_uqadd_d, 1408 .fno = gen_helper_gvec_uqadd_d, 1409 .write_aofs = true, 1410 .opt_opc = vecop_list, 1411 .vece = MO_64 }, 1412 }; 1413 1414 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc)); 1415 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), 1416 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1417 } 1418 1419 void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz) 1420 { 1421 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1); 1422 int64_t min = -1ll - max; 1423 TCGv_i64 tmp = tcg_temp_new_i64(); 1424 1425 tcg_gen_add_i64(tmp, a, b); 1426 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max)); 1427 tcg_gen_smax_i64(res, res, tcg_constant_i64(min)); 1428 tcg_gen_xor_i64(tmp, tmp, res); 1429 tcg_gen_or_i64(qc, qc, tmp); 1430 } 1431 1432 void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b) 1433 { 1434 TCGv_i64 t0 = tcg_temp_new_i64(); 1435 TCGv_i64 t1 = tcg_temp_new_i64(); 1436 TCGv_i64 t2 = tcg_temp_new_i64(); 1437 1438 tcg_gen_add_i64(t0, a, b); 1439 1440 /* Compute signed overflow indication into T1 */ 1441 tcg_gen_xor_i64(t1, a, b); 1442 tcg_gen_xor_i64(t2, t0, a); 1443 tcg_gen_andc_i64(t1, t2, t1); 1444 1445 /* Compute saturated value into T2 */ 1446 tcg_gen_sari_i64(t2, a, 63); 1447 tcg_gen_xori_i64(t2, t2, INT64_MAX); 1448 1449 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0); 1450 tcg_gen_xor_i64(t0, t0, res); 1451 tcg_gen_or_i64(qc, qc, t0); 1452 } 1453 1454 static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc, 1455 TCGv_vec a, TCGv_vec b) 1456 { 1457 TCGv_vec x = tcg_temp_new_vec_matching(t); 1458 tcg_gen_add_vec(vece, x, a, b); 1459 tcg_gen_ssadd_vec(vece, t, a, b); 1460 tcg_gen_xor_vec(vece, x, x, t); 1461 tcg_gen_or_vec(vece, qc, qc, x); 1462 } 1463 1464 void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1465 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1466 { 1467 static const TCGOpcode vecop_list[] = { 1468 INDEX_op_ssadd_vec, INDEX_op_add_vec, 0 1469 }; 1470 static const GVecGen4 ops[4] = { 1471 { .fniv = gen_sqadd_vec, 1472 .fno = gen_helper_gvec_sqadd_b, 1473 .opt_opc = vecop_list, 1474 .write_aofs = true, 1475 .vece = MO_8 }, 1476 { .fniv = gen_sqadd_vec, 1477 .fno = gen_helper_gvec_sqadd_h, 1478 .opt_opc = vecop_list, 1479 .write_aofs = true, 1480 .vece = MO_16 }, 1481 { .fniv = gen_sqadd_vec, 1482 .fno = gen_helper_gvec_sqadd_s, 1483 .opt_opc = vecop_list, 1484 .write_aofs = true, 1485 .vece = MO_32 }, 1486 { .fniv = gen_sqadd_vec, 1487 .fni8 = gen_sqadd_d, 1488 .fno = gen_helper_gvec_sqadd_d, 1489 .opt_opc = vecop_list, 1490 .write_aofs = true, 1491 .vece = MO_64 }, 1492 }; 1493 1494 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc)); 1495 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), 1496 rn_ofs, rm_ofs, opr_sz, 
max_sz, &ops[vece]); 1497 } 1498 1499 void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz) 1500 { 1501 TCGv_i64 tmp = tcg_temp_new_i64(); 1502 1503 tcg_gen_sub_i64(tmp, a, b); 1504 tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0)); 1505 tcg_gen_xor_i64(tmp, tmp, res); 1506 tcg_gen_or_i64(qc, qc, tmp); 1507 } 1508 1509 void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b) 1510 { 1511 TCGv_i64 t = tcg_temp_new_i64(); 1512 1513 tcg_gen_sub_i64(t, a, b); 1514 tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t); 1515 tcg_gen_xor_i64(t, t, res); 1516 tcg_gen_or_i64(qc, qc, t); 1517 } 1518 1519 static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc, 1520 TCGv_vec a, TCGv_vec b) 1521 { 1522 TCGv_vec x = tcg_temp_new_vec_matching(t); 1523 tcg_gen_sub_vec(vece, x, a, b); 1524 tcg_gen_ussub_vec(vece, t, a, b); 1525 tcg_gen_xor_vec(vece, x, x, t); 1526 tcg_gen_or_vec(vece, qc, qc, x); 1527 } 1528 1529 void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1530 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1531 { 1532 static const TCGOpcode vecop_list[] = { 1533 INDEX_op_ussub_vec, INDEX_op_sub_vec, 0 1534 }; 1535 static const GVecGen4 ops[4] = { 1536 { .fniv = gen_uqsub_vec, 1537 .fno = gen_helper_gvec_uqsub_b, 1538 .opt_opc = vecop_list, 1539 .write_aofs = true, 1540 .vece = MO_8 }, 1541 { .fniv = gen_uqsub_vec, 1542 .fno = gen_helper_gvec_uqsub_h, 1543 .opt_opc = vecop_list, 1544 .write_aofs = true, 1545 .vece = MO_16 }, 1546 { .fniv = gen_uqsub_vec, 1547 .fno = gen_helper_gvec_uqsub_s, 1548 .opt_opc = vecop_list, 1549 .write_aofs = true, 1550 .vece = MO_32 }, 1551 { .fniv = gen_uqsub_vec, 1552 .fni8 = gen_uqsub_d, 1553 .fno = gen_helper_gvec_uqsub_d, 1554 .opt_opc = vecop_list, 1555 .write_aofs = true, 1556 .vece = MO_64 }, 1557 }; 1558 1559 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc)); 1560 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), 1561 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1562 } 1563 1564 void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz) 1565 { 1566 int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1); 1567 int64_t min = -1ll - max; 1568 TCGv_i64 tmp = tcg_temp_new_i64(); 1569 1570 tcg_gen_sub_i64(tmp, a, b); 1571 tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max)); 1572 tcg_gen_smax_i64(res, res, tcg_constant_i64(min)); 1573 tcg_gen_xor_i64(tmp, tmp, res); 1574 tcg_gen_or_i64(qc, qc, tmp); 1575 } 1576 1577 void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b) 1578 { 1579 TCGv_i64 t0 = tcg_temp_new_i64(); 1580 TCGv_i64 t1 = tcg_temp_new_i64(); 1581 TCGv_i64 t2 = tcg_temp_new_i64(); 1582 1583 tcg_gen_sub_i64(t0, a, b); 1584 1585 /* Compute signed overflow indication into T1 */ 1586 tcg_gen_xor_i64(t1, a, b); 1587 tcg_gen_xor_i64(t2, t0, a); 1588 tcg_gen_and_i64(t1, t1, t2); 1589 1590 /* Compute saturated value into T2 */ 1591 tcg_gen_sari_i64(t2, a, 63); 1592 tcg_gen_xori_i64(t2, t2, INT64_MAX); 1593 1594 tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0); 1595 tcg_gen_xor_i64(t0, t0, res); 1596 tcg_gen_or_i64(qc, qc, t0); 1597 } 1598 1599 static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc, 1600 TCGv_vec a, TCGv_vec b) 1601 { 1602 TCGv_vec x = tcg_temp_new_vec_matching(t); 1603 tcg_gen_sub_vec(vece, x, a, b); 1604 tcg_gen_sssub_vec(vece, t, a, b); 1605 tcg_gen_xor_vec(vece, x, x, t); 1606 tcg_gen_or_vec(vece, qc, qc, x); 1607 } 1608 1609 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t 
rn_ofs, 1610 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1611 { 1612 static const TCGOpcode vecop_list[] = { 1613 INDEX_op_sssub_vec, INDEX_op_sub_vec, 0 1614 }; 1615 static const GVecGen4 ops[4] = { 1616 { .fniv = gen_sqsub_vec, 1617 .fno = gen_helper_gvec_sqsub_b, 1618 .opt_opc = vecop_list, 1619 .write_aofs = true, 1620 .vece = MO_8 }, 1621 { .fniv = gen_sqsub_vec, 1622 .fno = gen_helper_gvec_sqsub_h, 1623 .opt_opc = vecop_list, 1624 .write_aofs = true, 1625 .vece = MO_16 }, 1626 { .fniv = gen_sqsub_vec, 1627 .fno = gen_helper_gvec_sqsub_s, 1628 .opt_opc = vecop_list, 1629 .write_aofs = true, 1630 .vece = MO_32 }, 1631 { .fniv = gen_sqsub_vec, 1632 .fni8 = gen_sqsub_d, 1633 .fno = gen_helper_gvec_sqsub_d, 1634 .opt_opc = vecop_list, 1635 .write_aofs = true, 1636 .vece = MO_64 }, 1637 }; 1638 1639 tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc)); 1640 tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc), 1641 rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1642 } 1643 1644 static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1645 { 1646 TCGv_i32 t = tcg_temp_new_i32(); 1647 1648 tcg_gen_sub_i32(t, a, b); 1649 tcg_gen_sub_i32(d, b, a); 1650 tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t); 1651 } 1652 1653 static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1654 { 1655 TCGv_i64 t = tcg_temp_new_i64(); 1656 1657 tcg_gen_sub_i64(t, a, b); 1658 tcg_gen_sub_i64(d, b, a); 1659 tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t); 1660 } 1661 1662 static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 1663 { 1664 TCGv_vec t = tcg_temp_new_vec_matching(d); 1665 1666 tcg_gen_smin_vec(vece, t, a, b); 1667 tcg_gen_smax_vec(vece, d, a, b); 1668 tcg_gen_sub_vec(vece, d, d, t); 1669 } 1670 1671 void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1672 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1673 { 1674 static const TCGOpcode vecop_list[] = { 1675 INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0 1676 }; 1677 static const GVecGen3 ops[4] = { 1678 { .fniv = gen_sabd_vec, 1679 .fno = gen_helper_gvec_sabd_b, 1680 .opt_opc = vecop_list, 1681 .vece = MO_8 }, 1682 { .fniv = gen_sabd_vec, 1683 .fno = gen_helper_gvec_sabd_h, 1684 .opt_opc = vecop_list, 1685 .vece = MO_16 }, 1686 { .fni4 = gen_sabd_i32, 1687 .fniv = gen_sabd_vec, 1688 .fno = gen_helper_gvec_sabd_s, 1689 .opt_opc = vecop_list, 1690 .vece = MO_32 }, 1691 { .fni8 = gen_sabd_i64, 1692 .fniv = gen_sabd_vec, 1693 .fno = gen_helper_gvec_sabd_d, 1694 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1695 .opt_opc = vecop_list, 1696 .vece = MO_64 }, 1697 }; 1698 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1699 } 1700 1701 static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1702 { 1703 TCGv_i32 t = tcg_temp_new_i32(); 1704 1705 tcg_gen_sub_i32(t, a, b); 1706 tcg_gen_sub_i32(d, b, a); 1707 tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t); 1708 } 1709 1710 static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1711 { 1712 TCGv_i64 t = tcg_temp_new_i64(); 1713 1714 tcg_gen_sub_i64(t, a, b); 1715 tcg_gen_sub_i64(d, b, a); 1716 tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t); 1717 } 1718 1719 static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 1720 { 1721 TCGv_vec t = tcg_temp_new_vec_matching(d); 1722 1723 tcg_gen_umin_vec(vece, t, a, b); 1724 tcg_gen_umax_vec(vece, d, a, b); 1725 tcg_gen_sub_vec(vece, d, d, t); 1726 } 1727 1728 void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1729 uint32_t rm_ofs, 
uint32_t opr_sz, uint32_t max_sz) 1730 { 1731 static const TCGOpcode vecop_list[] = { 1732 INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0 1733 }; 1734 static const GVecGen3 ops[4] = { 1735 { .fniv = gen_uabd_vec, 1736 .fno = gen_helper_gvec_uabd_b, 1737 .opt_opc = vecop_list, 1738 .vece = MO_8 }, 1739 { .fniv = gen_uabd_vec, 1740 .fno = gen_helper_gvec_uabd_h, 1741 .opt_opc = vecop_list, 1742 .vece = MO_16 }, 1743 { .fni4 = gen_uabd_i32, 1744 .fniv = gen_uabd_vec, 1745 .fno = gen_helper_gvec_uabd_s, 1746 .opt_opc = vecop_list, 1747 .vece = MO_32 }, 1748 { .fni8 = gen_uabd_i64, 1749 .fniv = gen_uabd_vec, 1750 .fno = gen_helper_gvec_uabd_d, 1751 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1752 .opt_opc = vecop_list, 1753 .vece = MO_64 }, 1754 }; 1755 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1756 } 1757 1758 static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1759 { 1760 TCGv_i32 t = tcg_temp_new_i32(); 1761 gen_sabd_i32(t, a, b); 1762 tcg_gen_add_i32(d, d, t); 1763 } 1764 1765 static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1766 { 1767 TCGv_i64 t = tcg_temp_new_i64(); 1768 gen_sabd_i64(t, a, b); 1769 tcg_gen_add_i64(d, d, t); 1770 } 1771 1772 static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 1773 { 1774 TCGv_vec t = tcg_temp_new_vec_matching(d); 1775 gen_sabd_vec(vece, t, a, b); 1776 tcg_gen_add_vec(vece, d, d, t); 1777 } 1778 1779 void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1780 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1781 { 1782 static const TCGOpcode vecop_list[] = { 1783 INDEX_op_sub_vec, INDEX_op_add_vec, 1784 INDEX_op_smin_vec, INDEX_op_smax_vec, 0 1785 }; 1786 static const GVecGen3 ops[4] = { 1787 { .fniv = gen_saba_vec, 1788 .fno = gen_helper_gvec_saba_b, 1789 .opt_opc = vecop_list, 1790 .load_dest = true, 1791 .vece = MO_8 }, 1792 { .fniv = gen_saba_vec, 1793 .fno = gen_helper_gvec_saba_h, 1794 .opt_opc = vecop_list, 1795 .load_dest = true, 1796 .vece = MO_16 }, 1797 { .fni4 = gen_saba_i32, 1798 .fniv = gen_saba_vec, 1799 .fno = gen_helper_gvec_saba_s, 1800 .opt_opc = vecop_list, 1801 .load_dest = true, 1802 .vece = MO_32 }, 1803 { .fni8 = gen_saba_i64, 1804 .fniv = gen_saba_vec, 1805 .fno = gen_helper_gvec_saba_d, 1806 .prefer_i64 = TCG_TARGET_REG_BITS == 64, 1807 .opt_opc = vecop_list, 1808 .load_dest = true, 1809 .vece = MO_64 }, 1810 }; 1811 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); 1812 } 1813 1814 static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) 1815 { 1816 TCGv_i32 t = tcg_temp_new_i32(); 1817 gen_uabd_i32(t, a, b); 1818 tcg_gen_add_i32(d, d, t); 1819 } 1820 1821 static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) 1822 { 1823 TCGv_i64 t = tcg_temp_new_i64(); 1824 gen_uabd_i64(t, a, b); 1825 tcg_gen_add_i64(d, d, t); 1826 } 1827 1828 static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) 1829 { 1830 TCGv_vec t = tcg_temp_new_vec_matching(d); 1831 gen_uabd_vec(vece, t, a, b); 1832 tcg_gen_add_vec(vece, d, d, t); 1833 } 1834 1835 void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, 1836 uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz) 1837 { 1838 static const TCGOpcode vecop_list[] = { 1839 INDEX_op_sub_vec, INDEX_op_add_vec, 1840 INDEX_op_umin_vec, INDEX_op_umax_vec, 0 1841 }; 1842 static const GVecGen3 ops[4] = { 1843 { .fniv = gen_uaba_vec, 1844 .fno = gen_helper_gvec_uaba_b, 1845 .opt_opc = vecop_list, 1846 .load_dest = true, 1847 .vece = MO_8 }, 1848 { .fniv = gen_uaba_vec, 
static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
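
/*
 * Pairwise operations (ADDP, SMAXP, SMINP, UMAXP, UMINP) have no
 * simple inline expansion; they are always expanded out-of-line
 * via the helper functions.
 */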
void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
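
/*
 * Halving add: d = (a + b) >> 1, element-wise, without widening.
 * Since a + b == ((a >> 1) + (b >> 1)) * 2 + (a & 1) + (b & 1),
 * the halved sum is (a >> 1) + (b >> 1) plus a carry of (a & b) & 1.
 * SHADD uses arithmetic shifts for the signed case; UHADD below
 * uses logical shifts for the unsigned case.
 */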
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
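
/*
 * Halving subtract: d = (a - b) >> 1, element-wise.
 * (a >> 1) - (b >> 1) is exact except when a is even and b is odd,
 * in which case the true result is one smaller; that borrow is
 * (~a & b) & 1.  SHSUB uses arithmetic shifts for the signed case;
 * UHSUB below uses logical shifts for the unsigned case.
 */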
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}
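
/*
 * Rounding halving add: d = (a + b + 1) >> 1, element-wise.
 * As for the halving add above, but the rounding carry is set
 * whenever either low bit is set, i.e. (a | b) & 1.  SRHADD uses
 * arithmetic shifts for the signed case; URHADD below uses
 * logical shifts for the unsigned case.
 */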
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}