/*
 * ARM generic vector expansion
 *
 * Copyright (c) 2003 Fabrice Bellard
 * Copyright (c) 2005-2007 CodeSourcery
 * Copyright (c) 2007 OpenedHand, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "translate.h"


static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs,
                            uint32_t opr_sz, uint32_t max_sz,
                            gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr qc_ptr = tcg_temp_new_ptr();

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_addi_ptr(qc_ptr, tcg_env, offsetof(CPUARMState, vfp.qc));
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, qc_ptr,
                       opr_sz, max_sz, 0, fn);
}

void gen_gvec_sqdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                         uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqdmulh_h, gen_helper_neon_sqdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmulh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_neon_sqrdmulh_h, gen_helper_neon_sqrdmulh_s
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlah_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlah_s16, gen_helper_gvec_qrdmlah_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                          uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[2] = {
        gen_helper_gvec_qrdmlsh_s16, gen_helper_gvec_qrdmlsh_s32
    };
    tcg_debug_assert(vece >= 1 && vece <= 2);
    gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}

#define GEN_CMP0(NAME, COND)                                            \
    void NAME(unsigned vece, uint32_t d, uint32_t m,                    \
              uint32_t opr_sz, uint32_t max_sz)                         \
    { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }

GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)

#undef GEN_CMP0

void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Signed shift out of range results in all-sign-bits */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
}
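
/*
 * Note that the clamp above relies on arithmetic shift semantics: for
 * MO_8 an out-of-range immediate of 8 is clamped to 7, and e.g.
 * (int8_t)0x80 >> 7 == -1, which is the architectural "all sign bits"
 * result.  The unsigned variant below has no such trick available,
 * so it writes zeroes directly.
 */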

void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /* Unsigned shift out of range results in all-zero-bits */
    if (shift >= (8 << vece)) {
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
    }
}

static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_sar16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_sari_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_sari_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_sari_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ssra8_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_ssra16_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ssra32_i32,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ssra64_i64,
          .fniv = gen_ssra_vec,
          .fno = gen_helper_gvec_ssra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.
     */
    shift = MIN(shift, (8 << vece) - 1);
    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr8i_i64(a, a, shift);
    tcg_gen_vec_add8_i64(d, d, a);
}

static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_vec_shr16i_i64(a, a, shift);
    tcg_gen_vec_add16_i64(d, d, a);
}

static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_add_i32(d, d, a);
}

static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_add_i64(d, d, a);
}

static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    tcg_gen_shri_vec(vece, a, a, sh);
    tcg_gen_add_vec(vece, d, d, a);
}

void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_usra8_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8, },
        { .fni8 = gen_usra16_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16, },
        { .fni4 = gen_usra32_i32,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32, },
        { .fni8 = gen_usra64_i64,
          .fniv = gen_usra_vec,
          .fno = gen_helper_gvec_usra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64, },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Unsigned results in all zeros as input to accumulate: nop.
     */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

/*
 * Shift one less than the requested amount, and the low bit is
 * the rounding bit.  For the 8 and 16-bit operations, because we
 * mask the low bit, we can perform a normal integer shift instead
 * of a vector shift.
 */
static void gen_srshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sar8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sar16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_SRSHR_ri */
    if (sh == 32) {
        tcg_gen_movi_i32(d, 0);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_sari_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_sari_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, sh - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_sari_vec(vece, d, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srshr8_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srshr16_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srshr32_i32,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_srshr64_i64,
          .fniv = gen_srshr_vec,
          .fno = gen_helper_gvec_srshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Signed results in all sign bits.  With rounding, this produces
         * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
         * I.e. always zero.
         */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_srsra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr8_i64(t, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srsra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr16_i64(t, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srsra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    gen_srshr32_i32(t, a, sh);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srsra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    gen_srshr64_i64(t, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_srsra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    gen_srshr_vec(vece, t, a, sh);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srsra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_srsra8_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_srsra16_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_srsra32_i32,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_srsra64_i64,
          .fniv = gen_srsra_vec,
          .fno = gen_helper_gvec_srsra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /*
     * Shifts larger than the element size are architecturally valid.
     * Signed results in all sign bits.  With rounding, this produces
     * (-1 + 1) >> 1 == 0, or (0 + 1) >> 1 == 0.
     * I.e. always zero.  With accumulation, this leaves D unchanged.
     */
    if (shift == (8 << vece)) {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_urshr8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_shr8i_i64(d, a, sh);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urshr16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, sh - 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_shr16i_i64(d, a, sh);
    tcg_gen_vec_add16_i64(d, d, t);
}

void gen_urshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t;

    /* Handle shift by the input size for the benefit of trans_URSHR_ri */
    if (sh == 32) {
        tcg_gen_extract_i32(d, a, sh - 1, 1);
        return;
    }
    t = tcg_temp_new_i32();
    tcg_gen_extract_i32(t, a, sh - 1, 1);
    tcg_gen_shri_i32(d, a, sh);
    tcg_gen_add_i32(d, d, t);
}

void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_extract_i64(t, a, sh - 1, 1);
    tcg_gen_shri_i64(d, a, sh);
    tcg_gen_add_i64(d, d, t);
}

static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);

    tcg_gen_shri_vec(vece, t, a, shift - 1);
    tcg_gen_and_vec(vece, t, t, ones);
    tcg_gen_shri_vec(vece, d, a, shift);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_urshr8_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urshr16_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urshr32_i32,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_urshr64_i64,
          .fniv = gen_urshr_vec,
          .fno = gen_helper_gvec_urshr_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    if (shift == (8 << vece)) {
        /*
         * Shifts larger than the element size are architecturally valid.
         * Unsigned results in zero.  With rounding, this produces a
         * copy of the most significant bit.
         */
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift - 1, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_ursra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 8) {
        tcg_gen_vec_shr8i_i64(t, a, 7);
    } else {
        gen_urshr8_i64(t, a, sh);
    }
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_ursra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 16) {
        tcg_gen_vec_shr16i_i64(t, a, 15);
    } else {
        gen_urshr16_i64(t, a, sh);
    }
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_ursra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
{
    TCGv_i32 t = tcg_temp_new_i32();

    if (sh == 32) {
        tcg_gen_shri_i32(t, a, 31);
    } else {
        gen_urshr32_i32(t, a, sh);
    }
    tcg_gen_add_i32(d, d, t);
}

static void gen_ursra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
    TCGv_i64 t = tcg_temp_new_i64();

    if (sh == 64) {
        tcg_gen_shri_i64(t, a, 63);
    } else {
        gen_urshr64_i64(t, a, sh);
    }
    tcg_gen_add_i64(d, d, t);
}

static void gen_ursra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    if (sh == (8 << vece)) {
        tcg_gen_shri_vec(vece, t, a, sh - 1);
    } else {
        gen_urshr_vec(vece, t, a, sh);
    }
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_ursra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen2i ops[4] = {
        { .fni8 = gen_ursra8_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fni8 = gen_ursra16_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_ursra32_i32,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_ursra64_i64,
          .fniv = gen_ursra_vec,
          .fno = gen_helper_gvec_ursra_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize] */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
}

static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shri_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_shri_i32(a, a, shift);
    tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
}

static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_shri_i64(a, a, shift);
    tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
}

static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);

    tcg_gen_shri_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sri(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shr8_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shr16_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shr32_ins_i32,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shr64_ins_i64,
          .fniv = gen_shr_ins_vec,
          .fno = gen_helper_gvec_sri_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [1..esize]. */
    tcg_debug_assert(shift > 0);
    tcg_debug_assert(shift <= (8 << vece));

    /* Shift of esize leaves destination unchanged. */
    if (shift < (8 << vece)) {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    } else {
        /* Nop, but we do need to clear the tail. */
        tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
    }
}

static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_8, 0xff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    uint64_t mask = dup_const(MO_16, 0xffff << shift);
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_shli_i64(t, a, shift);
    tcg_gen_andi_i64(t, t, mask);
    tcg_gen_andi_i64(d, d, ~mask);
    tcg_gen_or_i64(d, d, t);
}

static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
{
    tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
}

static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
    tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
}

static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));

    tcg_gen_shli_vec(vece, t, a, sh);
    tcg_gen_and_vec(vece, d, d, m);
    tcg_gen_or_vec(vece, d, d, t);
}

void gen_gvec_sli(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                  int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    const GVecGen2i ops[4] = {
        { .fni8 = gen_shl8_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_b,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shl16_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_h,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shl32_ins_i32,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_s,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_shl64_ins_i64,
          .fniv = gen_shl_ins_vec,
          .fno = gen_helper_gvec_sli_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    /* tszimm encoding produces immediates in the range [0..esize-1]. */
    tcg_debug_assert(shift >= 0);
    tcg_debug_assert(shift < (8 << vece));

    if (shift == 0) {
        tcg_gen_gvec_mov(vece, rd_ofs, rm_ofs, opr_sz, max_sz);
    } else {
        tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
    }
}

static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_add_u8(d, d, a);
}

static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u8(a, a, b);
    gen_helper_neon_sub_u8(d, d, a);
}

static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_add_u16(d, d, a);
}

static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    gen_helper_neon_mul_u16(a, a, b);
    gen_helper_neon_sub_u16(d, d, a);
}

static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_add_i32(d, d, a);
}

static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_mul_i32(a, a, b);
    tcg_gen_sub_i32(d, d, a);
}

static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_add_i64(d, d, a);
}

static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_mul_i64(a, a, b);
    tcg_gen_sub_i64(d, d, a);
}

static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_add_vec(vece, d, d, a);
}

static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mul_vec(vece, a, a, b);
    tcg_gen_sub_vec(vece, d, d, a);
}

/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
 * these tables are shared with AArch64 which does support them.
 */
void gen_gvec_mla(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mla8_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mla16_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mla32_i32,
          .fniv = gen_mla_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mla64_i64,
          .fniv = gen_mla_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_mls(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                  uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_mul_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_mls8_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_mls16_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_mls32_i32,
          .fniv = gen_mls_vec,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_mls64_i64,
          .fniv = gen_mls_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .load_dest = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

/* CMTST : test is "if (X & Y != 0)". */
static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    tcg_gen_negsetcond_i32(TCG_COND_TSTNE, d, a, b);
}

void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    tcg_gen_negsetcond_i64(TCG_COND_TSTNE, d, a, b);
}

static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_cmp_vec(TCG_COND_TSTNE, vece, d, a, b);
}

void gen_gvec_cmtst(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_cmp_vec, 0 };
    static const GVecGen3 ops[4] = {
        { .fni4 = gen_helper_neon_tst_u8,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni4 = gen_helper_neon_tst_u16,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_cmtst_i32,
          .fniv = gen_cmtst_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_cmtst_i64,
          .fniv = gen_cmtst_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(32);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_shr_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(64);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_shr_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
}

static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_shrv_vec(vece, rval, src, rsh);

    /*
     * The choice of GE (signed) and GEU (unsigned) are biased toward
     * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
    } else {
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
    }
    tcg_gen_or_vec(vece, dst, lval, rval);
}

void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_shlv_vec,
        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_ushl_vec,
          .fno = gen_helper_gvec_ushl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_ushl_i32,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_ushl_i64,
          .fniv = gen_ushl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
    TCGv_i32 lval = tcg_temp_new_i32();
    TCGv_i32 rval = tcg_temp_new_i32();
    TCGv_i32 lsh = tcg_temp_new_i32();
    TCGv_i32 rsh = tcg_temp_new_i32();
    TCGv_i32 zero = tcg_constant_i32(0);
    TCGv_i32 max = tcg_constant_i32(31);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i32(lsh, shift);
    tcg_gen_neg_i32(rsh, lsh);
    tcg_gen_shl_i32(lval, src, lsh);
    tcg_gen_umin_i32(rsh, rsh, max);
    tcg_gen_sar_i32(rval, src, rsh);
    tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
}

void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
    TCGv_i64 lval = tcg_temp_new_i64();
    TCGv_i64 rval = tcg_temp_new_i64();
    TCGv_i64 lsh = tcg_temp_new_i64();
    TCGv_i64 rsh = tcg_temp_new_i64();
    TCGv_i64 zero = tcg_constant_i64(0);
    TCGv_i64 max = tcg_constant_i64(63);

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_ext8s_i64(lsh, shift);
    tcg_gen_neg_i64(rsh, lsh);
    tcg_gen_shl_i64(lval, src, lsh);
    tcg_gen_umin_i64(rsh, rsh, max);
    tcg_gen_sar_i64(rval, src, rsh);
    tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
    tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
}
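
/*
 * In the scalar SSHL forms above, the negated shift count is clamped
 * with umin to esize - 1 before the arithmetic shift, so an
 * out-of-range right shift naturally produces all sign bits; the two
 * movconds then zero an out-of-range left shift and pick between the
 * left-shift and right-shift results according to the sign of the
 * count.
 */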

static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
                         TCGv_vec src, TCGv_vec shift)
{
    TCGv_vec lval = tcg_temp_new_vec_matching(dst);
    TCGv_vec rval = tcg_temp_new_vec_matching(dst);
    TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
    TCGv_vec max, zero;

    /*
     * Rely on the TCG guarantee that out of range shifts produce
     * unspecified results, not undefined behaviour (i.e. no trap).
     * Discard out-of-range results after the fact.
     */
    tcg_gen_neg_vec(vece, rsh, shift);
    if (vece == MO_8) {
        tcg_gen_mov_vec(lsh, shift);
    } else {
        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
        tcg_gen_and_vec(vece, lsh, shift, msk);
        tcg_gen_and_vec(vece, rsh, rsh, msk);
    }

    /* Bound rsh so out of bound right shift gets -1. */
    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
    tcg_gen_umin_vec(vece, rsh, rsh, max);

    tcg_gen_shlv_vec(vece, lval, src, lsh);
    tcg_gen_sarv_vec(vece, rval, src, rsh);

    /* Select in-bound left shift. */
    zero = tcg_constant_vec_matching(dst, vece, 0);
    tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);

    /* Select between left and right shift. */
    if (vece == MO_8) {
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
    } else {
        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
    }
}

void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
        INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sshl_vec,
          .fno = gen_helper_gvec_sshl_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sshl_i32,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sshl_i64,
          .fniv = gen_sshl_vec,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_srshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_srshl_b, gen_helper_gvec_srshl_h,
        gen_helper_gvec_srshl_s, gen_helper_gvec_srshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_urshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[] = {
        gen_helper_gvec_urshl_b, gen_helper_gvec_urshl_h,
        gen_helper_gvec_urshl_s, gen_helper_gvec_urshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}
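
/*
 * The saturating shifts (SQSHL, UQSHL, SQRSHL, UQRSHL) are always
 * expanded out of line: the helpers take the cpu env pointer so that
 * they can set the QC flag when a result saturates.
 */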

void gen_neon_sqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqshl_b, gen_helper_neon_sqshl_h,
        gen_helper_neon_sqshl_s, gen_helper_neon_sqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqshl_b, gen_helper_neon_uqshl_h,
        gen_helper_neon_uqshl_s, gen_helper_neon_uqshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_sqrshl_b, gen_helper_neon_sqrshl_h,
        gen_helper_neon_sqrshl_s, gen_helper_neon_sqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3_ptr * const fns[] = {
        gen_helper_neon_uqrshl_b, gen_helper_neon_uqrshl_h,
        gen_helper_neon_uqrshl_s, gen_helper_neon_uqrshl_d,
    };
    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, tcg_env,
                       opr_sz, max_sz, 0, fns[vece]);
}

void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
                        tcg_constant_i64(UINT64_MAX), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_usadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_usadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_b,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_h,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fniv = gen_uqadd_vec,
          .fno = gen_helper_gvec_uqadd_s,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fniv = gen_uqadd_vec,
          .fni8 = gen_uqadd_d,
          .fno = gen_helper_gvec_uqadd_d,
          .write_aofs = true,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
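
/*
 * All of the saturating add/subtract expansions detect saturation the
 * same way: compute both the wrapped result and the saturated result,
 * XOR them, and OR the difference into qc.  Thus qc becomes nonzero
 * iff any lane saturated.
 */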

void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_add_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_add_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_andc_i64(t1, t2, t1);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_add_vec(vece, x, a, b);
    tcg_gen_ssadd_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ssadd_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqadd_vec,
          .fno = gen_helper_gvec_sqadd_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqadd_vec,
          .fni8 = gen_sqadd_d,
          .fno = gen_helper_gvec_sqadd_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
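
/*
 * In gen_sqadd_d above (and gen_sqsub_d below) signed overflow is
 * detected with the usual sign trick: for addition it occurred iff
 * the operands have the same sign and the result's sign differs from
 * 'a', i.e. (t0 ^ a) & ~(a ^ b) is negative; for subtraction the
 * operands must differ in sign instead.  The saturated value is
 * INT64_MAX or INT64_MIN according to the sign of 'a'.
 */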

void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
    tcg_gen_xor_i64(t, t, res);
    tcg_gen_or_i64(qc, qc, t);
}

static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_ussub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_ussub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_uqsub_vec,
          .fno = gen_helper_gvec_uqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_uqsub_vec,
          .fni8 = gen_uqsub_d,
          .fno = gen_helper_gvec_uqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
    int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
    int64_t min = -1ll - max;
    TCGv_i64 tmp = tcg_temp_new_i64();

    tcg_gen_sub_i64(tmp, a, b);
    tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
    tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
    tcg_gen_xor_i64(tmp, tmp, res);
    tcg_gen_or_i64(qc, qc, tmp);
}

void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_sub_i64(t0, a, b);

    /* Compute signed overflow indication into T1 */
    tcg_gen_xor_i64(t1, a, b);
    tcg_gen_xor_i64(t2, t0, a);
    tcg_gen_and_i64(t1, t1, t2);

    /* Compute saturated value into T2 */
    tcg_gen_sari_i64(t2, a, 63);
    tcg_gen_xori_i64(t2, t2, INT64_MAX);

    tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
    tcg_gen_xor_i64(t0, t0, res);
    tcg_gen_or_i64(qc, qc, t0);
}

static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
                          TCGv_vec a, TCGv_vec b)
{
    TCGv_vec x = tcg_temp_new_vec_matching(t);
    tcg_gen_sub_vec(vece, x, a, b);
    tcg_gen_sssub_vec(vece, t, a, b);
    tcg_gen_xor_vec(vece, x, x, t);
    tcg_gen_or_vec(vece, qc, qc, x);
}

void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                       uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sssub_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen4 ops[4] = {
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_b,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_8 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_h,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_16 },
        { .fniv = gen_sqsub_vec,
          .fno = gen_helper_gvec_sqsub_s,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_32 },
        { .fniv = gen_sqsub_vec,
          .fni8 = gen_sqsub_d,
          .fno = gen_helper_gvec_sqsub_d,
          .opt_opc = vecop_list,
          .write_aofs = true,
          .vece = MO_64 },
    };

    tcg_debug_assert(opr_sz <= sizeof_field(CPUARMState, vfp.qc));
    tcg_gen_gvec_4(rd_ofs, offsetof(CPUARMState, vfp.qc),
                   rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
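
/*
 * SABD/UABD: absolute difference.  The scalar forms below compute both
 * a - b and b - a and use movcond to keep the one that did not wrap;
 * the vector forms compute max(a, b) - min(a, b).
 */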

static void gen_sabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LT, d, a, b, d, t);
}

static void gen_sabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_smin_vec(vece, t, a, b);
    tcg_gen_smax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_sabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_sabd_i32,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_sabd_i64,
          .fniv = gen_sabd_vec,
          .fno = gen_helper_gvec_sabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uabd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_sub_i32(t, a, b);
    tcg_gen_sub_i32(d, b, a);
    tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_sub_i64(t, a, b);
    tcg_gen_sub_i64(d, b, a);
    tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, d, t);
}

static void gen_uabd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_umin_vec(vece, t, a, b);
    tcg_gen_umax_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uabd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_b,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_h,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uabd_i32,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_s,
          .opt_opc = vecop_list,
          .vece = MO_32 },
        { .fni8 = gen_uabd_i64,
          .fniv = gen_uabd_vec,
          .fno = gen_helper_gvec_uabd_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
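
/*
 * SABA/UABA: absolute difference and accumulate.  These reuse the
 * SABD/UABD expansions above and add the result into the destination,
 * hence .load_dest = true in the tables below.
 */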

static void gen_saba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_sabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_saba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_sabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_saba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_sabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_saba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_smin_vec, INDEX_op_smax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_saba_i32,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_saba_i64,
          .fniv = gen_saba_vec,
          .fno = gen_helper_gvec_saba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

static void gen_uaba_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();
    gen_uabd_i32(t, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uaba_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();
    gen_uabd_i64(t, a, b);
    tcg_gen_add_i64(d, d, t);
}

static void gen_uaba_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    gen_uabd_vec(vece, t, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sub_vec, INDEX_op_add_vec,
        INDEX_op_umin_vec, INDEX_op_umax_vec, 0
    };
    static const GVecGen3 ops[4] = {
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_b,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_8 },
        { .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_h,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_16 },
        { .fni4 = gen_uaba_i32,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_s,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_32 },
        { .fni8 = gen_uaba_i64,
          .fniv = gen_uaba_vec,
          .fno = gen_helper_gvec_uaba_d,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .opt_opc = vecop_list,
          .load_dest = true,
          .vece = MO_64 },
    };
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}

void gen_gvec_addp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                   uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_addp_b,
        gen_helper_gvec_addp_h,
        gen_helper_gvec_addp_s,
        gen_helper_gvec_addp_d,
    };
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_smaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_smaxp_b,
        gen_helper_gvec_smaxp_h,
        gen_helper_gvec_smaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_sminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_sminp_b,
        gen_helper_gvec_sminp_h,
        gen_helper_gvec_sminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_umaxp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_umaxp_b,
        gen_helper_gvec_umaxp_h,
        gen_helper_gvec_umaxp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

void gen_gvec_uminp(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static gen_helper_gvec_3 * const fns[4] = {
        gen_helper_gvec_uminp_b,
        gen_helper_gvec_uminp_h,
        gen_helper_gvec_uminp_s,
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, 0, fns[vece]);
}

/*
 * Signed halving add, without intermediate overflow:
 * (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1).
 */
static void gen_shadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_shadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_shadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_shadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_shadd8_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shadd16_i64,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shadd_i32,
          .fniv = gen_shadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/* Unsigned halving add: same identity, using logical shifts. */
static void gen_uhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b,
                          b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_uhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_and_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_uhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_and_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_uhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_and_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_uhadd8_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhadd16_i64,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhadd_i32,
          .fniv = gen_uhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    tcg_debug_assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Signed halving subtract, without intermediate overflow:
 * (a - b) >> 1 == (a >> 1) - (b >> 1) - (~a & b & 1).
 */
static void gen_shsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_shsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_shsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_shsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_shsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_shsub8_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_shsub16_i64,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_shsub_i32,
          .fniv = gen_shsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/* Unsigned halving subtract: same identity, using logical shifts. */
static void gen_uhsub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_sub8_i64(d, a, b);
    tcg_gen_vec_sub8_i64(d, d, t);
}

static void gen_uhsub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_andc_i64(t, b, a);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_sub16_i64(d, a, b);
    tcg_gen_vec_sub16_i64(d, d, t);
}

static void gen_uhsub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_andc_i32(t, b, a);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_sub_i32(d, a, b);
    tcg_gen_sub_i32(d, d, t);
}

static void gen_uhsub_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_andc_vec(vece, t, b, a);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_sub_vec(vece, d, a, b);
    tcg_gen_sub_vec(vece, d, d, t);
}

void gen_gvec_uhsub(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                    uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_sub_vec, 0
    };
    static const GVecGen3 g[4] = {
        { .fni8 = gen_uhsub8_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_uhsub16_i64,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_uhsub_i32,
          .fniv = gen_uhsub_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

/*
 * Signed rounding halving add, without intermediate overflow:
 * (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1).
 */
static void gen_srhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar8i_i64(a, a, 1);
    tcg_gen_vec_sar8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_srhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_sar16i_i64(a, a, 1);
    tcg_gen_vec_sar16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_srhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t =
        tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_sari_i32(a, a, 1);
    tcg_gen_sari_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_srhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_sari_vec(vece, a, a, 1);
    tcg_gen_sari_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_srhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_sari_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_srhadd8_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_srhadd16_i64,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_srhadd_i32,
          .fniv = gen_srhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}

static void gen_urhadd8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr8i_i64(a, a, 1);
    tcg_gen_vec_shr8i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_8, 1));
    tcg_gen_vec_add8_i64(d, a, b);
    tcg_gen_vec_add8_i64(d, d, t);
}

static void gen_urhadd16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_or_i64(t, a, b);
    tcg_gen_vec_shr16i_i64(a, a, 1);
    tcg_gen_vec_shr16i_i64(b, b, 1);
    tcg_gen_andi_i64(t, t, dup_const(MO_16, 1));
    tcg_gen_vec_add16_i64(d, a, b);
    tcg_gen_vec_add16_i64(d, d, t);
}

static void gen_urhadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
    TCGv_i32 t = tcg_temp_new_i32();

    tcg_gen_or_i32(t, a, b);
    tcg_gen_shri_i32(a, a, 1);
    tcg_gen_shri_i32(b, b, 1);
    tcg_gen_andi_i32(t, t, 1);
    tcg_gen_add_i32(d, a, b);
    tcg_gen_add_i32(d, d, t);
}

static void gen_urhadd_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);

    tcg_gen_or_vec(vece, t, a, b);
    tcg_gen_shri_vec(vece, a, a, 1);
    tcg_gen_shri_vec(vece, b, b, 1);
    tcg_gen_and_vec(vece, t, t, tcg_constant_vec_matching(d, vece, 1));
    tcg_gen_add_vec(vece, d, a, b);
    tcg_gen_add_vec(vece, d, d, t);
}

void gen_gvec_urhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz)
{
    static const TCGOpcode vecop_list[] = {
        INDEX_op_shri_vec, INDEX_op_add_vec, 0
    };
    static const GVecGen3 g[] = {
        { .fni8 = gen_urhadd8_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_8 },
        { .fni8 = gen_urhadd16_i64,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_16 },
        { .fni4 = gen_urhadd_i32,
          .fniv = gen_urhadd_vec,
          .opt_opc = vecop_list,
          .vece = MO_32 },
    };
    assert(vece <= MO_32);
    tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &g[vece]);
}