/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
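
/*
 * Worked example (illustrative only, not compiled): a predicate byte
 * 0x05 has bits 0 and 2 set, i.e. it marks byte elements 0 and 2 of a
 * 64-bit chunk as active, so the expanded mask must be
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.  A predicated merge
 * over one 64-bit chunk could then look like:
 *
 *     uint64_t mask = expand_pred_b_data[pg_byte];
 *     d64[i] = (n64[i] & mask) | (d64[i] & ~mask);
 *
 * where pg_byte, n64 and d64 are hypothetical names used only for this
 * sketch.
 */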

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
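
/*
 * Worked example for the helpers above (illustrative only, not compiled):
 * SQRDMULH of the int8 inputs 64 and 96 computes
 *
 *     ret  = 64 * 96;                     // 6144
 *     ret += (0 << 7) + (1 << 6);         // no accumulator, rounding bit
 *     ret >>= 7;                          // 48, which fits in int8_t
 *
 * whereas (-128) * (-128) gives (16384 + 64) >> 7 = 128, which does not
 * fit in int8_t, so the result saturates to INT8_MAX; the 16- and 32-bit
 * variants additionally set *sat in that case (feeding FPSCR/FPSR.QC).
 */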

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
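
/*
 * Illustrative example (not compiled): for SDOT with byte elements,
 * each 32-bit lane d[i] is a[i] plus the sum of four widened products,
 * e.g. with n[0..3] = {1, -2, 3, -4} and m[0..3] = {10, 20, 30, 40}:
 *
 *     d[0] = a[0] + 1*10 + (-2)*20 + 3*30 + (-4)*40
 *          = a[0] + 10 - 40 + 90 - 160
 *          = a[0] - 100
 *
 * UDOT is identical except that the byte products are zero-extended.
 */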

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + 4; \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
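
/*
 * Illustrative note (not compiled) on the FCADD/FCMLA helpers above:
 * 'flip' selects which element of each (real, imag) pair feeds the
 * products, and neg_real/neg_imag are sign bits xored into the 'm'
 * operands, which is how all the rotations share one loop.  With
 * flip == 0 and both negation bits clear, a pair of lanes computes,
 * in effect,
 *
 *     d[2k]     = fma(n[2k], m[2k],     a[2k]);      // += Nr * Mr
 *     d[2k + 1] = fma(n[2k], m[2k + 1], a[2k + 1]);  // += Nr * Mi
 *
 * and setting both negation bits gives the same computation with the
 * two 'm' inputs negated.
 */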

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
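
/*
 * For reference (illustrative only): DO_3OP(gvec_fadd_h, float16_add,
 * float16) expands to
 *
 *     void HELPER(gvec_fadd_h)(void *vd, void *vn, void *vm,
 *                              void *stat, uint32_t desc)
 *     {
 *         intptr_t i, oprsz = simd_oprsz(desc);
 *         float16 *d = vd, *n = vn, *m = vm;
 *         for (i = 0; i < oprsz / sizeof(float16); i++) {
 *             d[i] = float16_add(n[i], m[i], stat);
 *         }
 *         clear_tail(d, oprsz, simd_maxsz(desc));
 *     }
 *
 * i.e. a flat lane-by-lane loop with the tail beyond oprsz cleared.
 */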

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
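
/*
 * Illustrative example (not compiled): for gvec_mul_idx_s below with a
 * 32-byte SVE vector and idx == 1, 'segment' is 4 elements, so the loop
 * uses m[1] for d[0..3] and m[5] for d[4..7], roughly:
 *
 *     for (i = 0; i < 8; i += 4) {
 *         uint32_t mm = m[i + 1];     // element 1 of this 128-bit segment
 *         for (j = 0; j < 4; j++) {
 *             d[i + j] = n[i + j] * mm;
 *         }
 *     }
 *
 * (H4() host byte-order adjustment omitted for clarity).  With a 16-byte
 * AdvSIMD vector there is a single segment, giving the familiar
 * multiply-by-element behaviour.
 */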

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, they accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
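
/*
 * Illustrative note (not compiled): the 64-bit saturating helpers have
 * no wider type to compute in, so they detect overflow from the
 * operands.  For unsigned addition, wrap-around is exactly
 * "result < operand":
 *
 *     dd = nn + mm;            // 0xfffffffffffffff0 + 0x20 wraps to 0x10
 *     if (dd < nn) {           // 0x10 < 0xfffffffffffffff0 -> saturate
 *         dd = UINT64_MAX;
 *     }
 *
 * The signed versions that follow use the sign-bit identity for
 * addition, ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN: overflow occurred iff
 * the operands have the same sign and the result's sign differs.
 */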

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

#define DO_SRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] += n[i] >> shift; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        TYPE tmp = n[i] >> (shift - 1); \
        d[i] = (tmp >> 1) + (tmp & 1); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        TYPE tmp = n[i] >> (shift - 1); \
        d[i] += (tmp >> 1) + (tmp & 1); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
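
/*
 * Worked example for the DO_RSHR/DO_RSRA helpers above (illustrative
 * only, not compiled): the rounding keeps the last bit shifted out and
 * adds it back.  For n = 14 and shift = 2:
 *
 *     tmp = 14 >> 1 = 7;                  // shift by (shift - 1)
 *     d   = (7 >> 1) + (7 & 1) = 3 + 1 = 4;
 *
 * i.e. 14/4 = 3.5 rounds up to 4, where a plain '14 >> 2' would give 3.
 * DO_RSRA accumulates the same rounded value into d[i] instead of
 * storing it.
 */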

#define DO_SRI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}
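
/*
 * Illustrative truth table for load4_f16() above (not compiled):
 *
 *     is_q  is_2   value returned
 *      0     0     u64[0]      (only the low dword, u32[0], is used)
 *      0     1     u64[0] >> 32  == u32[1]
 *      1     0     u64[0]
 *      1     1     u64[1]
 *
 * ptr[is_q & is_2] selects the qword and the shift by 32 moves the high
 * dword down when only the upper half of a 64-bit register is wanted.
 */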
/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}
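
/*
 * Illustrative sketch, not part of the original code: for FMLSL the
 * helpers negate every float16 element of one source at once by
 * flipping the four packed sign bits.  A hypothetical per-element
 * equivalent of that single XOR, for exposition only:
 */
static inline uint64_t negate_4xf16_ref(uint64_t n_4)
{
    uint64_t r = 0;

    for (int e = 0; e < 4; e++) {
        uint16_t h = n_4 >> (e * 16);
        r |= (uint64_t)(uint16_t)(h ^ 0x8000) << (e * 16);
    }
    return r;    /* == n_4 ^ 0x8000800080008000ull */
}
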
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
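
/*
 * Illustrative sketch, not part of the original code: SSHL/USHL treat
 * each shift element as a signed count; non-negative counts shift left
 * and negative counts shift right.  Counts whose magnitude reaches the
 * element width shift everything out: a left shift gives 0, a signed
 * right shift leaves only the sign.  A hypothetical scalar reference
 * for the signed 8-bit case, for exposition only:
 */
static inline int8_t sshl8_ref(int8_t nn, int8_t mm)
{
    if (mm >= 8) {
        return 0;               /* all value bits shifted out */
    } else if (mm >= 0) {
        return nn << mm;
    } else if (mm > -8) {
        return nn >> -mm;       /* arithmetic shift right */
    }
    return nn >> 7;             /* only the sign bit survives */
}
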
void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
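
/*
 * Illustrative sketch, not part of the original code: one lane of the
 * 8x8->8 carry-less multiply described above, keeping only the low
 * 8 bits of the product.  Partial products are XORed, not added.  For
 * exposition only; the helper uses clmul_8x8_low() on whole doublewords.
 */
static inline uint8_t pmul8_ref(uint8_t n, uint8_t m)
{
    uint8_t r = 0;

    for (int b = 0; b < 8; b++) {
        if (m & (1 << b)) {
            r ^= (uint8_t)(n << b);    /* partial product, truncated */
        }
    }
    return r;
}
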
/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 r = clmul_64(n[i + hi], m[i + hi]);
        d[i] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = clmul_8x4_packed(nn, mm);
    nn >>= 32;
    mm >>= 32;
    d[1] = clmul_8x4_packed(nn, mm);

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
    }
}

void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE nn = *(TYPE *)(vn + i); \
        *(TYPE *)(vd + i) = -(nn OP 0); \
    } \
    clear_tail(vd, opr_sz, simd_maxsz(desc)); \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
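
/*
 * Illustrative note, not part of the original code: the DO_CMP0
 * expansions above negate a boolean comparison result to build the
 * per-element masks that the compare-with-zero instructions produce
 * (all ones for true, all zeros for false).  A hypothetical scalar
 * equivalent for the byte case, for exposition only:
 */
static inline int8_t cmp_mask8_ref(bool cond)
{
    return cond ? -1 : 0;    /* -1 is 0xff, i.e. all bits set */
}
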
#define DO_ABD(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
 \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
 \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA

#define DO_NEON_PAIRWISE(NAME, OP) \
void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
                     void *stat, uint32_t oprsz) \
{ \
    float_status *fpst = stat; \
    float32 *d = vd; \
    float32 *n = vn; \
    float32 *m = vm; \
    float32 r0, r1; \
 \
    /* Read all inputs before writing outputs in case vm == vd */ \
    r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
    r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
 \
    d[H4(0)] = r0; \
    d[H4(1)] = r1; \
} \
 \
void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
                     void *stat, uint32_t oprsz) \
{ \
    float_status *fpst = stat; \
    float16 *d = vd; \
    float16 *n = vn; \
    float16 *m = vm; \
    float16 r0, r1, r2, r3; \
 \
    /* Read all inputs before writing outputs in case vm == vd */ \
    r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
    r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
    r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
    r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
 \
    d[H2(0)] = r0; \
    d[H2(1)] = r1; \
    d[H2(2)] = r2; \
    d[H2(3)] = r3; \
}

DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE

#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    float_status *fpst = stat; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], shift, fpst); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED
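
/*
 * Illustrative note, not part of the original code: for the fixed-point
 * conversions above, "shift" is the number of fraction bits, so the
 * integer value x stands for x / 2^shift.  A hypothetical reference in
 * host arithmetic, ignoring rounding modes and FP exception flags:
 */
static inline float vcvt_fixed_to_float_ref(int32_t x, int shift)
{
    return (float)x / (float)(1u << shift);
}
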
#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    float_status *fpst = stat; \
    intptr_t i, oprsz = simd_oprsz(desc); \
    uint32_t rmode = simd_data(desc); \
    uint32_t prev_rmode = get_float_rounding_mode(fpst); \
    TYPE *d = vd, *n = vn; \
    set_float_rounding_mode(rmode, fpst); \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], 0, fpst); \
    } \
    set_float_rounding_mode(prev_rmode, fpst); \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    float_status *fpst = stat; \
    intptr_t i, oprsz = simd_oprsz(desc); \
    uint32_t rmode = simd_data(desc); \
    uint32_t prev_rmode = get_float_rounding_mode(fpst); \
    TYPE *d = vd, *n = vn; \
    set_float_rounding_mode(rmode, fpst); \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], fpst); \
    } \
    set_float_rounding_mode(prev_rmode, fpst); \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlap the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif
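
/*
 * Illustrative sketch, not part of the original code: each TBL/TBX
 * index byte selects byte (index % 16) of table register
 * (rn + index / 16) mod 32, so the table wraps from V31 back to V0.
 * A hypothetical helper making that decomposition explicit, for
 * exposition only:
 */
static inline void tblx_decompose_ref(uint32_t rn, uint32_t index,
                                      uint32_t *reg, uint32_t *byte)
{
    *reg = (rn + (index >> 4)) % 32;    /* which vector register */
    *byte = index & 15;                 /* which byte within it */
}
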
/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
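
/*
 * Illustrative note, not part of the original code: the highpart
 * helpers widen each element to the next size, multiply, and keep the
 * upper half; only at 64 bits do they need muls64()/mulu64() to form
 * the 128-bit product in two pieces.  A hypothetical 32-bit unsigned
 * reference, for exposition only:
 */
static inline uint32_t umulh32_ref(uint32_t a, uint32_t b)
{
    return (uint32_t)(((uint64_t)a * b) >> 32);
}
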
void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}

/*
 * Integer matrix-multiply accumulate
 */

static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *          i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
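
/*
 * Illustrative sketch, not part of the original code: within each
 * 16-byte segment, do_mmla_b() treats the two sources as 2x8 matrices
 * of bytes and accumulates a 2x2 matrix of 32-bit dot products,
 *   d[i][j] = a[i][j] + sum_{k=0..7} n[i][k] * m[j][k].
 * A hypothetical reference for a single signed output element, for
 * exposition only:
 */
static inline int32_t smmla_elem_ref(int32_t acc, const int8_t n_row[8],
                                     const int8_t m_row[8])
{
    for (int k = 0; k < 8; k++) {
        acc += n_row[k] * m_row[k];
    }
    return acc;
}
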
/*
 * BFloat16 Dot Product
 */

float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
{
    /* FPCR is ignored for BFDOT and BFMMLA. */
    float_status bf_status = {
        .tininess_before_rounding = float_tininess_before_rounding,
        .float_rounding_mode = float_round_to_odd_inf,
        .flush_to_zero = true,
        .flush_inputs_to_zero = true,
        .default_nan_mode = true,
    };
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
    t1 = float32_add(t1, t2, &bf_status);
    t1 = float32_add(sum, t1, &bf_status);

    return t1;
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = bfdotadd(a[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        uint32_t m_idx = m[i + H4(index)];

        for (j = i; j < i + eltspersegment; j++) {
            d[j] = bfdotadd(a[j], n[j], m_idx);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j           i             k             j   k
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
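
/*
 * Illustrative note, not part of the original code: a bfloat16 value is
 * the high half of an IEEE float32 with the same sign and exponent, so
 * the widening used by the BF16 helpers is just a 16-bit left shift of
 * the raw bits.  A hypothetical helper making that explicit, for
 * exposition only:
 */
static inline float32 bf16_to_f32_raw_ref(bfloat16 h)
{
    return (float32)h << 16;
}
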
void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         void *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, void *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
        TYPE aa = *(TYPE *)(a + i); \
        TYPE nn = *(TYPE *)(n + i); \
        TYPE mm = *(TYPE *)(m + i); \
        TYPE dd = MIN(MAX(aa, nn), mm); \
        *(TYPE *)(d + i) = dd; \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)
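
/*
 * Illustrative note, not part of the original code: the CLAMP helpers
 * bound each element aa by the corresponding pair (nn, mm), i.e.
 * dd = MIN(MAX(aa, nn), mm), so with nn = 10 and mm = 20 an input of 3
 * clamps to 10 and an input of 25 clamps to 20.  A hypothetical scalar
 * equivalent, for exposition only:
 */
static inline int32_t sclamp32_ref(int32_t aa, int32_t nn, int32_t mm)
{
    return MIN(MAX(aa, nn), mm);
}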