1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "vec_internal.h" 27 28 /* 29 * Data for expanding active predicate bits to bytes, for byte elements. 
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 *
 * The table is indexed by an 8-bit value; entry I has byte J set to 0xff
 * exactly when bit J of I is set, and 0x00 otherwise.
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
132 * for (i = 0; i < 256; ++i) { 133 * unsigned long m = 0; 134 * if (i & 0xaa) { 135 * continue; 136 * } 137 * for (j = 0; j < 8; j += 2) { 138 * if ((i >> j) & 1) { 139 * m |= 0xfffful << (j << 3); 140 * } 141 * } 142 * printf("[0x%x] = 0x%016lx,\n", i, m); 143 * } 144 */ 145 const uint64_t expand_pred_h_data[0x55 + 1] = { 146 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 147 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 148 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 149 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 150 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 151 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 152 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 153 [0x55] = 0xffffffffffffffff, 154 }; 155 156 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 157 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 158 bool neg, bool round) 159 { 160 /* 161 * Simplify: 162 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 163 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 164 */ 165 int32_t ret = (int32_t)src1 * src2; 166 if (neg) { 167 ret = -ret; 168 } 169 ret += ((int32_t)src3 << 7) + (round << 6); 170 ret >>= 7; 171 172 if (ret != (int8_t)ret) { 173 ret = (ret < 0 ? 
INT8_MIN : INT8_MAX); 174 } 175 return ret; 176 } 177 178 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 179 void *va, uint32_t desc) 180 { 181 intptr_t i, opr_sz = simd_oprsz(desc); 182 int8_t *d = vd, *n = vn, *m = vm, *a = va; 183 184 for (i = 0; i < opr_sz; ++i) { 185 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 186 } 187 } 188 189 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 190 void *va, uint32_t desc) 191 { 192 intptr_t i, opr_sz = simd_oprsz(desc); 193 int8_t *d = vd, *n = vn, *m = vm, *a = va; 194 195 for (i = 0; i < opr_sz; ++i) { 196 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 197 } 198 } 199 200 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 201 { 202 intptr_t i, opr_sz = simd_oprsz(desc); 203 int8_t *d = vd, *n = vn, *m = vm; 204 205 for (i = 0; i < opr_sz; ++i) { 206 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 207 } 208 } 209 210 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 211 { 212 intptr_t i, opr_sz = simd_oprsz(desc); 213 int8_t *d = vd, *n = vn, *m = vm; 214 215 for (i = 0; i < opr_sz; ++i) { 216 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 217 } 218 } 219 220 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 221 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 222 bool neg, bool round, uint32_t *sat) 223 { 224 /* Simplify similarly to do_sqrdmlah_b above. */ 225 int32_t ret = (int32_t)src1 * src2; 226 if (neg) { 227 ret = -ret; 228 } 229 ret += ((int32_t)src3 << 15) + (round << 14); 230 ret >>= 15; 231 232 if (ret != (int16_t)ret) { 233 *sat = 1; 234 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 235 } 236 return ret; 237 } 238 239 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 240 uint32_t src2, uint32_t src3) 241 { 242 uint32_t *sat = &env->vfp.qc[0]; 243 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 244 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 245 false, true, sat); 246 return deposit32(e1, 16, 16, e2); 247 } 248 249 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 250 void *vq, uint32_t desc) 251 { 252 uintptr_t opr_sz = simd_oprsz(desc); 253 int16_t *d = vd; 254 int16_t *n = vn; 255 int16_t *m = vm; 256 uintptr_t i; 257 258 for (i = 0; i < opr_sz / 2; ++i) { 259 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 260 } 261 clear_tail(d, opr_sz, simd_maxsz(desc)); 262 } 263 264 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 265 uint32_t src2, uint32_t src3) 266 { 267 uint32_t *sat = &env->vfp.qc[0]; 268 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 269 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 270 true, true, sat); 271 return deposit32(e1, 16, 16, e2); 272 } 273 274 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 275 void *vq, uint32_t desc) 276 { 277 uintptr_t opr_sz = simd_oprsz(desc); 278 int16_t *d = vd; 279 int16_t *n = vn; 280 int16_t *m = vm; 281 uintptr_t i; 282 283 for (i = 0; i < opr_sz / 2; ++i) { 284 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 285 } 286 clear_tail(d, opr_sz, simd_maxsz(desc)); 287 } 288 289 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 290 void *vq, uint32_t desc) 291 { 292 intptr_t i, opr_sz = simd_oprsz(desc); 293 int16_t *d = vd, *n = vn, *m = vm; 294 295 for (i = 0; i < opr_sz / 2; ++i) { 296 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 297 } 298 clear_tail(d, opr_sz, simd_maxsz(desc)); 299 } 300 301 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 302 void *vq, uint32_t desc) 303 { 304 intptr_t i, opr_sz = 
simd_oprsz(desc); 305 int16_t *d = vd, *n = vn, *m = vm; 306 307 for (i = 0; i < opr_sz / 2; ++i) { 308 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 309 } 310 clear_tail(d, opr_sz, simd_maxsz(desc)); 311 } 312 313 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 314 void *va, uint32_t desc) 315 { 316 intptr_t i, opr_sz = simd_oprsz(desc); 317 int16_t *d = vd, *n = vn, *m = vm, *a = va; 318 uint32_t discard; 319 320 for (i = 0; i < opr_sz / 2; ++i) { 321 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 322 } 323 } 324 325 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 326 void *va, uint32_t desc) 327 { 328 intptr_t i, opr_sz = simd_oprsz(desc); 329 int16_t *d = vd, *n = vn, *m = vm, *a = va; 330 uint32_t discard; 331 332 for (i = 0; i < opr_sz / 2; ++i) { 333 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 334 } 335 } 336 337 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 338 { 339 intptr_t i, opr_sz = simd_oprsz(desc); 340 int16_t *d = vd, *n = vn, *m = vm; 341 uint32_t discard; 342 343 for (i = 0; i < opr_sz / 2; ++i) { 344 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 345 } 346 } 347 348 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 349 { 350 intptr_t i, opr_sz = simd_oprsz(desc); 351 int16_t *d = vd, *n = vn, *m = vm; 352 uint32_t discard; 353 354 for (i = 0; i < opr_sz / 2; ++i) { 355 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 356 } 357 } 358 359 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 360 { 361 intptr_t i, j, opr_sz = simd_oprsz(desc); 362 int idx = simd_data(desc); 363 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 364 uint32_t discard; 365 366 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 367 int16_t mm = m[i]; 368 for (j = 0; j < 16 / 2; ++j) { 369 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 370 } 371 } 372 } 373 374 void 
HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 375 { 376 intptr_t i, j, opr_sz = simd_oprsz(desc); 377 int idx = simd_data(desc); 378 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 379 uint32_t discard; 380 381 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 382 int16_t mm = m[i]; 383 for (j = 0; j < 16 / 2; ++j) { 384 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 385 } 386 } 387 } 388 389 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 390 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 391 bool neg, bool round, uint32_t *sat) 392 { 393 /* Simplify similarly to do_sqrdmlah_b above. */ 394 int64_t ret = (int64_t)src1 * src2; 395 if (neg) { 396 ret = -ret; 397 } 398 ret += ((int64_t)src3 << 31) + (round << 30); 399 ret >>= 31; 400 401 if (ret != (int32_t)ret) { 402 *sat = 1; 403 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 404 } 405 return ret; 406 } 407 408 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 409 int32_t src2, int32_t src3) 410 { 411 uint32_t *sat = &env->vfp.qc[0]; 412 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 413 } 414 415 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 416 void *vq, uint32_t desc) 417 { 418 uintptr_t opr_sz = simd_oprsz(desc); 419 int32_t *d = vd; 420 int32_t *n = vn; 421 int32_t *m = vm; 422 uintptr_t i; 423 424 for (i = 0; i < opr_sz / 4; ++i) { 425 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 426 } 427 clear_tail(d, opr_sz, simd_maxsz(desc)); 428 } 429 430 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 431 int32_t src2, int32_t src3) 432 { 433 uint32_t *sat = &env->vfp.qc[0]; 434 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 435 } 436 437 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 438 void *vq, uint32_t desc) 439 { 440 uintptr_t opr_sz = simd_oprsz(desc); 441 int32_t *d = vd; 442 int32_t *n = vn; 443 int32_t *m = vm; 444 uintptr_t i; 
445 446 for (i = 0; i < opr_sz / 4; ++i) { 447 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 448 } 449 clear_tail(d, opr_sz, simd_maxsz(desc)); 450 } 451 452 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 453 void *vq, uint32_t desc) 454 { 455 intptr_t i, opr_sz = simd_oprsz(desc); 456 int32_t *d = vd, *n = vn, *m = vm; 457 458 for (i = 0; i < opr_sz / 4; ++i) { 459 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 460 } 461 clear_tail(d, opr_sz, simd_maxsz(desc)); 462 } 463 464 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 465 void *vq, uint32_t desc) 466 { 467 intptr_t i, opr_sz = simd_oprsz(desc); 468 int32_t *d = vd, *n = vn, *m = vm; 469 470 for (i = 0; i < opr_sz / 4; ++i) { 471 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 472 } 473 clear_tail(d, opr_sz, simd_maxsz(desc)); 474 } 475 476 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 477 void *va, uint32_t desc) 478 { 479 intptr_t i, opr_sz = simd_oprsz(desc); 480 int32_t *d = vd, *n = vn, *m = vm, *a = va; 481 uint32_t discard; 482 483 for (i = 0; i < opr_sz / 4; ++i) { 484 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 485 } 486 } 487 488 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 489 void *va, uint32_t desc) 490 { 491 intptr_t i, opr_sz = simd_oprsz(desc); 492 int32_t *d = vd, *n = vn, *m = vm, *a = va; 493 uint32_t discard; 494 495 for (i = 0; i < opr_sz / 4; ++i) { 496 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 497 } 498 } 499 500 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 501 { 502 intptr_t i, opr_sz = simd_oprsz(desc); 503 int32_t *d = vd, *n = vn, *m = vm; 504 uint32_t discard; 505 506 for (i = 0; i < opr_sz / 4; ++i) { 507 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 508 } 509 } 510 511 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 512 { 513 intptr_t i, opr_sz = simd_oprsz(desc); 514 int32_t *d = vd, *n = vn, *m = vm; 
515 uint32_t discard; 516 517 for (i = 0; i < opr_sz / 4; ++i) { 518 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 519 } 520 } 521 522 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 523 { 524 intptr_t i, j, opr_sz = simd_oprsz(desc); 525 int idx = simd_data(desc); 526 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 527 uint32_t discard; 528 529 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 530 int32_t mm = m[i]; 531 for (j = 0; j < 16 / 4; ++j) { 532 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 533 } 534 } 535 } 536 537 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 538 { 539 intptr_t i, j, opr_sz = simd_oprsz(desc); 540 int idx = simd_data(desc); 541 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 542 uint32_t discard; 543 544 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 545 int32_t mm = m[i]; 546 for (j = 0; j < 16 / 4; ++j) { 547 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 548 } 549 } 550 } 551 552 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 553 static int64_t do_sat128_d(Int128 r) 554 { 555 int64_t ls = int128_getlo(r); 556 int64_t hs = int128_gethi(r); 557 558 if (unlikely(hs != (ls >> 63))) { 559 return hs < 0 ? INT64_MIN : INT64_MAX; 560 } 561 return ls; 562 } 563 564 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 565 { 566 uint64_t l, h; 567 Int128 r, t; 568 569 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. 
*/ 570 muls64(&l, &h, m, n); 571 r = int128_make128(l, h); 572 if (neg) { 573 r = int128_neg(r); 574 } 575 if (a) { 576 t = int128_exts64(a); 577 t = int128_lshift(t, 63); 578 r = int128_add(r, t); 579 } 580 if (round) { 581 t = int128_exts64(1ll << 62); 582 r = int128_add(r, t); 583 } 584 r = int128_rshift(r, 63); 585 586 return do_sat128_d(r); 587 } 588 589 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 590 void *va, uint32_t desc) 591 { 592 intptr_t i, opr_sz = simd_oprsz(desc); 593 int64_t *d = vd, *n = vn, *m = vm, *a = va; 594 595 for (i = 0; i < opr_sz / 8; ++i) { 596 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 597 } 598 } 599 600 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 601 void *va, uint32_t desc) 602 { 603 intptr_t i, opr_sz = simd_oprsz(desc); 604 int64_t *d = vd, *n = vn, *m = vm, *a = va; 605 606 for (i = 0; i < opr_sz / 8; ++i) { 607 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 608 } 609 } 610 611 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 612 { 613 intptr_t i, opr_sz = simd_oprsz(desc); 614 int64_t *d = vd, *n = vn, *m = vm; 615 616 for (i = 0; i < opr_sz / 8; ++i) { 617 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 618 } 619 } 620 621 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 622 { 623 intptr_t i, opr_sz = simd_oprsz(desc); 624 int64_t *d = vd, *n = vn, *m = vm; 625 626 for (i = 0; i < opr_sz / 8; ++i) { 627 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 628 } 629 } 630 631 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 632 { 633 intptr_t i, j, opr_sz = simd_oprsz(desc); 634 int idx = simd_data(desc); 635 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 636 637 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 638 int64_t mm = m[i]; 639 for (j = 0; j < 16 / 8; ++j) { 640 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 641 } 642 } 643 } 644 645 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void 
*vm, uint32_t desc) 646 { 647 intptr_t i, j, opr_sz = simd_oprsz(desc); 648 int idx = simd_data(desc); 649 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 650 651 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 652 int64_t mm = m[i]; 653 for (j = 0; j < 16 / 8; ++j) { 654 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 655 } 656 } 657 } 658 659 /* Integer 8 and 16-bit dot-product. 660 * 661 * Note that for the loops herein, host endianness does not matter 662 * with respect to the ordering of data within the quad-width lanes. 663 * All elements are treated equally, no matter where they are. 664 */ 665 666 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 667 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 668 { \ 669 intptr_t i, opr_sz = simd_oprsz(desc); \ 670 TYPED *d = vd, *a = va; \ 671 TYPEN *n = vn; \ 672 TYPEM *m = vm; \ 673 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 674 d[i] = (a[i] + \ 675 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 676 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 677 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 678 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 679 } \ 680 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 681 } 682 683 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 684 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 685 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 686 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 687 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 688 689 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 690 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 691 { \ 692 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 693 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 694 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 695 intptr_t index = simd_data(desc); \ 696 TYPED *d = vd, *a = va; \ 697 TYPEN *n = vn; \ 698 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 699 do { \ 700 TYPED m0 = m_indexed[i * 4 + 0]; \ 701 TYPED m1 = m_indexed[i * 4 + 1]; \ 702 TYPED m2 = 
m_indexed[i * 4 + 2]; \ 703 TYPED m3 = m_indexed[i * 4 + 3]; \ 704 do { \ 705 d[i] = (a[i] + \ 706 n[i * 4 + 0] * m0 + \ 707 n[i * 4 + 1] * m1 + \ 708 n[i * 4 + 2] * m2 + \ 709 n[i * 4 + 3] * m3); \ 710 } while (++i < segend); \ 711 segend = i + 4; \ 712 } while (i < opr_sz_n); \ 713 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 714 } 715 716 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 717 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 718 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 719 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 720 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 721 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 722 723 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 724 void *vfpst, uint32_t desc) 725 { 726 uintptr_t opr_sz = simd_oprsz(desc); 727 float16 *d = vd; 728 float16 *n = vn; 729 float16 *m = vm; 730 float_status *fpst = vfpst; 731 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 732 uint32_t neg_imag = neg_real ^ 1; 733 uintptr_t i; 734 735 /* Shift boolean to the sign bit so we can xor to negate. */ 736 neg_real <<= 15; 737 neg_imag <<= 15; 738 739 for (i = 0; i < opr_sz / 2; i += 2) { 740 float16 e0 = n[H2(i)]; 741 float16 e1 = m[H2(i + 1)] ^ neg_imag; 742 float16 e2 = n[H2(i + 1)]; 743 float16 e3 = m[H2(i)] ^ neg_real; 744 745 d[H2(i)] = float16_add(e0, e1, fpst); 746 d[H2(i + 1)] = float16_add(e2, e3, fpst); 747 } 748 clear_tail(d, opr_sz, simd_maxsz(desc)); 749 } 750 751 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 752 void *vfpst, uint32_t desc) 753 { 754 uintptr_t opr_sz = simd_oprsz(desc); 755 float32 *d = vd; 756 float32 *n = vn; 757 float32 *m = vm; 758 float_status *fpst = vfpst; 759 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 760 uint32_t neg_imag = neg_real ^ 1; 761 uintptr_t i; 762 763 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 764 neg_real <<= 31; 765 neg_imag <<= 31; 766 767 for (i = 0; i < opr_sz / 4; i += 2) { 768 float32 e0 = n[H4(i)]; 769 float32 e1 = m[H4(i + 1)] ^ neg_imag; 770 float32 e2 = n[H4(i + 1)]; 771 float32 e3 = m[H4(i)] ^ neg_real; 772 773 d[H4(i)] = float32_add(e0, e1, fpst); 774 d[H4(i + 1)] = float32_add(e2, e3, fpst); 775 } 776 clear_tail(d, opr_sz, simd_maxsz(desc)); 777 } 778 779 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 780 void *vfpst, uint32_t desc) 781 { 782 uintptr_t opr_sz = simd_oprsz(desc); 783 float64 *d = vd; 784 float64 *n = vn; 785 float64 *m = vm; 786 float_status *fpst = vfpst; 787 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 788 uint64_t neg_imag = neg_real ^ 1; 789 uintptr_t i; 790 791 /* Shift boolean to the sign bit so we can xor to negate. */ 792 neg_real <<= 63; 793 neg_imag <<= 63; 794 795 for (i = 0; i < opr_sz / 8; i += 2) { 796 float64 e0 = n[i]; 797 float64 e1 = m[i + 1] ^ neg_imag; 798 float64 e2 = n[i + 1]; 799 float64 e3 = m[i] ^ neg_real; 800 801 d[i] = float64_add(e0, e1, fpst); 802 d[i + 1] = float64_add(e2, e3, fpst); 803 } 804 clear_tail(d, opr_sz, simd_maxsz(desc)); 805 } 806 807 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 808 void *vfpst, uint32_t desc) 809 { 810 uintptr_t opr_sz = simd_oprsz(desc); 811 float16 *d = vd, *n = vn, *m = vm, *a = va; 812 float_status *fpst = vfpst; 813 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 814 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 815 uint32_t neg_real = flip ^ neg_imag; 816 uintptr_t i; 817 818 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 819 neg_real <<= 15; 820 neg_imag <<= 15; 821 822 for (i = 0; i < opr_sz / 2; i += 2) { 823 float16 e2 = n[H2(i + flip)]; 824 float16 e1 = m[H2(i + flip)] ^ neg_real; 825 float16 e4 = e2; 826 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 827 828 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 829 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 830 } 831 clear_tail(d, opr_sz, simd_maxsz(desc)); 832 } 833 834 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 835 void *vfpst, uint32_t desc) 836 { 837 uintptr_t opr_sz = simd_oprsz(desc); 838 float16 *d = vd, *n = vn, *m = vm, *a = va; 839 float_status *fpst = vfpst; 840 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 841 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 842 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 843 uint32_t neg_real = flip ^ neg_imag; 844 intptr_t elements = opr_sz / sizeof(float16); 845 intptr_t eltspersegment = 16 / sizeof(float16); 846 intptr_t i, j; 847 848 /* Shift boolean to the sign bit so we can xor to negate. */ 849 neg_real <<= 15; 850 neg_imag <<= 15; 851 852 for (i = 0; i < elements; i += eltspersegment) { 853 float16 mr = m[H2(i + 2 * index + 0)]; 854 float16 mi = m[H2(i + 2 * index + 1)]; 855 float16 e1 = neg_real ^ (flip ? mi : mr); 856 float16 e3 = neg_imag ^ (flip ? 
mr : mi); 857 858 for (j = i; j < i + eltspersegment; j += 2) { 859 float16 e2 = n[H2(j + flip)]; 860 float16 e4 = e2; 861 862 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 863 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 864 } 865 } 866 clear_tail(d, opr_sz, simd_maxsz(desc)); 867 } 868 869 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 870 void *vfpst, uint32_t desc) 871 { 872 uintptr_t opr_sz = simd_oprsz(desc); 873 float32 *d = vd, *n = vn, *m = vm, *a = va; 874 float_status *fpst = vfpst; 875 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 876 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 877 uint32_t neg_real = flip ^ neg_imag; 878 uintptr_t i; 879 880 /* Shift boolean to the sign bit so we can xor to negate. */ 881 neg_real <<= 31; 882 neg_imag <<= 31; 883 884 for (i = 0; i < opr_sz / 4; i += 2) { 885 float32 e2 = n[H4(i + flip)]; 886 float32 e1 = m[H4(i + flip)] ^ neg_real; 887 float32 e4 = e2; 888 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 889 890 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 891 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 892 } 893 clear_tail(d, opr_sz, simd_maxsz(desc)); 894 } 895 896 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 897 void *vfpst, uint32_t desc) 898 { 899 uintptr_t opr_sz = simd_oprsz(desc); 900 float32 *d = vd, *n = vn, *m = vm, *a = va; 901 float_status *fpst = vfpst; 902 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 903 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 904 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 905 uint32_t neg_real = flip ^ neg_imag; 906 intptr_t elements = opr_sz / sizeof(float32); 907 intptr_t eltspersegment = 16 / sizeof(float32); 908 intptr_t i, j; 909 910 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 911 neg_real <<= 31; 912 neg_imag <<= 31; 913 914 for (i = 0; i < elements; i += eltspersegment) { 915 float32 mr = m[H4(i + 2 * index + 0)]; 916 float32 mi = m[H4(i + 2 * index + 1)]; 917 float32 e1 = neg_real ^ (flip ? mi : mr); 918 float32 e3 = neg_imag ^ (flip ? mr : mi); 919 920 for (j = i; j < i + eltspersegment; j += 2) { 921 float32 e2 = n[H4(j + flip)]; 922 float32 e4 = e2; 923 924 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 925 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 926 } 927 } 928 clear_tail(d, opr_sz, simd_maxsz(desc)); 929 } 930 931 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 932 void *vfpst, uint32_t desc) 933 { 934 uintptr_t opr_sz = simd_oprsz(desc); 935 float64 *d = vd, *n = vn, *m = vm, *a = va; 936 float_status *fpst = vfpst; 937 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 938 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 939 uint64_t neg_real = flip ^ neg_imag; 940 uintptr_t i; 941 942 /* Shift boolean to the sign bit so we can xor to negate. */ 943 neg_real <<= 63; 944 neg_imag <<= 63; 945 946 for (i = 0; i < opr_sz / 8; i += 2) { 947 float64 e2 = n[i + flip]; 948 float64 e1 = m[i + flip] ^ neg_real; 949 float64 e4 = e2; 950 float64 e3 = m[i + 1 - flip] ^ neg_imag; 951 952 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 953 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 954 } 955 clear_tail(d, opr_sz, simd_maxsz(desc)); 956 } 957 958 /* 959 * Floating point comparisons producing an integer result (all 1s or all 0s). 960 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 961 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
962 */ 963 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 964 { 965 return -float16_eq_quiet(op1, op2, stat); 966 } 967 968 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 969 { 970 return -float32_eq_quiet(op1, op2, stat); 971 } 972 973 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 974 { 975 return -float16_le(op2, op1, stat); 976 } 977 978 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 979 { 980 return -float32_le(op2, op1, stat); 981 } 982 983 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 984 { 985 return -float16_lt(op2, op1, stat); 986 } 987 988 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 989 { 990 return -float32_lt(op2, op1, stat); 991 } 992 993 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 994 { 995 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 996 } 997 998 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 999 { 1000 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1001 } 1002 1003 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1004 { 1005 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1006 } 1007 1008 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1009 { 1010 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1011 } 1012 1013 static int16_t vfp_tosszh(float16 x, void *fpstp) 1014 { 1015 float_status *fpst = fpstp; 1016 if (float16_is_any_nan(x)) { 1017 float_raise(float_flag_invalid, fpst); 1018 return 0; 1019 } 1020 return float16_to_int16_round_to_zero(x, fpst); 1021 } 1022 1023 static uint16_t vfp_touszh(float16 x, void *fpstp) 1024 { 1025 float_status *fpst = fpstp; 1026 if (float16_is_any_nan(x)) { 1027 float_raise(float_flag_invalid, fpst); 1028 return 0; 1029 } 1030 return float16_to_uint16_round_to_zero(x, fpst); 1031 } 
1032 1033 #define DO_2OP(NAME, FUNC, TYPE) \ 1034 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1035 { \ 1036 intptr_t i, oprsz = simd_oprsz(desc); \ 1037 TYPE *d = vd, *n = vn; \ 1038 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1039 d[i] = FUNC(n[i], stat); \ 1040 } \ 1041 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1042 } 1043 1044 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1045 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1046 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1047 1048 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1049 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1050 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1051 1052 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1053 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1054 1055 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1056 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1057 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1058 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1059 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1060 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1061 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1062 DO_2OP(gvec_touszh, vfp_touszh, float16) 1063 1064 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1065 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1066 { \ 1067 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1068 } 1069 1070 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1071 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1072 { \ 1073 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1074 } 1075 1076 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1077 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1078 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1079 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1080 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1081 1082 DO_2OP_CMP0(cgt, cgt, FWD) 1083 DO_2OP_CMP0(cge, cge, FWD) 1084 DO_2OP_CMP0(ceq, ceq, FWD) 1085 DO_2OP_CMP0(clt, cgt, REV) 1086 DO_2OP_CMP0(cle, cge, REV) 
1087 1088 #undef DO_2OP 1089 #undef DO_2OP_CMP0 1090 1091 /* Floating-point trigonometric starting value. 1092 * See the ARM ARM pseudocode function FPTrigSMul. 1093 */ 1094 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1095 { 1096 float16 result = float16_mul(op1, op1, stat); 1097 if (!float16_is_any_nan(result)) { 1098 result = float16_set_sign(result, op2 & 1); 1099 } 1100 return result; 1101 } 1102 1103 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1104 { 1105 float32 result = float32_mul(op1, op1, stat); 1106 if (!float32_is_any_nan(result)) { 1107 result = float32_set_sign(result, op2 & 1); 1108 } 1109 return result; 1110 } 1111 1112 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1113 { 1114 float64 result = float64_mul(op1, op1, stat); 1115 if (!float64_is_any_nan(result)) { 1116 result = float64_set_sign(result, op2 & 1); 1117 } 1118 return result; 1119 } 1120 1121 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1122 { 1123 return float16_abs(float16_sub(op1, op2, stat)); 1124 } 1125 1126 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1127 { 1128 return float32_abs(float32_sub(op1, op2, stat)); 1129 } 1130 1131 /* 1132 * Reciprocal step. These are the AArch32 version which uses a 1133 * non-fused multiply-and-subtract. 
1134 */ 1135 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1136 { 1137 op1 = float16_squash_input_denormal(op1, stat); 1138 op2 = float16_squash_input_denormal(op2, stat); 1139 1140 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1141 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1142 return float16_two; 1143 } 1144 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1145 } 1146 1147 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1148 { 1149 op1 = float32_squash_input_denormal(op1, stat); 1150 op2 = float32_squash_input_denormal(op2, stat); 1151 1152 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1153 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1154 return float32_two; 1155 } 1156 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1157 } 1158 1159 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1160 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1161 { 1162 op1 = float16_squash_input_denormal(op1, stat); 1163 op2 = float16_squash_input_denormal(op2, stat); 1164 1165 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1166 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1167 return float16_one_point_five; 1168 } 1169 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1170 return float16_div(op1, float16_two, stat); 1171 } 1172 1173 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1174 { 1175 op1 = float32_squash_input_denormal(op1, stat); 1176 op2 = float32_squash_input_denormal(op2, stat); 1177 1178 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1179 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1180 return float32_one_point_five; 1181 } 1182 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1183 return float32_div(op1, float32_two, stat); 1184 } 1185 1186 #define DO_3OP(NAME, FUNC, TYPE) 
\ 1187 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1188 { \ 1189 intptr_t i, oprsz = simd_oprsz(desc); \ 1190 TYPE *d = vd, *n = vn, *m = vm; \ 1191 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1192 d[i] = FUNC(n[i], m[i], stat); \ 1193 } \ 1194 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1195 } 1196 1197 DO_3OP(gvec_fadd_h, float16_add, float16) 1198 DO_3OP(gvec_fadd_s, float32_add, float32) 1199 DO_3OP(gvec_fadd_d, float64_add, float64) 1200 1201 DO_3OP(gvec_fsub_h, float16_sub, float16) 1202 DO_3OP(gvec_fsub_s, float32_sub, float32) 1203 DO_3OP(gvec_fsub_d, float64_sub, float64) 1204 1205 DO_3OP(gvec_fmul_h, float16_mul, float16) 1206 DO_3OP(gvec_fmul_s, float32_mul, float32) 1207 DO_3OP(gvec_fmul_d, float64_mul, float64) 1208 1209 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1210 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1211 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1212 1213 DO_3OP(gvec_fabd_h, float16_abd, float16) 1214 DO_3OP(gvec_fabd_s, float32_abd, float32) 1215 1216 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1217 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1218 1219 DO_3OP(gvec_fcge_h, float16_cge, float16) 1220 DO_3OP(gvec_fcge_s, float32_cge, float32) 1221 1222 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1223 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1224 1225 DO_3OP(gvec_facge_h, float16_acge, float16) 1226 DO_3OP(gvec_facge_s, float32_acge, float32) 1227 1228 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1229 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1230 1231 DO_3OP(gvec_fmax_h, float16_max, float16) 1232 DO_3OP(gvec_fmax_s, float32_max, float32) 1233 1234 DO_3OP(gvec_fmin_h, float16_min, float16) 1235 DO_3OP(gvec_fmin_s, float32_min, float32) 1236 1237 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1238 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1239 1240 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1241 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1242 1243 DO_3OP(gvec_recps_nf_h, 
float16_recps_nf, float16) 1244 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1245 1246 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1247 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1248 1249 #ifdef TARGET_AARCH64 1250 1251 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1252 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1253 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1254 1255 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1256 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1257 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1258 1259 #endif 1260 #undef DO_3OP 1261 1262 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1263 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1264 float_status *stat) 1265 { 1266 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1267 } 1268 1269 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1270 float_status *stat) 1271 { 1272 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1273 } 1274 1275 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1276 float_status *stat) 1277 { 1278 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1279 } 1280 1281 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1282 float_status *stat) 1283 { 1284 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1285 } 1286 1287 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1288 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1289 float_status *stat) 1290 { 1291 return float16_muladd(op1, op2, dest, 0, stat); 1292 } 1293 1294 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1295 float_status *stat) 1296 { 1297 return float32_muladd(op1, op2, dest, 0, stat); 1298 } 1299 1300 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1301 float_status *stat) 1302 { 1303 return 
float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1304 } 1305 1306 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1307 float_status *stat) 1308 { 1309 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1310 } 1311 1312 #define DO_MULADD(NAME, FUNC, TYPE) \ 1313 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1314 { \ 1315 intptr_t i, oprsz = simd_oprsz(desc); \ 1316 TYPE *d = vd, *n = vn, *m = vm; \ 1317 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1318 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1319 } \ 1320 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1321 } 1322 1323 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1324 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1325 1326 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1327 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1328 1329 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1330 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1331 1332 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1333 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1334 1335 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1336 * For AdvSIMD, there is of course only one such vector segment. 
1337 */ 1338 1339 #define DO_MUL_IDX(NAME, TYPE, H) \ 1340 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1341 { \ 1342 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1343 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1344 intptr_t idx = simd_data(desc); \ 1345 TYPE *d = vd, *n = vn, *m = vm; \ 1346 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1347 TYPE mm = m[H(i + idx)]; \ 1348 for (j = 0; j < segment; j++) { \ 1349 d[i + j] = n[i + j] * mm; \ 1350 } \ 1351 } \ 1352 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1353 } 1354 1355 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1356 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1357 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1358 1359 #undef DO_MUL_IDX 1360 1361 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1362 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1363 { \ 1364 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1365 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1366 intptr_t idx = simd_data(desc); \ 1367 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1368 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1369 TYPE mm = m[H(i + idx)]; \ 1370 for (j = 0; j < segment; j++) { \ 1371 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1372 } \ 1373 } \ 1374 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1375 } 1376 1377 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1378 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1379 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1380 1381 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1382 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1383 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1384 1385 #undef DO_MLA_IDX 1386 1387 #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ 1388 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1389 { \ 1390 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1391 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1392 intptr_t idx = simd_data(desc); \ 1393 TYPE *d = vd, *n = vn, *m = vm; \ 1394 for (i = 0; i < oprsz / 
sizeof(TYPE); i += segment) { \ 1395 TYPE mm = m[H(i + idx)]; \ 1396 for (j = 0; j < segment; j++) { \ 1397 d[i + j] = TYPE##_##ADD(d[i + j], \ 1398 TYPE##_mul(n[i + j], mm, stat), stat); \ 1399 } \ 1400 } \ 1401 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1402 } 1403 1404 #define float16_nop(N, M, S) (M) 1405 #define float32_nop(N, M, S) (M) 1406 #define float64_nop(N, M, S) (M) 1407 1408 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) 1409 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) 1410 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) 1411 1412 /* 1413 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1414 * the fused ops below they assume accumulate both from and into Vd. 1415 */ 1416 DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) 1417 DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) 1418 DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) 1419 DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) 1420 1421 #undef float16_nop 1422 #undef float32_nop 1423 #undef float64_nop 1424 #undef DO_FMUL_IDX 1425 1426 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1427 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1428 void *stat, uint32_t desc) \ 1429 { \ 1430 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1431 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1432 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1433 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1434 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1435 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1436 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1437 TYPE mm = m[H(i + idx)]; \ 1438 for (j = 0; j < segment; j++) { \ 1439 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1440 mm, a[i + j], 0, stat); \ 1441 } \ 1442 } \ 1443 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1444 } 1445 1446 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1447 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1448 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1449 1450 #undef DO_FMLA_IDX 1451 1452 #define DO_SAT(NAME, WTYPE, 
TYPEN, TYPEM, OP, MIN, MAX) \ 1453 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1454 { \ 1455 intptr_t i, oprsz = simd_oprsz(desc); \ 1456 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1457 bool q = false; \ 1458 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1459 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1460 if (dd < MIN) { \ 1461 dd = MIN; \ 1462 q = true; \ 1463 } else if (dd > MAX) { \ 1464 dd = MAX; \ 1465 q = true; \ 1466 } \ 1467 d[i] = dd; \ 1468 } \ 1469 if (q) { \ 1470 uint32_t *qc = vq; \ 1471 qc[0] = 1; \ 1472 } \ 1473 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1474 } 1475 1476 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1477 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1478 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1479 1480 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1481 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1482 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1483 1484 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1485 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1486 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1487 1488 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1489 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1490 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1491 1492 #undef DO_SAT 1493 1494 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1495 void *vm, uint32_t desc) 1496 { 1497 intptr_t i, oprsz = simd_oprsz(desc); 1498 uint64_t *d = vd, *n = vn, *m = vm; 1499 bool q = false; 1500 1501 for (i = 0; i < oprsz / 8; i++) { 1502 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1503 if (dd < nn) { 1504 dd = UINT64_MAX; 1505 q = true; 1506 } 1507 d[i] = dd; 1508 } 1509 if (q) { 1510 uint32_t *qc = vq; 1511 qc[0] = 1; 1512 } 1513 clear_tail(d, oprsz, 
simd_maxsz(desc)); 1514 } 1515 1516 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1517 void *vm, uint32_t desc) 1518 { 1519 intptr_t i, oprsz = simd_oprsz(desc); 1520 uint64_t *d = vd, *n = vn, *m = vm; 1521 bool q = false; 1522 1523 for (i = 0; i < oprsz / 8; i++) { 1524 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1525 if (nn < mm) { 1526 dd = 0; 1527 q = true; 1528 } 1529 d[i] = dd; 1530 } 1531 if (q) { 1532 uint32_t *qc = vq; 1533 qc[0] = 1; 1534 } 1535 clear_tail(d, oprsz, simd_maxsz(desc)); 1536 } 1537 1538 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1539 void *vm, uint32_t desc) 1540 { 1541 intptr_t i, oprsz = simd_oprsz(desc); 1542 int64_t *d = vd, *n = vn, *m = vm; 1543 bool q = false; 1544 1545 for (i = 0; i < oprsz / 8; i++) { 1546 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1547 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1548 dd = (nn >> 63) ^ ~INT64_MIN; 1549 q = true; 1550 } 1551 d[i] = dd; 1552 } 1553 if (q) { 1554 uint32_t *qc = vq; 1555 qc[0] = 1; 1556 } 1557 clear_tail(d, oprsz, simd_maxsz(desc)); 1558 } 1559 1560 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1561 void *vm, uint32_t desc) 1562 { 1563 intptr_t i, oprsz = simd_oprsz(desc); 1564 int64_t *d = vd, *n = vn, *m = vm; 1565 bool q = false; 1566 1567 for (i = 0; i < oprsz / 8; i++) { 1568 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1569 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1570 dd = (nn >> 63) ^ ~INT64_MIN; 1571 q = true; 1572 } 1573 d[i] = dd; 1574 } 1575 if (q) { 1576 uint32_t *qc = vq; 1577 qc[0] = 1; 1578 } 1579 clear_tail(d, oprsz, simd_maxsz(desc)); 1580 } 1581 1582 1583 #define DO_SRA(NAME, TYPE) \ 1584 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1585 { \ 1586 intptr_t i, oprsz = simd_oprsz(desc); \ 1587 int shift = simd_data(desc); \ 1588 TYPE *d = vd, *n = vn; \ 1589 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1590 d[i] += n[i] >> shift; \ 1591 } \ 1592 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1593 } 1594 1595 
/* Shift-right-and-accumulate helpers: signed element types, then unsigned. */
DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

/*
 * Rounding shift right by the immediate in simd_data(desc).
 * Each element is first shifted by (shift - 1); the low bit of that
 * intermediate, which the final one-bit shift discards, is added back
 * as the rounding increment.  Working from the intermediate keeps the
 * whole computation inside TYPE.
 */
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

/*
 * Rounding shift right and accumulate: the same rounded value as
 * DO_RSHR computes, added into the existing destination element.
 */
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

/*
 * Shift right and insert: n[i] >> shift is deposited into the low
 * (element width - shift) bits of d[i] via deposit64(); the remaining
 * high bits of d[i] are not part of the deposited field.
 * NOTE(review): a shift equal to the element width would give a
 * zero-length field -- presumably the caller never passes that; confirm.
 */
#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

/*
 * Shift left and insert: n[i] is deposited into d[i] starting at bit
 * `shift` with a field length of (element width - shift), via
 * deposit64(); the low `shift` bits of d[i] are outside that field.
 */
#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 *
 * f16:  the half-precision bit pattern in bits [15:0]; higher bits are
 *       ignored because only bits 0..15 are extracted.
 * fz16: when true, a denormal input becomes a zero of the same sign.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 10
                 * (the 11th bit, counting from zero), then remove that
                 * bit as the implicit bit of the normalized float32.
                 * Note that we still go through the shift for normal
                 * numbers below, to put the float32 fraction at the
                 * right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    /* Repack into the float32 layout: sign at 31, exponent at 23. */
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

/*
 * Fetch the four float16 inputs selected by is_q / is_2 as one 64-bit
 * value; element k is then found at bits [16k+15:16k] of the result.
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

/*
 * Widening fused multiply-add: d[i] += n[i] * m[i], where the inputs
 * are float16 values widened bit-for-bit to float32 and d holds float32
 * accumulators.  Descriptor data bit 0 (is_s) negates the vn inputs;
 * bit 1 (is_2) selects which half of the inputs load4_f16() fetches.
 */
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

/*
 * AArch32 entry point: arithmetic uses standard_fp_status, while the
 * flush-inputs-to-zero setting is read from fp_status_f16.
 */
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/*
 * AArch64 entry point: arithmetic uses fp_status, while the
 * flush-inputs-to-zero setting is read from fp_status_f16.
 */
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/*
 * SVE2 widening fused multiply-add with a separate addend va.
 * For every 32-bit destination element, one float16 is taken from vn
 * and one from vm at byte offset (i + sel), where sel is 0 or
 * sizeof(float16) according to descriptor data bit 1.  Descriptor data
 * bit 0 flips the sign of the vn input.  clear_tail() is not called here.
 */
void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}

/*
 * Indexed form of do_fmlal(): every vn input is multiplied by the single
 * float16 element of vm selected by the 3-bit index in descriptor data
 * bits [4:2].
 */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues.  */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.  */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    /* The multiplier is also read before any destination store. */
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

/* AArch32 entry point for the indexed form; uses standard_fp_status. */
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/* AArch64 entry point for the indexed form; uses fp_status. */
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

/*
 * SVE2 indexed widening fused multiply-add with a separate addend va.
 * The vector is processed in 16-byte segments; within each segment the
 * multiplier is the float16 at byte offset idx of vm, and is loaded once
 * before the segment's destination elements are written.
 * NOTE(review): the vm load uses (vm + i + idx) without the H1_2()
 * adjustment applied to the vn load -- confirm this is intended for
 * big-endian hosts.
 */
void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}

/*
 * Signed shift by a per-element signed count, byte elements.
 * A non-negative count shifts left, giving 0 once the count reaches 8.
 * A negative count shifts right arithmetically, with the shift amount
 * capped at 7.
 */
void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Signed shift by a per-element signed count, halfword elements.
 * Same rules as the byte form with limits 16 and 15; the count is the
 * low byte of each vm element.
 */
void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Unsigned shift by a per-element signed count, byte elements.
 * Counts in [0, 7] shift left and counts in [-7, -1] shift right
 * logically; any other count gives 0.
 */
void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Unsigned shift by a per-element signed count, halfword elements.
 * Counts in [0, 15] shift left and counts in [-15, -1] shift right
 * logically; any other count gives 0.
 */
void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
1986 */ 1987 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 1988 { 1989 intptr_t i, j, opr_sz = simd_oprsz(desc); 1990 uint64_t *d = vd, *n = vn, *m = vm; 1991 1992 for (i = 0; i < opr_sz / 8; ++i) { 1993 uint64_t nn = n[i]; 1994 uint64_t mm = m[i]; 1995 uint64_t rr = 0; 1996 1997 for (j = 0; j < 8; ++j) { 1998 uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; 1999 rr ^= mm & mask; 2000 mm = (mm << 1) & 0xfefefefefefefefeull; 2001 nn >>= 1; 2002 } 2003 d[i] = rr; 2004 } 2005 clear_tail(d, opr_sz, simd_maxsz(desc)); 2006 } 2007 2008 /* 2009 * 64x64->128 polynomial multiply. 2010 * Because of the lanes are not accessed in strict columns, 2011 * this probably cannot be turned into a generic helper. 2012 */ 2013 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2014 { 2015 intptr_t i, j, opr_sz = simd_oprsz(desc); 2016 intptr_t hi = simd_data(desc); 2017 uint64_t *d = vd, *n = vn, *m = vm; 2018 2019 for (i = 0; i < opr_sz / 8; i += 2) { 2020 uint64_t nn = n[i + hi]; 2021 uint64_t mm = m[i + hi]; 2022 uint64_t rhi = 0; 2023 uint64_t rlo = 0; 2024 2025 /* Bit 0 can only influence the low 64-bit result. */ 2026 if (nn & 1) { 2027 rlo = mm; 2028 } 2029 2030 for (j = 1; j < 64; ++j) { 2031 uint64_t mask = -((nn >> j) & 1); 2032 rlo ^= (mm << j) & mask; 2033 rhi ^= (mm >> (64 - j)) & mask; 2034 } 2035 d[i] = rlo; 2036 d[i + 1] = rhi; 2037 } 2038 clear_tail(d, opr_sz, simd_maxsz(desc)); 2039 } 2040 2041 /* 2042 * 8x8->16 polynomial multiply. 2043 * 2044 * The byte inputs are expanded to (or extracted from) half-words. 2045 * Note that neon and sve2 get the inputs from different positions. 2046 * This allows 4 bytes to be processed in parallel with uint64_t. 
2047 */ 2048 2049 static uint64_t expand_byte_to_half(uint64_t x) 2050 { 2051 return (x & 0x000000ff) 2052 | ((x & 0x0000ff00) << 8) 2053 | ((x & 0x00ff0000) << 16) 2054 | ((x & 0xff000000) << 24); 2055 } 2056 2057 uint64_t pmull_w(uint64_t op1, uint64_t op2) 2058 { 2059 uint64_t result = 0; 2060 int i; 2061 for (i = 0; i < 16; ++i) { 2062 uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; 2063 result ^= op2 & mask; 2064 op1 >>= 1; 2065 op2 <<= 1; 2066 } 2067 return result; 2068 } 2069 2070 uint64_t pmull_h(uint64_t op1, uint64_t op2) 2071 { 2072 uint64_t result = 0; 2073 int i; 2074 for (i = 0; i < 8; ++i) { 2075 uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; 2076 result ^= op2 & mask; 2077 op1 >>= 1; 2078 op2 <<= 1; 2079 } 2080 return result; 2081 } 2082 2083 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2084 { 2085 int hi = simd_data(desc); 2086 uint64_t *d = vd, *n = vn, *m = vm; 2087 uint64_t nn = n[hi], mm = m[hi]; 2088 2089 d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); 2090 nn >>= 32; 2091 mm >>= 32; 2092 d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); 2093 2094 clear_tail(d, 16, simd_maxsz(desc)); 2095 } 2096 2097 #ifdef TARGET_AARCH64 2098 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2099 { 2100 int shift = simd_data(desc) * 8; 2101 intptr_t i, opr_sz = simd_oprsz(desc); 2102 uint64_t *d = vd, *n = vn, *m = vm; 2103 2104 for (i = 0; i < opr_sz / 8; ++i) { 2105 uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; 2106 uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; 2107 2108 d[i] = pmull_h(nn, mm); 2109 } 2110 } 2111 2112 static uint64_t pmull_d(uint64_t op1, uint64_t op2) 2113 { 2114 uint64_t result = 0; 2115 int i; 2116 2117 for (i = 0; i < 32; ++i) { 2118 uint64_t mask = -((op1 >> i) & 1); 2119 result ^= (op2 << i) & mask; 2120 } 2121 return result; 2122 } 2123 2124 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2125 
{ 2126 intptr_t sel = H4(simd_data(desc)); 2127 intptr_t i, opr_sz = simd_oprsz(desc); 2128 uint32_t *n = vn, *m = vm; 2129 uint64_t *d = vd; 2130 2131 for (i = 0; i < opr_sz / 8; ++i) { 2132 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); 2133 } 2134 } 2135 #endif 2136 2137 #define DO_CMP0(NAME, TYPE, OP) \ 2138 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2139 { \ 2140 intptr_t i, opr_sz = simd_oprsz(desc); \ 2141 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2142 TYPE nn = *(TYPE *)(vn + i); \ 2143 *(TYPE *)(vd + i) = -(nn OP 0); \ 2144 } \ 2145 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2146 } 2147 2148 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2149 DO_CMP0(gvec_clt0_b, int8_t, <) 2150 DO_CMP0(gvec_cle0_b, int8_t, <=) 2151 DO_CMP0(gvec_cgt0_b, int8_t, >) 2152 DO_CMP0(gvec_cge0_b, int8_t, >=) 2153 2154 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2155 DO_CMP0(gvec_clt0_h, int16_t, <) 2156 DO_CMP0(gvec_cle0_h, int16_t, <=) 2157 DO_CMP0(gvec_cgt0_h, int16_t, >) 2158 DO_CMP0(gvec_cge0_h, int16_t, >=) 2159 2160 #undef DO_CMP0 2161 2162 #define DO_ABD(NAME, TYPE) \ 2163 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2164 { \ 2165 intptr_t i, opr_sz = simd_oprsz(desc); \ 2166 TYPE *d = vd, *n = vn, *m = vm; \ 2167 \ 2168 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2169 d[i] = n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2170 } \ 2171 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2172 } 2173 2174 DO_ABD(gvec_sabd_b, int8_t) 2175 DO_ABD(gvec_sabd_h, int16_t) 2176 DO_ABD(gvec_sabd_s, int32_t) 2177 DO_ABD(gvec_sabd_d, int64_t) 2178 2179 DO_ABD(gvec_uabd_b, uint8_t) 2180 DO_ABD(gvec_uabd_h, uint16_t) 2181 DO_ABD(gvec_uabd_s, uint32_t) 2182 DO_ABD(gvec_uabd_d, uint64_t) 2183 2184 #undef DO_ABD 2185 2186 #define DO_ABA(NAME, TYPE) \ 2187 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2188 { \ 2189 intptr_t i, opr_sz = simd_oprsz(desc); \ 2190 TYPE *d = vd, *n = vn, *m = vm; \ 2191 \ 2192 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2193 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2194 } \ 2195 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2196 } 2197 2198 DO_ABA(gvec_saba_b, int8_t) 2199 DO_ABA(gvec_saba_h, int16_t) 2200 DO_ABA(gvec_saba_s, int32_t) 2201 DO_ABA(gvec_saba_d, int64_t) 2202 2203 DO_ABA(gvec_uaba_b, uint8_t) 2204 DO_ABA(gvec_uaba_h, uint16_t) 2205 DO_ABA(gvec_uaba_s, uint32_t) 2206 DO_ABA(gvec_uaba_d, uint64_t) 2207 2208 #undef DO_ABA 2209 2210 #define DO_NEON_PAIRWISE(NAME, OP) \ 2211 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ 2212 void *stat, uint32_t oprsz) \ 2213 { \ 2214 float_status *fpst = stat; \ 2215 float32 *d = vd; \ 2216 float32 *n = vn; \ 2217 float32 *m = vm; \ 2218 float32 r0, r1; \ 2219 \ 2220 /* Read all inputs before writing outputs in case vm == vd */ \ 2221 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ 2222 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ 2223 \ 2224 d[H4(0)] = r0; \ 2225 d[H4(1)] = r1; \ 2226 } \ 2227 \ 2228 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ 2229 void *stat, uint32_t oprsz) \ 2230 { \ 2231 float_status *fpst = stat; \ 2232 float16 *d = vd; \ 2233 float16 *n = vn; \ 2234 float16 *m = vm; \ 2235 float16 r0, r1, r2, r3; \ 2236 \ 2237 /* Read all inputs before writing outputs in case vm == vd */ \ 2238 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ 2239 r1 = 
float16_##OP(n[H2(2)], n[H2(3)], fpst); \ 2240 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ 2241 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ 2242 \ 2243 d[H2(0)] = r0; \ 2244 d[H2(1)] = r1; \ 2245 d[H2(2)] = r2; \ 2246 d[H2(3)] = r3; \ 2247 } 2248 2249 DO_NEON_PAIRWISE(neon_padd, add) 2250 DO_NEON_PAIRWISE(neon_pmax, max) 2251 DO_NEON_PAIRWISE(neon_pmin, min) 2252 2253 #undef DO_NEON_PAIRWISE 2254 2255 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2256 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2257 { \ 2258 intptr_t i, oprsz = simd_oprsz(desc); \ 2259 int shift = simd_data(desc); \ 2260 TYPE *d = vd, *n = vn; \ 2261 float_status *fpst = stat; \ 2262 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2263 d[i] = FUNC(n[i], shift, fpst); \ 2264 } \ 2265 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2266 } 2267 2268 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2269 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2270 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2271 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2272 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2273 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2274 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2275 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2276 2277 #undef DO_VCVT_FIXED 2278 2279 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2280 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2281 { \ 2282 float_status *fpst = stat; \ 2283 intptr_t i, oprsz = simd_oprsz(desc); \ 2284 uint32_t rmode = simd_data(desc); \ 2285 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2286 TYPE *d = vd, *n = vn; \ 2287 set_float_rounding_mode(rmode, fpst); \ 2288 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2289 d[i] = FUNC(n[i], 0, fpst); \ 2290 } \ 2291 set_float_rounding_mode(prev_rmode, fpst); \ 2292 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2293 
} 2294 2295 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2296 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2297 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2298 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2299 2300 #undef DO_VCVT_RMODE 2301 2302 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2303 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2304 { \ 2305 float_status *fpst = stat; \ 2306 intptr_t i, oprsz = simd_oprsz(desc); \ 2307 uint32_t rmode = simd_data(desc); \ 2308 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2309 TYPE *d = vd, *n = vn; \ 2310 set_float_rounding_mode(rmode, fpst); \ 2311 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2312 d[i] = FUNC(n[i], fpst); \ 2313 } \ 2314 set_float_rounding_mode(prev_rmode, fpst); \ 2315 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2316 } 2317 2318 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2319 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2320 2321 #undef DO_VRINT_RMODE 2322 2323 #ifdef TARGET_AARCH64 2324 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2325 { 2326 const uint8_t *indices = vm; 2327 CPUARMState *env = venv; 2328 size_t oprsz = simd_oprsz(desc); 2329 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2330 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2331 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2332 union { 2333 uint8_t b[16]; 2334 uint64_t d[2]; 2335 } result; 2336 2337 /* 2338 * We must construct the final result in a temp, lest the output 2339 * overlaps the input table. For TBL, begin with zero; for TBX, 2340 * begin with the original register contents. Note that we always 2341 * copy 16 bytes here to avoid an extra branch; clearing the high 2342 * bits of the register for oprsz == 8 is handled below. 
2343 */ 2344 if (is_tbx) { 2345 memcpy(&result, vd, 16); 2346 } else { 2347 memset(&result, 0, 16); 2348 } 2349 2350 for (size_t i = 0; i < oprsz; ++i) { 2351 uint32_t index = indices[H1(i)]; 2352 2353 if (index < table_len) { 2354 /* 2355 * Convert index (a byte offset into the virtual table 2356 * which is a series of 128-bit vectors concatenated) 2357 * into the correct register element, bearing in mind 2358 * that the table can wrap around from V31 to V0. 2359 */ 2360 const uint8_t *table = (const uint8_t *) 2361 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2362 result.b[H1(i)] = table[H1(index % 16)]; 2363 } 2364 } 2365 2366 memcpy(vd, &result, 16); 2367 clear_tail(vd, oprsz, simd_maxsz(desc)); 2368 } 2369 #endif 2370 2371 /* 2372 * NxN -> N highpart multiply 2373 * 2374 * TODO: expose this as a generic vector operation. 2375 */ 2376 2377 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2378 { 2379 intptr_t i, opr_sz = simd_oprsz(desc); 2380 int8_t *d = vd, *n = vn, *m = vm; 2381 2382 for (i = 0; i < opr_sz; ++i) { 2383 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2384 } 2385 clear_tail(d, opr_sz, simd_maxsz(desc)); 2386 } 2387 2388 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2389 { 2390 intptr_t i, opr_sz = simd_oprsz(desc); 2391 int16_t *d = vd, *n = vn, *m = vm; 2392 2393 for (i = 0; i < opr_sz / 2; ++i) { 2394 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2395 } 2396 clear_tail(d, opr_sz, simd_maxsz(desc)); 2397 } 2398 2399 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2400 { 2401 intptr_t i, opr_sz = simd_oprsz(desc); 2402 int32_t *d = vd, *n = vn, *m = vm; 2403 2404 for (i = 0; i < opr_sz / 4; ++i) { 2405 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2406 } 2407 clear_tail(d, opr_sz, simd_maxsz(desc)); 2408 } 2409 2410 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2411 { 2412 intptr_t i, opr_sz = simd_oprsz(desc); 2413 uint64_t *d = vd, *n = vn, *m = vm; 2414 uint64_t discard; 2415 
2416 for (i = 0; i < opr_sz / 8; ++i) { 2417 muls64(&discard, &d[i], n[i], m[i]); 2418 } 2419 clear_tail(d, opr_sz, simd_maxsz(desc)); 2420 } 2421 2422 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2423 { 2424 intptr_t i, opr_sz = simd_oprsz(desc); 2425 uint8_t *d = vd, *n = vn, *m = vm; 2426 2427 for (i = 0; i < opr_sz; ++i) { 2428 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2429 } 2430 clear_tail(d, opr_sz, simd_maxsz(desc)); 2431 } 2432 2433 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2434 { 2435 intptr_t i, opr_sz = simd_oprsz(desc); 2436 uint16_t *d = vd, *n = vn, *m = vm; 2437 2438 for (i = 0; i < opr_sz / 2; ++i) { 2439 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2440 } 2441 clear_tail(d, opr_sz, simd_maxsz(desc)); 2442 } 2443 2444 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2445 { 2446 intptr_t i, opr_sz = simd_oprsz(desc); 2447 uint32_t *d = vd, *n = vn, *m = vm; 2448 2449 for (i = 0; i < opr_sz / 4; ++i) { 2450 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2451 } 2452 clear_tail(d, opr_sz, simd_maxsz(desc)); 2453 } 2454 2455 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2456 { 2457 intptr_t i, opr_sz = simd_oprsz(desc); 2458 uint64_t *d = vd, *n = vn, *m = vm; 2459 uint64_t discard; 2460 2461 for (i = 0; i < opr_sz / 8; ++i) { 2462 mulu64(&discard, &d[i], n[i], m[i]); 2463 } 2464 clear_tail(d, opr_sz, simd_maxsz(desc)); 2465 } 2466 2467 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2468 { 2469 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2470 int shr = simd_data(desc); 2471 uint64_t *d = vd, *n = vn, *m = vm; 2472 2473 for (i = 0; i < opr_sz; ++i) { 2474 d[i] = ror64(n[i] ^ m[i], shr); 2475 } 2476 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2477 } 2478 2479 /* 2480 * Integer matrix-multiply accumulate 2481 */ 2482 2483 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2484 { 2485 int8_t *n = vn, *m = vm; 2486 2487 for (intptr_t k = 0; k < 8; 
++k) { 2488 sum += n[H1(k)] * m[H1(k)]; 2489 } 2490 return sum; 2491 } 2492 2493 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2494 { 2495 uint8_t *n = vn, *m = vm; 2496 2497 for (intptr_t k = 0; k < 8; ++k) { 2498 sum += n[H1(k)] * m[H1(k)]; 2499 } 2500 return sum; 2501 } 2502 2503 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2504 { 2505 uint8_t *n = vn; 2506 int8_t *m = vm; 2507 2508 for (intptr_t k = 0; k < 8; ++k) { 2509 sum += n[H1(k)] * m[H1(k)]; 2510 } 2511 return sum; 2512 } 2513 2514 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2515 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2516 { 2517 intptr_t seg, opr_sz = simd_oprsz(desc); 2518 2519 for (seg = 0; seg < opr_sz; seg += 16) { 2520 uint32_t *d = vd + seg; 2521 uint32_t *a = va + seg; 2522 uint32_t sum0, sum1, sum2, sum3; 2523 2524 /* 2525 * Process the entire segment at once, writing back the 2526 * results only after we've consumed all of the inputs. 2527 * 2528 * Key to indices by column: 2529 * i j i j 2530 */ 2531 sum0 = a[H4(0 + 0)]; 2532 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2533 sum1 = a[H4(0 + 1)]; 2534 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2535 sum2 = a[H4(2 + 0)]; 2536 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2537 sum3 = a[H4(2 + 1)]; 2538 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2539 2540 d[H4(0)] = sum0; 2541 d[H4(1)] = sum1; 2542 d[H4(2)] = sum2; 2543 d[H4(3)] = sum3; 2544 } 2545 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2546 } 2547 2548 #define DO_MMLA_B(NAME, INNER) \ 2549 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2550 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2551 2552 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2553 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2554 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2555 2556 /* 2557 * BFloat16 Dot Product 2558 */ 2559 2560 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2561 { 2562 /* FPCR is ignored for 
BFDOT and BFMMLA. */ 2563 float_status bf_status = { 2564 .tininess_before_rounding = float_tininess_before_rounding, 2565 .float_rounding_mode = float_round_to_odd_inf, 2566 .flush_to_zero = true, 2567 .flush_inputs_to_zero = true, 2568 .default_nan_mode = true, 2569 }; 2570 float32 t1, t2; 2571 2572 /* 2573 * Extract each BFloat16 from the element pair, and shift 2574 * them such that they become float32. 2575 */ 2576 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2577 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2578 t1 = float32_add(t1, t2, &bf_status); 2579 t1 = float32_add(sum, t1, &bf_status); 2580 2581 return t1; 2582 } 2583 2584 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2585 { 2586 intptr_t i, opr_sz = simd_oprsz(desc); 2587 float32 *d = vd, *a = va; 2588 uint32_t *n = vn, *m = vm; 2589 2590 for (i = 0; i < opr_sz / 4; ++i) { 2591 d[i] = bfdotadd(a[i], n[i], m[i]); 2592 } 2593 clear_tail(d, opr_sz, simd_maxsz(desc)); 2594 } 2595 2596 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2597 void *va, uint32_t desc) 2598 { 2599 intptr_t i, j, opr_sz = simd_oprsz(desc); 2600 intptr_t index = simd_data(desc); 2601 intptr_t elements = opr_sz / 4; 2602 intptr_t eltspersegment = MIN(16 / 4, elements); 2603 float32 *d = vd, *a = va; 2604 uint32_t *n = vn, *m = vm; 2605 2606 for (i = 0; i < elements; i += eltspersegment) { 2607 uint32_t m_idx = m[i + H4(index)]; 2608 2609 for (j = i; j < i + eltspersegment; j++) { 2610 d[j] = bfdotadd(a[j], n[j], m_idx); 2611 } 2612 } 2613 clear_tail(d, opr_sz, simd_maxsz(desc)); 2614 } 2615 2616 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2617 { 2618 intptr_t s, opr_sz = simd_oprsz(desc); 2619 float32 *d = vd, *a = va; 2620 uint32_t *n = vn, *m = vm; 2621 2622 for (s = 0; s < opr_sz / 4; s += 4) { 2623 float32 sum00, sum01, sum10, sum11; 2624 2625 /* 2626 * Process the entire segment at once, writing back the 2627 * results only 
after we've consumed all of the inputs. 2628 * 2629 * Key to indices by column: 2630 * i j i k j k 2631 */ 2632 sum00 = a[s + H4(0 + 0)]; 2633 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2634 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2635 2636 sum01 = a[s + H4(0 + 1)]; 2637 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2638 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2639 2640 sum10 = a[s + H4(2 + 0)]; 2641 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2642 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2643 2644 sum11 = a[s + H4(2 + 1)]; 2645 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2646 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2647 2648 d[s + H4(0 + 0)] = sum00; 2649 d[s + H4(0 + 1)] = sum01; 2650 d[s + H4(2 + 0)] = sum10; 2651 d[s + H4(2 + 1)] = sum11; 2652 } 2653 clear_tail(d, opr_sz, simd_maxsz(desc)); 2654 } 2655 2656 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2657 void *stat, uint32_t desc) 2658 { 2659 intptr_t i, opr_sz = simd_oprsz(desc); 2660 intptr_t sel = simd_data(desc); 2661 float32 *d = vd, *a = va; 2662 bfloat16 *n = vn, *m = vm; 2663 2664 for (i = 0; i < opr_sz / 4; ++i) { 2665 float32 nn = n[H2(i * 2 + sel)] << 16; 2666 float32 mm = m[H2(i * 2 + sel)] << 16; 2667 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2668 } 2669 clear_tail(d, opr_sz, simd_maxsz(desc)); 2670 } 2671 2672 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2673 void *va, void *stat, uint32_t desc) 2674 { 2675 intptr_t i, j, opr_sz = simd_oprsz(desc); 2676 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2677 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2678 intptr_t elements = opr_sz / 4; 2679 intptr_t eltspersegment = MIN(16 / 4, elements); 2680 float32 *d = vd, *a = va; 2681 bfloat16 *n = vn, *m = vm; 2682 2683 for (i = 0; i < elements; i += eltspersegment) { 2684 float32 m_idx = m[H2(2 
* i + index)] << 16; 2685 2686 for (j = i; j < i + eltspersegment; j++) { 2687 float32 n_j = n[H2(2 * j + sel)] << 16; 2688 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2689 } 2690 } 2691 clear_tail(d, opr_sz, simd_maxsz(desc)); 2692 } 2693 2694 #define DO_CLAMP(NAME, TYPE) \ 2695 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2696 { \ 2697 intptr_t i, opr_sz = simd_oprsz(desc); \ 2698 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2699 TYPE aa = *(TYPE *)(a + i); \ 2700 TYPE nn = *(TYPE *)(n + i); \ 2701 TYPE mm = *(TYPE *)(m + i); \ 2702 TYPE dd = MIN(MAX(aa, nn), mm); \ 2703 *(TYPE *)(d + i) = dd; \ 2704 } \ 2705 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2706 } 2707 2708 DO_CLAMP(gvec_sclamp_b, int8_t) 2709 DO_CLAMP(gvec_sclamp_h, int16_t) 2710 DO_CLAMP(gvec_sclamp_s, int32_t) 2711 DO_CLAMP(gvec_sclamp_d, int64_t) 2712 2713 DO_CLAMP(gvec_uclamp_b, uint8_t) 2714 DO_CLAMP(gvec_uclamp_h, uint16_t) 2715 DO_CLAMP(gvec_uclamp_s, uint32_t) 2716 DO_CLAMP(gvec_uclamp_d, uint64_t) 2717