/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
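
/*
 * Worked example: each set bit in the predicate byte used as the index
 * selects one 0xff byte of the result, so expand_pred_b_data[0x05] is
 * 0x0000000000ff00ff (bits 0 and 2 active expand to byte lanes 0 and 2).
 */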

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
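
/*
 * Worked example for the helpers above: SQRDMULH of two INT16_MIN
 * inputs, i.e. do_sqrdmlah_h(0x8000, 0x8000, 0, false, true, &qc),
 * computes (0x40000000 + 0x4000) >> 15 = 0x8000, which does not fit
 * in int16_t, so the result saturates to INT16_MAX and *sat is set.
 */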

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}
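
/*
 * In the _idx helpers below, as in the other indexed ops in this file,
 * the index selects one element per 128-bit segment of the vector, so
 * the multiplicand is re-read for each 16-byte chunk.
 */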

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}
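
/*
 * Illustration: do_sat128_d saturates when the 128-bit value does not
 * fit in int64_t.  For r == 2^63 the low half is 0x8000000000000000
 * (negative when viewed as int64_t) while the high half is 0, so
 * hs != (ls >> 63) and the result is clamped to INT64_MAX.
 */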

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
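
/*
 * Illustration of the expansion below: for gvec_sdot_b, each 32-bit
 * lane accumulates four adjacent signed-byte products, e.g. with
 * n = {1, 2, 3, 4} and m = {10, 20, 30, 40} in one lane,
 * d[0] = a[0] + 1*10 + 2*20 + 3*30 + 4*40 = a[0] + 300.
 */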

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + 4; \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
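
/*
 * The same pattern is used by all of the gvec_fcadd* helpers:
 * neg_real and neg_imag are a complementary pair derived from the
 * desc bit, so each element pair computes either
 *   d[re] = n[re] - m[im], d[im] = n[im] + m[re]   (neg_imag set)
 * or the same with the signs on the m operands swapped (neg_real set).
 */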

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
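
/*
 * Illustration of the FCMLA helpers above: the two desc bits supply
 * 'flip' and 'neg_imag', with neg_real = flip ^ neg_imag.  With
 * flip == 0 and no negation, each element pair computes
 *   d[re] = a[re] + n[re] * m[re], d[im] = a[im] + n[re] * m[im],
 * i.e. one half of a complex multiply-accumulate; the other desc
 * encodings select the other half and/or negate the m operands.
 */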

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

#undef DO_SAT
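
/*
 * Illustration of DO_SAT above: gvec_uqadd_b evaluates each sum in the
 * wider WTYPE, so e.g. 200 + 100 becomes 300, is clamped to UINT8_MAX
 * and sets QC.  The 64-bit helpers below have no wider type to use and
 * instead detect overflow by comparing the result with the operands.
 */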

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}


#define DO_SRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] += n[i] >> shift; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        TYPE tmp = n[i] >> (shift - 1); \
        d[i] = (tmp >> 1) + (tmp & 1); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
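
/*
 * Worked example for the rounding shifts above and below: URSHR with
 * shift == 2 and n == 7 computes tmp = 7 >> 1 = 3 and then
 * (3 >> 1) + (3 & 1) = 2; the bit shifted out at position shift-1 is
 * added back as the rounding increment.
 */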

#define DO_RSRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        TYPE tmp = n[i] >> (shift - 1); \
        d[i] += (tmp >> 1) + (tmp & 1); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    int shift = simd_data(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
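
/*
 * Illustration: float16 1.0 is 0x3c00 (sign 0, exp 15, frac 0); the
 * rebiasing above yields exp 15 - 15 + 127 = 127, so the result is the
 * float32 bit pattern 0x3f800000, i.e. 1.0, with no flags raised.
 */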
 */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
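     * Each 0x8000 lane of the constant toggles the float16 sign bit, so a
     * single XOR negates all four float16 multiplicands held in n_4.
     * A per-lane sketch of the same thing, using the extract64()/deposit64()
     * bit-field helpers already used elsewhere in this file (illustration
     * only, not the code actually executed):
     *
     *     for (int j = 0; j < 4; j++) {
     *         uint64_t lane = extract64(n_4, j * 16, 16);
     *         n_4 = deposit64(n_4, j * 16, 16, lane ^ 0x8000);
     *     }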
*/ 1895 if (is_s) { 1896 n_4 ^= 0x8000800080008000ull; 1897 } 1898 1899 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 1900 1901 for (i = 0; i < oprsz / 4; i++) { 1902 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1903 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1904 } 1905 clear_tail(d, oprsz, simd_maxsz(desc)); 1906 } 1907 1908 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 1909 void *venv, uint32_t desc) 1910 { 1911 CPUARMState *env = venv; 1912 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1913 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1914 } 1915 1916 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 1917 void *venv, uint32_t desc) 1918 { 1919 CPUARMState *env = venv; 1920 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 1921 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1922 } 1923 1924 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 1925 void *venv, uint32_t desc) 1926 { 1927 intptr_t i, j, oprsz = simd_oprsz(desc); 1928 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1929 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1930 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 1931 CPUARMState *env = venv; 1932 float_status *status = &env->vfp.fp_status; 1933 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1934 1935 for (i = 0; i < oprsz; i += 16) { 1936 float16 mm_16 = *(float16 *)(vm + i + idx); 1937 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1938 1939 for (j = 0; j < 16; j += sizeof(float32)) { 1940 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 1941 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1942 float32 aa = *(float32 *)(va + H1_4(i + j)); 1943 1944 *(float32 *)(vd + H1_4(i + j)) = 1945 float32_muladd(nn, mm, aa, 0, status); 1946 } 1947 } 1948 } 1949 1950 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1951 { 1952 intptr_t i, opr_sz = simd_oprsz(desc); 1953 int8_t *d = vd, *n = vn, *m = vm; 1954 1955 for (i = 0; i < opr_sz; ++i) { 1956 int8_t mm = m[i]; 1957 int8_t nn = n[i]; 1958 int8_t res = 0; 1959 if (mm >= 0) { 1960 if (mm < 8) { 1961 res = nn << mm; 1962 } 1963 } else { 1964 res = nn >> (mm > -8 ? -mm : 7); 1965 } 1966 d[i] = res; 1967 } 1968 clear_tail(d, opr_sz, simd_maxsz(desc)); 1969 } 1970 1971 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1972 { 1973 intptr_t i, opr_sz = simd_oprsz(desc); 1974 int16_t *d = vd, *n = vn, *m = vm; 1975 1976 for (i = 0; i < opr_sz / 2; ++i) { 1977 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1978 int16_t nn = n[i]; 1979 int16_t res = 0; 1980 if (mm >= 0) { 1981 if (mm < 16) { 1982 res = nn << mm; 1983 } 1984 } else { 1985 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];      /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
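 *
 * For reference, a bit-by-bit sketch of the carry-less product that the
 * clmul_64() routine computes for the helper below (illustration only;
 * the helpers use the clmul_*() routines, not this loop):
 *
 *     uint64_t lo = 0, hi = 0;
 *     for (int j = 0; j < 64; j++) {
 *         if ((m >> j) & 1) {
 *             lo ^= n << j;                   // partial product, low half
 *             hi ^= j ? n >> (64 - j) : 0;    // bits carried above bit 63
 *         }
 *     }
 *     // (hi:lo) is the 128-bit product of n and m over GF(2)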
2062 */ 2063 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2064 { 2065 intptr_t i, opr_sz = simd_oprsz(desc); 2066 intptr_t hi = simd_data(desc); 2067 uint64_t *d = vd, *n = vn, *m = vm; 2068 2069 for (i = 0; i < opr_sz / 8; i += 2) { 2070 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2071 d[i] = int128_getlo(r); 2072 d[i + 1] = int128_gethi(r); 2073 } 2074 clear_tail(d, opr_sz, simd_maxsz(desc)); 2075 } 2076 2077 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2078 { 2079 int hi = simd_data(desc); 2080 uint64_t *d = vd, *n = vn, *m = vm; 2081 uint64_t nn = n[hi], mm = m[hi]; 2082 2083 d[0] = clmul_8x4_packed(nn, mm); 2084 nn >>= 32; 2085 mm >>= 32; 2086 d[1] = clmul_8x4_packed(nn, mm); 2087 2088 clear_tail(d, 16, simd_maxsz(desc)); 2089 } 2090 2091 #ifdef TARGET_AARCH64 2092 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2093 { 2094 int shift = simd_data(desc) * 8; 2095 intptr_t i, opr_sz = simd_oprsz(desc); 2096 uint64_t *d = vd, *n = vn, *m = vm; 2097 2098 for (i = 0; i < opr_sz / 8; ++i) { 2099 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2100 } 2101 } 2102 2103 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2104 { 2105 intptr_t sel = H4(simd_data(desc)); 2106 intptr_t i, opr_sz = simd_oprsz(desc); 2107 uint32_t *n = vn, *m = vm; 2108 uint64_t *d = vd; 2109 2110 for (i = 0; i < opr_sz / 8; ++i) { 2111 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2112 } 2113 } 2114 #endif 2115 2116 #define DO_CMP0(NAME, TYPE, OP) \ 2117 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2118 { \ 2119 intptr_t i, opr_sz = simd_oprsz(desc); \ 2120 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2121 TYPE nn = *(TYPE *)(vn + i); \ 2122 *(TYPE *)(vd + i) = -(nn OP 0); \ 2123 } \ 2124 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2125 } 2126 2127 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2128 DO_CMP0(gvec_clt0_b, int8_t, <) 2129 DO_CMP0(gvec_cle0_b, int8_t, <=) 2130 DO_CMP0(gvec_cgt0_b, int8_t, >) 2131 DO_CMP0(gvec_cge0_b, int8_t, >=) 2132 2133 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2134 DO_CMP0(gvec_clt0_h, int16_t, <) 2135 DO_CMP0(gvec_cle0_h, int16_t, <=) 2136 DO_CMP0(gvec_cgt0_h, int16_t, >) 2137 DO_CMP0(gvec_cge0_h, int16_t, >=) 2138 2139 #undef DO_CMP0 2140 2141 #define DO_ABD(NAME, TYPE) \ 2142 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2143 { \ 2144 intptr_t i, opr_sz = simd_oprsz(desc); \ 2145 TYPE *d = vd, *n = vn, *m = vm; \ 2146 \ 2147 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2148 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2149 } \ 2150 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2151 } 2152 2153 DO_ABD(gvec_sabd_b, int8_t) 2154 DO_ABD(gvec_sabd_h, int16_t) 2155 DO_ABD(gvec_sabd_s, int32_t) 2156 DO_ABD(gvec_sabd_d, int64_t) 2157 2158 DO_ABD(gvec_uabd_b, uint8_t) 2159 DO_ABD(gvec_uabd_h, uint16_t) 2160 DO_ABD(gvec_uabd_s, uint32_t) 2161 DO_ABD(gvec_uabd_d, uint64_t) 2162 2163 #undef DO_ABD 2164 2165 #define DO_ABA(NAME, TYPE) \ 2166 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2167 { \ 2168 intptr_t i, opr_sz = simd_oprsz(desc); \ 2169 TYPE *d = vd, *n = vn, *m = vm; \ 2170 \ 2171 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2172 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2173 } \ 2174 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2175 } 2176 2177 DO_ABA(gvec_saba_b, int8_t) 2178 DO_ABA(gvec_saba_h, int16_t) 2179 DO_ABA(gvec_saba_s, int32_t) 2180 DO_ABA(gvec_saba_d, int64_t) 2181 2182 DO_ABA(gvec_uaba_b, uint8_t) 2183 DO_ABA(gvec_uaba_h, uint16_t) 2184 DO_ABA(gvec_uaba_s, uint32_t) 2185 DO_ABA(gvec_uaba_d, uint64_t) 2186 2187 #undef DO_ABA 2188 2189 #define DO_NEON_PAIRWISE(NAME, OP) \ 2190 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ 2191 void *stat, uint32_t oprsz) \ 2192 { \ 2193 float_status *fpst = stat; \ 2194 float32 *d = vd; \ 2195 float32 *n = vn; \ 2196 float32 *m = vm; \ 2197 float32 r0, r1; \ 2198 \ 2199 /* Read all inputs before writing outputs in case vm == vd */ \ 2200 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ 2201 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ 2202 \ 2203 d[H4(0)] = r0; \ 2204 d[H4(1)] = r1; \ 2205 } \ 2206 \ 2207 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ 2208 void *stat, uint32_t oprsz) \ 2209 { \ 2210 float_status *fpst = stat; \ 2211 float16 *d = vd; \ 2212 float16 *n = vn; \ 2213 float16 *m = vm; \ 2214 float16 r0, r1, r2, r3; \ 2215 \ 2216 /* Read all inputs before writing outputs in case vm == vd */ \ 2217 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ 2218 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ 2219 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ 2220 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ 2221 \ 2222 d[H2(0)] = r0; \ 2223 d[H2(1)] = r1; \ 2224 d[H2(2)] = r2; \ 2225 d[H2(3)] = r3; \ 2226 } 2227 2228 DO_NEON_PAIRWISE(neon_padd, add) 2229 DO_NEON_PAIRWISE(neon_pmax, max) 2230 DO_NEON_PAIRWISE(neon_pmin, min) 2231 2232 #undef DO_NEON_PAIRWISE 2233 2234 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2235 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2236 { \ 2237 intptr_t i, oprsz = simd_oprsz(desc); \ 2238 int shift = simd_data(desc); \ 2239 TYPE *d = vd, *n = vn; \ 2240 float_status *fpst = stat; \ 2241 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2242 d[i] = FUNC(n[i], shift, fpst); \ 2243 } \ 2244 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2245 } 2246 2247 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2248 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2249 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2250 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2251 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2252 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2253 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2254 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2255 2256 #undef DO_VCVT_FIXED 2257 2258 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2259 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2260 { \ 2261 float_status *fpst = stat; \ 2262 intptr_t i, oprsz = simd_oprsz(desc); \ 2263 uint32_t rmode = simd_data(desc); \ 2264 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2265 TYPE *d = vd, *n = vn; \ 2266 set_float_rounding_mode(rmode, fpst); \ 2267 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2268 d[i] = FUNC(n[i], 0, fpst); \ 2269 } \ 2270 set_float_rounding_mode(prev_rmode, fpst); \ 2271 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2272 } 2273 2274 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2275 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2276 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2277 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, 
uint16_t) 2278 2279 #undef DO_VCVT_RMODE 2280 2281 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2282 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2283 { \ 2284 float_status *fpst = stat; \ 2285 intptr_t i, oprsz = simd_oprsz(desc); \ 2286 uint32_t rmode = simd_data(desc); \ 2287 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2288 TYPE *d = vd, *n = vn; \ 2289 set_float_rounding_mode(rmode, fpst); \ 2290 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2291 d[i] = FUNC(n[i], fpst); \ 2292 } \ 2293 set_float_rounding_mode(prev_rmode, fpst); \ 2294 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2295 } 2296 2297 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2298 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2299 2300 #undef DO_VRINT_RMODE 2301 2302 #ifdef TARGET_AARCH64 2303 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2304 { 2305 const uint8_t *indices = vm; 2306 CPUARMState *env = venv; 2307 size_t oprsz = simd_oprsz(desc); 2308 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2309 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2310 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2311 union { 2312 uint8_t b[16]; 2313 uint64_t d[2]; 2314 } result; 2315 2316 /* 2317 * We must construct the final result in a temp, lest the output 2318 * overlaps the input table. For TBL, begin with zero; for TBX, 2319 * begin with the original register contents. Note that we always 2320 * copy 16 bytes here to avoid an extra branch; clearing the high 2321 * bits of the register for oprsz == 8 is handled below. 2322 */ 2323 if (is_tbx) { 2324 memcpy(&result, vd, 16); 2325 } else { 2326 memset(&result, 0, 16); 2327 } 2328 2329 for (size_t i = 0; i < oprsz; ++i) { 2330 uint32_t index = indices[H1(i)]; 2331 2332 if (index < table_len) { 2333 /* 2334 * Convert index (a byte offset into the virtual table 2335 * which is a series of 128-bit vectors concatenated) 2336 * into the correct register element, bearing in mind 2337 * that the table can wrap around from V31 to V0. 2338 */ 2339 const uint8_t *table = (const uint8_t *) 2340 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2341 result.b[H1(i)] = table[H1(index % 16)]; 2342 } 2343 } 2344 2345 memcpy(vd, &result, 16); 2346 clear_tail(vd, oprsz, simd_maxsz(desc)); 2347 } 2348 #endif 2349 2350 /* 2351 * NxN -> N highpart multiply 2352 * 2353 * TODO: expose this as a generic vector operation. 
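 *
 * For element sizes below 64 bits the helpers simply widen, multiply and
 * keep the top half; only the 64-bit case needs the 128-bit
 * muls64()/mulu64() primitives, since there is no wider integer type to
 * widen into.  A sketch of the per-element step for signed bytes
 * (matching gvec_smulh_b below):
 *
 *     int32_t full = (int32_t)n[i] * m[i];   // at most 16 significant bits
 *     d[i] = full >> 8;                      // keep the high part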
2354 */ 2355 2356 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2357 { 2358 intptr_t i, opr_sz = simd_oprsz(desc); 2359 int8_t *d = vd, *n = vn, *m = vm; 2360 2361 for (i = 0; i < opr_sz; ++i) { 2362 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2363 } 2364 clear_tail(d, opr_sz, simd_maxsz(desc)); 2365 } 2366 2367 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2368 { 2369 intptr_t i, opr_sz = simd_oprsz(desc); 2370 int16_t *d = vd, *n = vn, *m = vm; 2371 2372 for (i = 0; i < opr_sz / 2; ++i) { 2373 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2374 } 2375 clear_tail(d, opr_sz, simd_maxsz(desc)); 2376 } 2377 2378 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2379 { 2380 intptr_t i, opr_sz = simd_oprsz(desc); 2381 int32_t *d = vd, *n = vn, *m = vm; 2382 2383 for (i = 0; i < opr_sz / 4; ++i) { 2384 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2385 } 2386 clear_tail(d, opr_sz, simd_maxsz(desc)); 2387 } 2388 2389 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2390 { 2391 intptr_t i, opr_sz = simd_oprsz(desc); 2392 uint64_t *d = vd, *n = vn, *m = vm; 2393 uint64_t discard; 2394 2395 for (i = 0; i < opr_sz / 8; ++i) { 2396 muls64(&discard, &d[i], n[i], m[i]); 2397 } 2398 clear_tail(d, opr_sz, simd_maxsz(desc)); 2399 } 2400 2401 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2402 { 2403 intptr_t i, opr_sz = simd_oprsz(desc); 2404 uint8_t *d = vd, *n = vn, *m = vm; 2405 2406 for (i = 0; i < opr_sz; ++i) { 2407 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2408 } 2409 clear_tail(d, opr_sz, simd_maxsz(desc)); 2410 } 2411 2412 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2413 { 2414 intptr_t i, opr_sz = simd_oprsz(desc); 2415 uint16_t *d = vd, *n = vn, *m = vm; 2416 2417 for (i = 0; i < opr_sz / 2; ++i) { 2418 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2419 } 2420 clear_tail(d, opr_sz, simd_maxsz(desc)); 2421 } 2422 2423 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2424 { 2425 intptr_t i, opr_sz = simd_oprsz(desc); 2426 uint32_t *d = vd, *n = vn, *m = vm; 2427 2428 for (i = 0; i < opr_sz / 4; ++i) { 2429 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2430 } 2431 clear_tail(d, opr_sz, simd_maxsz(desc)); 2432 } 2433 2434 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2435 { 2436 intptr_t i, opr_sz = simd_oprsz(desc); 2437 uint64_t *d = vd, *n = vn, *m = vm; 2438 uint64_t discard; 2439 2440 for (i = 0; i < opr_sz / 8; ++i) { 2441 mulu64(&discard, &d[i], n[i], m[i]); 2442 } 2443 clear_tail(d, opr_sz, simd_maxsz(desc)); 2444 } 2445 2446 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2447 { 2448 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2449 int shr = simd_data(desc); 2450 uint64_t *d = vd, *n = vn, *m = vm; 2451 2452 for (i = 0; i < opr_sz; ++i) { 2453 d[i] = ror64(n[i] ^ m[i], shr); 2454 } 2455 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2456 } 2457 2458 /* 2459 * Integer matrix-multiply accumulate 2460 */ 2461 2462 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2463 { 2464 int8_t *n = vn, *m = vm; 2465 2466 for (intptr_t k = 0; k < 8; ++k) { 2467 sum += n[H1(k)] * m[H1(k)]; 2468 } 2469 return sum; 2470 } 2471 2472 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2473 { 2474 uint8_t *n = vn, *m = vm; 2475 2476 for (intptr_t k = 0; k < 8; ++k) { 2477 sum += n[H1(k)] * m[H1(k)]; 2478 } 2479 return sum; 2480 } 2481 2482 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2483 { 2484 uint8_t *n = vn; 2485 
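    /*
     * USMMLA mixes signedness: the bytes taken from vn are unsigned,
     * while the bytes taken from vm are signed.
     */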
int8_t *m = vm; 2486 2487 for (intptr_t k = 0; k < 8; ++k) { 2488 sum += n[H1(k)] * m[H1(k)]; 2489 } 2490 return sum; 2491 } 2492 2493 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2494 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2495 { 2496 intptr_t seg, opr_sz = simd_oprsz(desc); 2497 2498 for (seg = 0; seg < opr_sz; seg += 16) { 2499 uint32_t *d = vd + seg; 2500 uint32_t *a = va + seg; 2501 uint32_t sum0, sum1, sum2, sum3; 2502 2503 /* 2504 * Process the entire segment at once, writing back the 2505 * results only after we've consumed all of the inputs. 2506 * 2507 * Key to indices by column: 2508 * i j i j 2509 */ 2510 sum0 = a[H4(0 + 0)]; 2511 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2512 sum1 = a[H4(0 + 1)]; 2513 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2514 sum2 = a[H4(2 + 0)]; 2515 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2516 sum3 = a[H4(2 + 1)]; 2517 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2518 2519 d[H4(0)] = sum0; 2520 d[H4(1)] = sum1; 2521 d[H4(2)] = sum2; 2522 d[H4(3)] = sum3; 2523 } 2524 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2525 } 2526 2527 #define DO_MMLA_B(NAME, INNER) \ 2528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2529 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2530 2531 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2532 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2533 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2534 2535 /* 2536 * BFloat16 Dot Product 2537 */ 2538 2539 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2540 { 2541 /* FPCR is ignored for BFDOT and BFMMLA. */ 2542 float_status bf_status = { 2543 .tininess_before_rounding = float_tininess_before_rounding, 2544 .float_rounding_mode = float_round_to_odd_inf, 2545 .flush_to_zero = true, 2546 .flush_inputs_to_zero = true, 2547 .default_nan_mode = true, 2548 }; 2549 float32 t1, t2; 2550 2551 /* 2552 * Extract each BFloat16 from the element pair, and shift 2553 * them such that they become float32. 
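 *
 * A bfloat16 value is simply the top 16 bits of the corresponding float32
 * bit pattern, so the "conversion" is a 16-bit shift of the raw bits.
 * For the pair packed in e1 (sketch of what the code below does):
 *
 *     float32 a = (uint32_t)e1 << 16;    // from the low 16 bits of e1
 *     float32 b = e1 & 0xffff0000u;      // from the high 16 bits of e1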
2554 */ 2555 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2556 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2557 t1 = float32_add(t1, t2, &bf_status); 2558 t1 = float32_add(sum, t1, &bf_status); 2559 2560 return t1; 2561 } 2562 2563 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2564 { 2565 intptr_t i, opr_sz = simd_oprsz(desc); 2566 float32 *d = vd, *a = va; 2567 uint32_t *n = vn, *m = vm; 2568 2569 for (i = 0; i < opr_sz / 4; ++i) { 2570 d[i] = bfdotadd(a[i], n[i], m[i]); 2571 } 2572 clear_tail(d, opr_sz, simd_maxsz(desc)); 2573 } 2574 2575 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2576 void *va, uint32_t desc) 2577 { 2578 intptr_t i, j, opr_sz = simd_oprsz(desc); 2579 intptr_t index = simd_data(desc); 2580 intptr_t elements = opr_sz / 4; 2581 intptr_t eltspersegment = MIN(16 / 4, elements); 2582 float32 *d = vd, *a = va; 2583 uint32_t *n = vn, *m = vm; 2584 2585 for (i = 0; i < elements; i += eltspersegment) { 2586 uint32_t m_idx = m[i + H4(index)]; 2587 2588 for (j = i; j < i + eltspersegment; j++) { 2589 d[j] = bfdotadd(a[j], n[j], m_idx); 2590 } 2591 } 2592 clear_tail(d, opr_sz, simd_maxsz(desc)); 2593 } 2594 2595 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2596 { 2597 intptr_t s, opr_sz = simd_oprsz(desc); 2598 float32 *d = vd, *a = va; 2599 uint32_t *n = vn, *m = vm; 2600 2601 for (s = 0; s < opr_sz / 4; s += 4) { 2602 float32 sum00, sum01, sum10, sum11; 2603 2604 /* 2605 * Process the entire segment at once, writing back the 2606 * results only after we've consumed all of the inputs. 2607 * 2608 * Key to indices by column: 2609 * i j i k j k 2610 */ 2611 sum00 = a[s + H4(0 + 0)]; 2612 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2613 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2614 2615 sum01 = a[s + H4(0 + 1)]; 2616 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2617 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2618 2619 sum10 = a[s + H4(2 + 0)]; 2620 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2621 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2622 2623 sum11 = a[s + H4(2 + 1)]; 2624 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2625 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2626 2627 d[s + H4(0 + 0)] = sum00; 2628 d[s + H4(0 + 1)] = sum01; 2629 d[s + H4(2 + 0)] = sum10; 2630 d[s + H4(2 + 1)] = sum11; 2631 } 2632 clear_tail(d, opr_sz, simd_maxsz(desc)); 2633 } 2634 2635 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2636 void *stat, uint32_t desc) 2637 { 2638 intptr_t i, opr_sz = simd_oprsz(desc); 2639 intptr_t sel = simd_data(desc); 2640 float32 *d = vd, *a = va; 2641 bfloat16 *n = vn, *m = vm; 2642 2643 for (i = 0; i < opr_sz / 4; ++i) { 2644 float32 nn = n[H2(i * 2 + sel)] << 16; 2645 float32 mm = m[H2(i * 2 + sel)] << 16; 2646 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2647 } 2648 clear_tail(d, opr_sz, simd_maxsz(desc)); 2649 } 2650 2651 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2652 void *va, void *stat, uint32_t desc) 2653 { 2654 intptr_t i, j, opr_sz = simd_oprsz(desc); 2655 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2656 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2657 intptr_t elements = opr_sz / 4; 2658 intptr_t eltspersegment = MIN(16 / 4, elements); 2659 float32 *d = vd, *a = va; 2660 bfloat16 *n = vn, *m = vm; 2661 2662 for (i = 0; i < elements; i += 
eltspersegment) { 2663 float32 m_idx = m[H2(2 * i + index)] << 16; 2664 2665 for (j = i; j < i + eltspersegment; j++) { 2666 float32 n_j = n[H2(2 * j + sel)] << 16; 2667 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2668 } 2669 } 2670 clear_tail(d, opr_sz, simd_maxsz(desc)); 2671 } 2672 2673 #define DO_CLAMP(NAME, TYPE) \ 2674 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2675 { \ 2676 intptr_t i, opr_sz = simd_oprsz(desc); \ 2677 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2678 TYPE aa = *(TYPE *)(a + i); \ 2679 TYPE nn = *(TYPE *)(n + i); \ 2680 TYPE mm = *(TYPE *)(m + i); \ 2681 TYPE dd = MIN(MAX(aa, nn), mm); \ 2682 *(TYPE *)(d + i) = dd; \ 2683 } \ 2684 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2685 } 2686 2687 DO_CLAMP(gvec_sclamp_b, int8_t) 2688 DO_CLAMP(gvec_sclamp_h, int16_t) 2689 DO_CLAMP(gvec_sclamp_s, int32_t) 2690 DO_CLAMP(gvec_sclamp_d, int64_t) 2691 2692 DO_CLAMP(gvec_uclamp_b, uint8_t) 2693 DO_CLAMP(gvec_uclamp_h, uint16_t) 2694 DO_CLAMP(gvec_uclamp_s, uint32_t) 2695 DO_CLAMP(gvec_uclamp_d, uint64_t) 2696
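
/*
 * Illustrative note (not part of the helpers above): the per-element
 * operation expanded by DO_CLAMP is a plain two-sided clamp, with the
 * value taken from 'a', the lower bound from 'n' and the upper bound
 * from 'm'.  A scalar sketch for int32_t elements, using a hypothetical
 * helper name:
 *
 *     static inline int32_t sclamp_s(int32_t aa, int32_t nn, int32_t mm)
 *     {
 *         int32_t lo = aa > nn ? aa : nn;    // MAX(aa, nn)
 *         return lo < mm ? lo : mm;          // MIN(lo, mm)
 *     }
 */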