/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
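
/*
 * Worked example for the helpers above (illustrative, with rounding
 * enabled and no accumulation): src1 = src2 = 0x80 (-128) gives
 * ret = 16384, adding the rounding constant (1 << 6) gives 16448,
 * and 16448 >> 7 = 128.  128 does not fit in int8_t, so the result
 * saturates to INT8_MAX; this is the -128 * -128 corner case that
 * would otherwise wrap around to -128.
 */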

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}
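
/*
 * Illustrative note: in the indexed forms below, as with the FP indexed
 * ops later in this file, the index selects a single element of vm
 * within each 128-bit segment, and that element is multiplied with
 * every element of the matching segment of vn.  For 16-bit elements
 * there are 16 / 2 = 8 elements per segment, hence the i += 16 / 2
 * outer loop and the 16 / 2 inner loop.
 */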

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}
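
/*
 * Illustrative note: a 128-bit value fits in int64_t exactly when its
 * high half is the sign extension of its low half, hence the single
 * hs != (ls >> 63) test above.  For example, hs:ls = 0:0x8000000000000000
 * represents 2**63, which does not fit and saturates to INT64_MAX.
 */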

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i = 0, opr_sz = simd_oprsz(desc);                            \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED);                           \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n);                  \
    intptr_t index = simd_data(desc);                                     \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4;                       \
    do {                                                                  \
        TYPED m0 = m_indexed[i * 4 + 0];                                  \
        TYPED m1 = m_indexed[i * 4 + 1];                                  \
        TYPED m2 = m_indexed[i * 4 + 2];                                  \
        TYPED m3 = m_indexed[i * 4 + 3];                                  \
        do {                                                              \
            d[i] = (a[i] +                                                \
                    n[i * 4 + 0] * m0 +                                   \
                    n[i * 4 + 1] * m1 +                                   \
                    n[i * 4 + 2] * m2 +                                   \
                    n[i * 4 + 3] * m3);                                   \
        } while (++i < segend);                                           \
        segend = i + 4;                                                   \
    } while (i < opr_sz_n);                                               \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
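
/*
 * Illustrative note on the FCADD helpers: elements are processed as
 * (real, imag) pairs.  With the single data bit clear, neg_real = 0 and
 * neg_imag = 1, giving d = (n.re - m.im, n.im + m.re), i.e. a rotation
 * by 90 degrees; with the bit set, neg_real = 1 and neg_imag = 0,
 * giving d = (n.re + m.im, n.im - m.re), i.e. a rotation by 270 degrees.
 * The same pattern is used by the f32 and f64 versions below.
 */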

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = 16 / sizeof(float16);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = 16 / sizeof(float32);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
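
/*
 * Illustrative note on the FCMLA helpers above: each instruction
 * performs half of a complex multiply-accumulate, selected by the
 * rotation.  'flip' chooses whether the common multiplicand is n.re
 * (rotations 0/180) or n.im (rotations 90/270), while neg_real and
 * neg_imag flip the sign of the m element feeding the real and
 * imaginary accumulations respectively.  For example, with flip = 1,
 * neg_real = 1, neg_imag = 0 (rotation 90) the result is
 * d.re = a.re - n.im * m.im and d.im = a.im + n.im * m.re.
 */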

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}
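
/*
 * Illustrative note: the explicit NaN checks above exist because the
 * Arm FPToFixed pseudocode returns 0 (while raising Invalid Operation)
 * for NaN inputs, whereas the bare softfloat conversions would return
 * a saturated integer instead.
 */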

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)  \
{                                                                 \
    intptr_t i, oprsz = simd_oprsz(desc);                         \
    TYPE *d = vd, *n = vn;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                  \
        d[i] = FUNC(n[i], stat);                                  \
    }                                                             \
    clear_tail(d, oprsz, simd_maxsz(desc));                       \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
{                                                        \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat);        \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat)  \
{                                                        \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat);        \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16)                 \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32)                 \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16)    \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(n[i], m[i], stat);                                     \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
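
/*
 * Illustrative note: the AArch32 Neon VMLA/VMLS instructions round the
 * multiply and the accumulate as two separate operations, so they must
 * use the non-fused helpers above; VFMA/VFMS (and the equivalent A64
 * ops) round only once and use the fused float*_muladd versions below.
 */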

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                           \
        d[i] = FUNC(d[i], n[i], m[i], stat);                               \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
{                                                                \
    intptr_t i, j, oprsz = simd_oprsz(desc);                     \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);            \
    intptr_t idx = simd_data(desc);                              \
    TYPE *d = vd, *n = vn, *m = vm;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {        \
        TYPE mm = m[H(i + idx)];                                 \
        for (j = 0; j < segment; j++) {                          \
            d[i + j] = n[i + j] * mm;                            \
        }                                                        \
    }                                                            \
    clear_tail(d, oprsz, simd_maxsz(desc));                      \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)   \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = a[i + j] OP n[i + j] * mm;                          \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX
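
/*
 * Worked example of the per-segment indexing (illustrative): for 32-bit
 * elements, segment = 16 / 4 = 4, so with a 256-bit SVE vector and
 * idx = 1, elements 0..3 of vn are multiplied by m[1] and elements 4..7
 * by m[5], i.e. element 1 of the second 128-bit segment of vm.
 */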

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    intptr_t idx = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn, *m = vm;                                        \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);       \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,                  \
                  void *stat, uint32_t desc)                               \
{                                                                          \
    intptr_t i, j, oprsz = simd_oprsz(desc);                               \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                      \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1);                    \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1);                          \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                               \
    op1_neg <<= (8 * sizeof(TYPE) - 1);                                    \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {                  \
        TYPE mm = m[H(i + idx)];                                           \
        for (j = 0; j < segment; j++) {                                    \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg,                   \
                                     mm, a[i + j], 0, stat);               \
        }                                                                  \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc)   \
{                                                                          \
    intptr_t i, oprsz = simd_oprsz(desc);                                  \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                                 \
    bool q = false;                                                        \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                          \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                    \
        if (dd < MIN) {                                                    \
            dd = MIN;                                                      \
            q = true;                                                      \
        } else if (dd > MAX) {                                             \
            dd = MAX;                                                      \
            q = true;                                                      \
        }                                                                  \
        d[i] = dd;                                                         \
    }                                                                      \
    if (q) {                                                               \
        uint32_t *qc = vq;                                                 \
        qc[0] = 1;                                                         \
    }                                                                      \
    clear_tail(d, oprsz, simd_maxsz(desc));                                \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT
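
/*
 * Illustrative note on DO_SAT above: WTYPE is wide enough to hold the
 * unclamped sum or difference (int for the 8- and 16-bit cases, int64_t
 * for 32-bit), so saturation is a simple range check before narrowing
 * back to TYPEN; any clamp sets the sticky QC flag via qc[0].
 */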

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i];
        int64_t mm = m[i];
        uint64_t dd = nn + mm;

        if (mm < 0) {
            if (nn < (uint64_t)-mm) {
                dd = 0;
                q = true;
            }
        } else {
            if (dd < nn) {
                dd = UINT64_MAX;
                q = true;
            }
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
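
/*
 * Illustrative note on gvec_sqadd_d/gvec_sqsub_d above: signed overflow
 * on addition happens iff the operands have the same sign and the sum's
 * sign differs, which is what ((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN
 * tests (the subtraction variant instead requires the operand signs to
 * differ).  The saturated value (nn >> 63) ^ ~INT64_MIN is INT64_MAX
 * when nn is non-negative and INT64_MIN when nn is negative.
 */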

void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i];
        uint64_t mm = m[i];
        int64_t dd = nn + mm;

        if (mm > (uint64_t)(INT64_MAX - nn)) {
            dd = INT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

#define DO_SRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)      \
{                                                         \
    intptr_t i, oprsz = simd_oprsz(desc);                 \
    int shift = simd_data(desc);                          \
    TYPE *d = vd, *n = vn;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {          \
        d[i] += n[i] >> shift;                            \
    }                                                     \
    clear_tail(d, oprsz, simd_maxsz(desc));               \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)      \
{                                                         \
    intptr_t i, oprsz = simd_oprsz(desc);                 \
    int shift = simd_data(desc);                          \
    TYPE *d = vd, *n = vn;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {          \
        TYPE tmp = n[i] >> (shift - 1);                   \
        d[i] = (tmp >> 1) + (tmp & 1);                    \
    }                                                     \
    clear_tail(d, oprsz, simd_maxsz(desc));               \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)      \
{                                                         \
    intptr_t i, oprsz = simd_oprsz(desc);                 \
    int shift = simd_data(desc);                          \
    TYPE *d = vd, *n = vn;                                \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {          \
        TYPE tmp = n[i] >> (shift - 1);                   \
        d[i] += (tmp >> 1) + (tmp & 1);                   \
    }                                                     \
    clear_tail(d, oprsz, simd_maxsz(desc));               \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                        \
{                                                                           \
    intptr_t i, oprsz = simd_oprsz(desc);                                   \
    int shift = simd_data(desc);                                            \
    TYPE *d = vd, *n = vn;                                                  \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                            \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                       \
    clear_tail(d, oprsz, simd_maxsz(desc));                                 \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
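
/*
 * Worked example for DO_RSHR/DO_RSRA above (illustrative): the rounded
 * shift is computed as tmp = n >> (shift - 1) followed by
 * (tmp >> 1) + (tmp & 1), so the last bit shifted out is added back.
 * With n = 7 and shift = 2, tmp = 3 and the result is 1 + 1 = 2,
 * i.e. 7/4 rounded to nearest.  Splitting the shift in two also keeps
 * the shift count in range when shift equals the element width
 * (e.g. a 64-bit shift by 64).
 */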
no exceptions and 1827 * preserving exceptional values, including SNaN. 1828 * This is effectively an unpack+repack operation. 1829 */ 1830 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1831 { 1832 const int f16_bias = 15; 1833 const int f32_bias = 127; 1834 uint32_t sign = extract32(f16, 15, 1); 1835 uint32_t exp = extract32(f16, 10, 5); 1836 uint32_t frac = extract32(f16, 0, 10); 1837 1838 if (exp == 0x1f) { 1839 /* Inf or NaN */ 1840 exp = 0xff; 1841 } else if (exp == 0) { 1842 /* Zero or denormal. */ 1843 if (frac != 0) { 1844 if (fz16) { 1845 frac = 0; 1846 } else { 1847 /* 1848 * Denormal; these are all normal float32. 1849 * Shift the fraction so that the msb is at bit 11, 1850 * then remove bit 11 as the implicit bit of the 1851 * normalized float32. Note that we still go through 1852 * the shift for normal numbers below, to put the 1853 * float32 fraction at the right place. 1854 */ 1855 int shift = clz32(frac) - 21; 1856 frac = (frac << shift) & 0x3ff; 1857 exp = f32_bias - f16_bias - shift + 1; 1858 } 1859 } 1860 } else { 1861 /* Normal number; adjust the bias. */ 1862 exp += f32_bias - f16_bias; 1863 } 1864 sign <<= 31; 1865 exp <<= 23; 1866 frac <<= 23 - 10; 1867 1868 return sign | exp | frac; 1869 } 1870 1871 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 1872 { 1873 /* 1874 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 1875 * Load the 2nd qword iff is_q & is_2. 1876 * Shift to the 2nd dword iff !is_q & is_2. 1877 * For !is_q & !is_2, the upper bits of the result are garbage. 1878 */ 1879 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 1880 } 1881 1882 /* 1883 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 1884 * as there are not yet SVE versions that might use blocking. 1885 */ 1886 1887 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 1888 uint32_t desc, bool fz16) 1889 { 1890 intptr_t i, oprsz = simd_oprsz(desc); 1891 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1892 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1893 int is_q = oprsz == 16; 1894 uint64_t n_4, m_4; 1895 1896 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1897 n_4 = load4_f16(vn, is_q, is_2); 1898 m_4 = load4_f16(vm, is_q, is_2); 1899 1900 /* Negate all inputs for FMLSL at once. 
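 *
 * A rough illustration (example values, not taken from any test):
 * 0x3c00 (+1.0) ^ 0x8000 == 0xbc00 (-1.0) and
 * 0x4400 (+4.0) ^ 0x8000 == 0xc400 (-4.0); toggling bit 15 of a
 * float16 lane negates that lane, so the single 64-bit XOR below
 * negates all four input lanes at once, NaNs and infinities included.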
*/ 1901 if (is_s) { 1902 n_4 ^= 0x8000800080008000ull; 1903 } 1904 1905 for (i = 0; i < oprsz / 4; i++) { 1906 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1907 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1908 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1909 } 1910 clear_tail(d, oprsz, simd_maxsz(desc)); 1911 } 1912 1913 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1914 void *venv, uint32_t desc) 1915 { 1916 CPUARMState *env = venv; 1917 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1918 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1919 } 1920 1921 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1922 void *venv, uint32_t desc) 1923 { 1924 CPUARMState *env = venv; 1925 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1926 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1927 } 1928 1929 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 1930 void *venv, uint32_t desc) 1931 { 1932 intptr_t i, oprsz = simd_oprsz(desc); 1933 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1934 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1935 CPUARMState *env = venv; 1936 float_status *status = &env->vfp.fp_status; 1937 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1938 1939 for (i = 0; i < oprsz; i += sizeof(float32)) { 1940 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 1941 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 1942 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1943 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1944 float32 aa = *(float32 *)(va + H1_4(i)); 1945 1946 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 1947 } 1948 } 1949 1950 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 1951 uint32_t desc, bool fz16) 1952 { 1953 intptr_t i, oprsz = simd_oprsz(desc); 1954 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1955 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1956 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 1957 int is_q = oprsz == 16; 1958 uint64_t n_4; 1959 float32 m_1; 1960 1961 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1962 n_4 = load4_f16(vn, is_q, is_2); 1963 1964 /* Negate all inputs for FMLSL at once. 
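 *
 * (Same sign-bit trick as in do_fmlal above: one 64-bit XOR negates
 * all four float16 lanes of n_4; the indexed form then pairs every
 * lane with the single broadcast element m_1 below.)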
*/ 1965 if (is_s) { 1966 n_4 ^= 0x8000800080008000ull; 1967 } 1968 1969 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 1970 1971 for (i = 0; i < oprsz / 4; i++) { 1972 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1973 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1974 } 1975 clear_tail(d, oprsz, simd_maxsz(desc)); 1976 } 1977 1978 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 1979 void *venv, uint32_t desc) 1980 { 1981 CPUARMState *env = venv; 1982 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1983 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1984 } 1985 1986 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 1987 void *venv, uint32_t desc) 1988 { 1989 CPUARMState *env = venv; 1990 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 1991 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1992 } 1993 1994 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 1995 void *venv, uint32_t desc) 1996 { 1997 intptr_t i, j, oprsz = simd_oprsz(desc); 1998 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1999 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2000 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2001 CPUARMState *env = venv; 2002 float_status *status = &env->vfp.fp_status; 2003 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2004 2005 for (i = 0; i < oprsz; i += 16) { 2006 float16 mm_16 = *(float16 *)(vm + i + idx); 2007 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2008 2009 for (j = 0; j < 16; j += sizeof(float32)) { 2010 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2011 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2012 float32 aa = *(float32 *)(va + H1_4(i + j)); 2013 2014 *(float32 *)(vd + H1_4(i + j)) = 2015 float32_muladd(nn, mm, aa, 0, status); 2016 } 2017 } 2018 } 2019 2020 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2021 { 2022 intptr_t i, opr_sz = simd_oprsz(desc); 2023 int8_t *d = vd, *n = vn, *m = vm; 2024 2025 for (i = 0; i < opr_sz; ++i) { 2026 int8_t mm = m[i]; 2027 int8_t nn = n[i]; 2028 int8_t res = 0; 2029 if (mm >= 0) { 2030 if (mm < 8) { 2031 res = nn << mm; 2032 } 2033 } else { 2034 res = nn >> (mm > -8 ? -mm : 7); 2035 } 2036 d[i] = res; 2037 } 2038 clear_tail(d, opr_sz, simd_maxsz(desc)); 2039 } 2040 2041 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2042 { 2043 intptr_t i, opr_sz = simd_oprsz(desc); 2044 int16_t *d = vd, *n = vn, *m = vm; 2045 2046 for (i = 0; i < opr_sz / 2; ++i) { 2047 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2048 int16_t nn = n[i]; 2049 int16_t res = 0; 2050 if (mm >= 0) { 2051 if (mm < 16) { 2052 res = nn << mm; 2053 } 2054 } else { 2055 res = nn >> (mm > -16 ? 
-mm : 15); 2056 } 2057 d[i] = res; 2058 } 2059 clear_tail(d, opr_sz, simd_maxsz(desc)); 2060 } 2061 2062 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2063 { 2064 intptr_t i, opr_sz = simd_oprsz(desc); 2065 uint8_t *d = vd, *n = vn, *m = vm; 2066 2067 for (i = 0; i < opr_sz; ++i) { 2068 int8_t mm = m[i]; 2069 uint8_t nn = n[i]; 2070 uint8_t res = 0; 2071 if (mm >= 0) { 2072 if (mm < 8) { 2073 res = nn << mm; 2074 } 2075 } else { 2076 if (mm > -8) { 2077 res = nn >> -mm; 2078 } 2079 } 2080 d[i] = res; 2081 } 2082 clear_tail(d, opr_sz, simd_maxsz(desc)); 2083 } 2084 2085 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2086 { 2087 intptr_t i, opr_sz = simd_oprsz(desc); 2088 uint16_t *d = vd, *n = vn, *m = vm; 2089 2090 for (i = 0; i < opr_sz / 2; ++i) { 2091 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2092 uint16_t nn = n[i]; 2093 uint16_t res = 0; 2094 if (mm >= 0) { 2095 if (mm < 16) { 2096 res = nn << mm; 2097 } 2098 } else { 2099 if (mm > -16) { 2100 res = nn >> -mm; 2101 } 2102 } 2103 d[i] = res; 2104 } 2105 clear_tail(d, opr_sz, simd_maxsz(desc)); 2106 } 2107 2108 /* 2109 * 8x8->8 polynomial multiply. 2110 * 2111 * Polynomial multiplication is like integer multiplication except the 2112 * partial products are XORed, not added. 2113 * 2114 * TODO: expose this as a generic vector operation, as it is a common 2115 * crypto building block. 2116 */ 2117 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2118 { 2119 intptr_t i, opr_sz = simd_oprsz(desc); 2120 uint64_t *d = vd, *n = vn, *m = vm; 2121 2122 for (i = 0; i < opr_sz / 8; ++i) { 2123 d[i] = clmul_8x8_low(n[i], m[i]); 2124 } 2125 clear_tail(d, opr_sz, simd_maxsz(desc)); 2126 } 2127 2128 /* 2129 * 64x64->128 polynomial multiply. 2130 * Because the lanes are not accessed in strict columns, 2131 * this probably cannot be turned into a generic helper. 
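 *
 * As a small illustration of carryless multiplication itself,
 * clmul(0b11, 0b11) = 0b101 (x+1 squared is x^2+1), whereas the
 * integer product would be 0b1001: partial products combine with
 * XOR, so no carries propagate. The 'hi' selector below also means
 * each 128-bit result is produced from one chosen 64-bit input lane
 * per pair, rather than column by column.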
2132 */ 2133 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2134 { 2135 intptr_t i, opr_sz = simd_oprsz(desc); 2136 intptr_t hi = simd_data(desc); 2137 uint64_t *d = vd, *n = vn, *m = vm; 2138 2139 for (i = 0; i < opr_sz / 8; i += 2) { 2140 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2141 d[i] = int128_getlo(r); 2142 d[i + 1] = int128_gethi(r); 2143 } 2144 clear_tail(d, opr_sz, simd_maxsz(desc)); 2145 } 2146 2147 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2148 { 2149 int hi = simd_data(desc); 2150 uint64_t *d = vd, *n = vn, *m = vm; 2151 uint64_t nn = n[hi], mm = m[hi]; 2152 2153 d[0] = clmul_8x4_packed(nn, mm); 2154 nn >>= 32; 2155 mm >>= 32; 2156 d[1] = clmul_8x4_packed(nn, mm); 2157 2158 clear_tail(d, 16, simd_maxsz(desc)); 2159 } 2160 2161 #ifdef TARGET_AARCH64 2162 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2163 { 2164 int shift = simd_data(desc) * 8; 2165 intptr_t i, opr_sz = simd_oprsz(desc); 2166 uint64_t *d = vd, *n = vn, *m = vm; 2167 2168 for (i = 0; i < opr_sz / 8; ++i) { 2169 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2170 } 2171 } 2172 2173 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2174 { 2175 intptr_t sel = H4(simd_data(desc)); 2176 intptr_t i, opr_sz = simd_oprsz(desc); 2177 uint32_t *n = vn, *m = vm; 2178 uint64_t *d = vd; 2179 2180 for (i = 0; i < opr_sz / 8; ++i) { 2181 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2182 } 2183 } 2184 #endif 2185 2186 #define DO_CMP0(NAME, TYPE, OP) \ 2187 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2188 { \ 2189 intptr_t i, opr_sz = simd_oprsz(desc); \ 2190 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2191 TYPE nn = *(TYPE *)(vn + i); \ 2192 *(TYPE *)(vd + i) = -(nn OP 0); \ 2193 } \ 2194 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2195 } 2196 2197 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2198 DO_CMP0(gvec_clt0_b, int8_t, <) 2199 DO_CMP0(gvec_cle0_b, int8_t, <=) 2200 DO_CMP0(gvec_cgt0_b, int8_t, >) 2201 DO_CMP0(gvec_cge0_b, int8_t, >=) 2202 2203 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2204 DO_CMP0(gvec_clt0_h, int16_t, <) 2205 DO_CMP0(gvec_cle0_h, int16_t, <=) 2206 DO_CMP0(gvec_cgt0_h, int16_t, >) 2207 DO_CMP0(gvec_cge0_h, int16_t, >=) 2208 2209 #undef DO_CMP0 2210 2211 #define DO_ABD(NAME, TYPE) \ 2212 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2213 { \ 2214 intptr_t i, opr_sz = simd_oprsz(desc); \ 2215 TYPE *d = vd, *n = vn, *m = vm; \ 2216 \ 2217 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2218 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2219 } \ 2220 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2221 } 2222 2223 DO_ABD(gvec_sabd_b, int8_t) 2224 DO_ABD(gvec_sabd_h, int16_t) 2225 DO_ABD(gvec_sabd_s, int32_t) 2226 DO_ABD(gvec_sabd_d, int64_t) 2227 2228 DO_ABD(gvec_uabd_b, uint8_t) 2229 DO_ABD(gvec_uabd_h, uint16_t) 2230 DO_ABD(gvec_uabd_s, uint32_t) 2231 DO_ABD(gvec_uabd_d, uint64_t) 2232 2233 #undef DO_ABD 2234 2235 #define DO_ABA(NAME, TYPE) \ 2236 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2237 { \ 2238 intptr_t i, opr_sz = simd_oprsz(desc); \ 2239 TYPE *d = vd, *n = vn, *m = vm; \ 2240 \ 2241 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2242 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2243 } \ 2244 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2245 } 2246 2247 DO_ABA(gvec_saba_b, int8_t) 2248 DO_ABA(gvec_saba_h, int16_t) 2249 DO_ABA(gvec_saba_s, int32_t) 2250 DO_ABA(gvec_saba_d, int64_t) 2251 2252 DO_ABA(gvec_uaba_b, uint8_t) 2253 DO_ABA(gvec_uaba_h, uint16_t) 2254 DO_ABA(gvec_uaba_s, uint32_t) 2255 DO_ABA(gvec_uaba_d, uint64_t) 2256 2257 #undef DO_ABA 2258 2259 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2260 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2261 { \ 2262 ARMVectorReg scratch; \ 2263 intptr_t oprsz = simd_oprsz(desc); \ 2264 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2265 TYPE *d = vd, *n = vn, *m = vm; \ 2266 if (unlikely(d == m)) { \ 2267 m = memcpy(&scratch, m, oprsz); \ 2268 } \ 2269 for (intptr_t i = 0; i < half; ++i) { \ 2270 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2271 } \ 2272 for (intptr_t i = 0; i < half; ++i) { \ 2273 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2274 } \ 2275 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2276 } 2277 2278 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2279 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2280 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2281 2282 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2283 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2284 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2285 2286 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2287 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2288 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2289 2290 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2291 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2292 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2293 2294 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2295 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2296 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2297 2298 #undef DO_3OP_PAIR 2299 2300 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2301 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2302 { \ 2303 ARMVectorReg scratch; \ 2304 intptr_t oprsz = simd_oprsz(desc); \ 2305 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2306 TYPE *d = vd, *n = vn, *m = vm; \ 2307 if (unlikely(d == m)) { \ 2308 m = memcpy(&scratch, m, oprsz); \ 2309 } \ 2310 for (intptr_t i = 0; i < half; ++i) { \ 2311 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2312 } \ 2313 for (intptr_t i = 0; i < half; ++i) { \ 2314 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2315 } \ 2316 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2317 } 2318 2319 #define ADD(A, B) (A + B) 2320 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2321 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2322 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2323 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2324 #undef ADD 2325 2326 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2327 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2328 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2329 2330 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2331 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2332 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2333 2334 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2335 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2336 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2337 2338 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2339 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2340 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2341 2342 #undef DO_3OP_PAIR 2343 2344 #define 
DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2345 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2346 { \ 2347 intptr_t i, oprsz = simd_oprsz(desc); \ 2348 int shift = simd_data(desc); \ 2349 TYPE *d = vd, *n = vn; \ 2350 float_status *fpst = stat; \ 2351 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2352 d[i] = FUNC(n[i], shift, fpst); \ 2353 } \ 2354 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2355 } 2356 2357 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2358 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2359 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2360 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2361 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2362 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2363 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2364 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2365 2366 #undef DO_VCVT_FIXED 2367 2368 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2369 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2370 { \ 2371 float_status *fpst = stat; \ 2372 intptr_t i, oprsz = simd_oprsz(desc); \ 2373 uint32_t rmode = simd_data(desc); \ 2374 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2375 TYPE *d = vd, *n = vn; \ 2376 set_float_rounding_mode(rmode, fpst); \ 2377 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2378 d[i] = FUNC(n[i], 0, fpst); \ 2379 } \ 2380 set_float_rounding_mode(prev_rmode, fpst); \ 2381 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2382 } 2383 2384 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2385 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2386 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2387 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2388 2389 #undef DO_VCVT_RMODE 2390 2391 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2392 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2393 { \ 2394 float_status *fpst = stat; \ 2395 intptr_t i, oprsz = simd_oprsz(desc); \ 2396 uint32_t rmode = simd_data(desc); \ 2397 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2398 TYPE *d = vd, *n = vn; \ 2399 set_float_rounding_mode(rmode, fpst); \ 2400 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2401 d[i] = FUNC(n[i], fpst); \ 2402 } \ 2403 set_float_rounding_mode(prev_rmode, fpst); \ 2404 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2405 } 2406 2407 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2408 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2409 2410 #undef DO_VRINT_RMODE 2411 2412 #ifdef TARGET_AARCH64 2413 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2414 { 2415 const uint8_t *indices = vm; 2416 CPUARMState *env = venv; 2417 size_t oprsz = simd_oprsz(desc); 2418 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2419 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2420 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2421 union { 2422 uint8_t b[16]; 2423 uint64_t d[2]; 2424 } result; 2425 2426 /* 2427 * We must construct the final result in a temp, lest the output 2428 * overlaps the input table. For TBL, begin with zero; for TBX, 2429 * begin with the original register contents. Note that we always 2430 * copy 16 bytes here to avoid an extra branch; clearing the high 2431 * bits of the register for oprsz == 8 is handled below. 
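 *
 * For example, with a two-register table (table_len == 32), an
 * index byte of 19 selects byte 3 of register V((rn + 1) % 32),
 * while an index byte of 33 is out of range and so yields 0 for
 * TBL or leaves the destination byte unchanged for TBX.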
2432 */ 2433 if (is_tbx) { 2434 memcpy(&result, vd, 16); 2435 } else { 2436 memset(&result, 0, 16); 2437 } 2438 2439 for (size_t i = 0; i < oprsz; ++i) { 2440 uint32_t index = indices[H1(i)]; 2441 2442 if (index < table_len) { 2443 /* 2444 * Convert index (a byte offset into the virtual table 2445 * which is a series of 128-bit vectors concatenated) 2446 * into the correct register element, bearing in mind 2447 * that the table can wrap around from V31 to V0. 2448 */ 2449 const uint8_t *table = (const uint8_t *) 2450 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2451 result.b[H1(i)] = table[H1(index % 16)]; 2452 } 2453 } 2454 2455 memcpy(vd, &result, 16); 2456 clear_tail(vd, oprsz, simd_maxsz(desc)); 2457 } 2458 #endif 2459 2460 /* 2461 * NxN -> N highpart multiply 2462 * 2463 * TODO: expose this as a generic vector operation. 2464 */ 2465 2466 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2467 { 2468 intptr_t i, opr_sz = simd_oprsz(desc); 2469 int8_t *d = vd, *n = vn, *m = vm; 2470 2471 for (i = 0; i < opr_sz; ++i) { 2472 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2473 } 2474 clear_tail(d, opr_sz, simd_maxsz(desc)); 2475 } 2476 2477 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2478 { 2479 intptr_t i, opr_sz = simd_oprsz(desc); 2480 int16_t *d = vd, *n = vn, *m = vm; 2481 2482 for (i = 0; i < opr_sz / 2; ++i) { 2483 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2484 } 2485 clear_tail(d, opr_sz, simd_maxsz(desc)); 2486 } 2487 2488 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2489 { 2490 intptr_t i, opr_sz = simd_oprsz(desc); 2491 int32_t *d = vd, *n = vn, *m = vm; 2492 2493 for (i = 0; i < opr_sz / 4; ++i) { 2494 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2495 } 2496 clear_tail(d, opr_sz, simd_maxsz(desc)); 2497 } 2498 2499 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2500 { 2501 intptr_t i, opr_sz = simd_oprsz(desc); 2502 uint64_t *d = vd, *n = vn, *m = vm; 2503 uint64_t discard; 2504 2505 for (i = 0; i < opr_sz / 8; ++i) { 2506 muls64(&discard, &d[i], n[i], m[i]); 2507 } 2508 clear_tail(d, opr_sz, simd_maxsz(desc)); 2509 } 2510 2511 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2512 { 2513 intptr_t i, opr_sz = simd_oprsz(desc); 2514 uint8_t *d = vd, *n = vn, *m = vm; 2515 2516 for (i = 0; i < opr_sz; ++i) { 2517 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2518 } 2519 clear_tail(d, opr_sz, simd_maxsz(desc)); 2520 } 2521 2522 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2523 { 2524 intptr_t i, opr_sz = simd_oprsz(desc); 2525 uint16_t *d = vd, *n = vn, *m = vm; 2526 2527 for (i = 0; i < opr_sz / 2; ++i) { 2528 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2529 } 2530 clear_tail(d, opr_sz, simd_maxsz(desc)); 2531 } 2532 2533 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2534 { 2535 intptr_t i, opr_sz = simd_oprsz(desc); 2536 uint32_t *d = vd, *n = vn, *m = vm; 2537 2538 for (i = 0; i < opr_sz / 4; ++i) { 2539 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2540 } 2541 clear_tail(d, opr_sz, simd_maxsz(desc)); 2542 } 2543 2544 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2545 { 2546 intptr_t i, opr_sz = simd_oprsz(desc); 2547 uint64_t *d = vd, *n = vn, *m = vm; 2548 uint64_t discard; 2549 2550 for (i = 0; i < opr_sz / 8; ++i) { 2551 mulu64(&discard, &d[i], n[i], m[i]); 2552 } 2553 clear_tail(d, opr_sz, simd_maxsz(desc)); 2554 } 2555 2556 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2557 { 2558 intptr_t i, opr_sz = 
simd_oprsz(desc) / 8; 2559 int shr = simd_data(desc); 2560 uint64_t *d = vd, *n = vn, *m = vm; 2561 2562 for (i = 0; i < opr_sz; ++i) { 2563 d[i] = ror64(n[i] ^ m[i], shr); 2564 } 2565 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2566 } 2567 2568 /* 2569 * Integer matrix-multiply accumulate 2570 */ 2571 2572 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2573 { 2574 int8_t *n = vn, *m = vm; 2575 2576 for (intptr_t k = 0; k < 8; ++k) { 2577 sum += n[H1(k)] * m[H1(k)]; 2578 } 2579 return sum; 2580 } 2581 2582 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2583 { 2584 uint8_t *n = vn, *m = vm; 2585 2586 for (intptr_t k = 0; k < 8; ++k) { 2587 sum += n[H1(k)] * m[H1(k)]; 2588 } 2589 return sum; 2590 } 2591 2592 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2593 { 2594 uint8_t *n = vn; 2595 int8_t *m = vm; 2596 2597 for (intptr_t k = 0; k < 8; ++k) { 2598 sum += n[H1(k)] * m[H1(k)]; 2599 } 2600 return sum; 2601 } 2602 2603 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2604 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2605 { 2606 intptr_t seg, opr_sz = simd_oprsz(desc); 2607 2608 for (seg = 0; seg < opr_sz; seg += 16) { 2609 uint32_t *d = vd + seg; 2610 uint32_t *a = va + seg; 2611 uint32_t sum0, sum1, sum2, sum3; 2612 2613 /* 2614 * Process the entire segment at once, writing back the 2615 * results only after we've consumed all of the inputs. 2616 * 2617 * Key to indices by column: 2618 * i j i j 2619 */ 2620 sum0 = a[H4(0 + 0)]; 2621 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2622 sum1 = a[H4(0 + 1)]; 2623 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2624 sum2 = a[H4(2 + 0)]; 2625 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2626 sum3 = a[H4(2 + 1)]; 2627 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2628 2629 d[H4(0)] = sum0; 2630 d[H4(1)] = sum1; 2631 d[H4(2)] = sum2; 2632 d[H4(3)] = sum3; 2633 } 2634 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2635 } 2636 2637 #define DO_MMLA_B(NAME, INNER) \ 2638 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2639 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2640 2641 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2642 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2643 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2644 2645 /* 2646 * BFloat16 Dot Product 2647 */ 2648 2649 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2650 { 2651 /* FPCR is ignored for BFDOT and BFMMLA. */ 2652 float_status bf_status = { 2653 .tininess_before_rounding = float_tininess_before_rounding, 2654 .float_rounding_mode = float_round_to_odd_inf, 2655 .flush_to_zero = true, 2656 .flush_inputs_to_zero = true, 2657 .default_nan_mode = true, 2658 }; 2659 float32 t1, t2; 2660 2661 /* 2662 * Extract each BFloat16 from the element pair, and shift 2663 * them such that they become float32. 
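 *
 * For instance, the BFloat16 encoding 0x3f80 (1.0) becomes the
 * float32 encoding 0x3f800000 (1.0f): the low element is shifted
 * left by 16 bits, while the high element already occupies the
 * top half and only needs its low 16 bits masked off.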
2664 */ 2665 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2666 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2667 t1 = float32_add(t1, t2, &bf_status); 2668 t1 = float32_add(sum, t1, &bf_status); 2669 2670 return t1; 2671 } 2672 2673 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2674 { 2675 intptr_t i, opr_sz = simd_oprsz(desc); 2676 float32 *d = vd, *a = va; 2677 uint32_t *n = vn, *m = vm; 2678 2679 for (i = 0; i < opr_sz / 4; ++i) { 2680 d[i] = bfdotadd(a[i], n[i], m[i]); 2681 } 2682 clear_tail(d, opr_sz, simd_maxsz(desc)); 2683 } 2684 2685 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2686 void *va, uint32_t desc) 2687 { 2688 intptr_t i, j, opr_sz = simd_oprsz(desc); 2689 intptr_t index = simd_data(desc); 2690 intptr_t elements = opr_sz / 4; 2691 intptr_t eltspersegment = MIN(16 / 4, elements); 2692 float32 *d = vd, *a = va; 2693 uint32_t *n = vn, *m = vm; 2694 2695 for (i = 0; i < elements; i += eltspersegment) { 2696 uint32_t m_idx = m[i + H4(index)]; 2697 2698 for (j = i; j < i + eltspersegment; j++) { 2699 d[j] = bfdotadd(a[j], n[j], m_idx); 2700 } 2701 } 2702 clear_tail(d, opr_sz, simd_maxsz(desc)); 2703 } 2704 2705 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2706 { 2707 intptr_t s, opr_sz = simd_oprsz(desc); 2708 float32 *d = vd, *a = va; 2709 uint32_t *n = vn, *m = vm; 2710 2711 for (s = 0; s < opr_sz / 4; s += 4) { 2712 float32 sum00, sum01, sum10, sum11; 2713 2714 /* 2715 * Process the entire segment at once, writing back the 2716 * results only after we've consumed all of the inputs. 2717 * 2718 * Key to indices by column: 2719 * i j i k j k 2720 */ 2721 sum00 = a[s + H4(0 + 0)]; 2722 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2723 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2724 2725 sum01 = a[s + H4(0 + 1)]; 2726 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2727 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2728 2729 sum10 = a[s + H4(2 + 0)]; 2730 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2731 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2732 2733 sum11 = a[s + H4(2 + 1)]; 2734 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2735 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2736 2737 d[s + H4(0 + 0)] = sum00; 2738 d[s + H4(0 + 1)] = sum01; 2739 d[s + H4(2 + 0)] = sum10; 2740 d[s + H4(2 + 1)] = sum11; 2741 } 2742 clear_tail(d, opr_sz, simd_maxsz(desc)); 2743 } 2744 2745 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2746 void *stat, uint32_t desc) 2747 { 2748 intptr_t i, opr_sz = simd_oprsz(desc); 2749 intptr_t sel = simd_data(desc); 2750 float32 *d = vd, *a = va; 2751 bfloat16 *n = vn, *m = vm; 2752 2753 for (i = 0; i < opr_sz / 4; ++i) { 2754 float32 nn = n[H2(i * 2 + sel)] << 16; 2755 float32 mm = m[H2(i * 2 + sel)] << 16; 2756 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2757 } 2758 clear_tail(d, opr_sz, simd_maxsz(desc)); 2759 } 2760 2761 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2762 void *va, void *stat, uint32_t desc) 2763 { 2764 intptr_t i, j, opr_sz = simd_oprsz(desc); 2765 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2766 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2767 intptr_t elements = opr_sz / 4; 2768 intptr_t eltspersegment = MIN(16 / 4, elements); 2769 float32 *d = vd, *a = va; 2770 bfloat16 *n = vn, *m = vm; 2771 2772 for (i = 0; i < elements; i += 
eltspersegment) { 2773 float32 m_idx = m[H2(2 * i + index)] << 16; 2774 2775 for (j = i; j < i + eltspersegment; j++) { 2776 float32 n_j = n[H2(2 * j + sel)] << 16; 2777 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2778 } 2779 } 2780 clear_tail(d, opr_sz, simd_maxsz(desc)); 2781 } 2782 2783 #define DO_CLAMP(NAME, TYPE) \ 2784 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2785 { \ 2786 intptr_t i, opr_sz = simd_oprsz(desc); \ 2787 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2788 TYPE aa = *(TYPE *)(a + i); \ 2789 TYPE nn = *(TYPE *)(n + i); \ 2790 TYPE mm = *(TYPE *)(m + i); \ 2791 TYPE dd = MIN(MAX(aa, nn), mm); \ 2792 *(TYPE *)(d + i) = dd; \ 2793 } \ 2794 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2795 } 2796 2797 DO_CLAMP(gvec_sclamp_b, int8_t) 2798 DO_CLAMP(gvec_sclamp_h, int16_t) 2799 DO_CLAMP(gvec_sclamp_s, int32_t) 2800 DO_CLAMP(gvec_sclamp_d, int64_t) 2801 2802 DO_CLAMP(gvec_uclamp_b, uint8_t) 2803 DO_CLAMP(gvec_uclamp_h, uint16_t) 2804 DO_CLAMP(gvec_uclamp_s, uint32_t) 2805 DO_CLAMP(gvec_uclamp_d, uint64_t) 2806
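
/*
 * For reference, a sketch of what one of the DO_CLAMP expansions
 * above looks like after preprocessing (modulo exact formatting);
 * each element of 'a' is clamped to the inclusive range
 * [n[i], m[i]], assuming n[i] <= m[i]:
 *
 * void HELPER(gvec_sclamp_b)(void *d, void *n, void *m, void *a,
 *                            uint32_t desc)
 * {
 *     intptr_t i, opr_sz = simd_oprsz(desc);
 *     for (i = 0; i < opr_sz; i += sizeof(int8_t)) {
 *         int8_t aa = *(int8_t *)(a + i);
 *         int8_t nn = *(int8_t *)(n + i);
 *         int8_t mm = *(int8_t *)(m + i);
 *         *(int8_t *)(d + i) = MIN(MAX(aa, nn), mm);
 *     }
 *     clear_tail(d, opr_sz, simd_maxsz(desc));
 * }
 */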