1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 315 void *va, uint32_t desc) 316 { 317 intptr_t i, opr_sz = simd_oprsz(desc); 318 int16_t *d = vd, *n = vn, *m = vm, *a = va; 319 uint32_t discard; 320 321 for (i = 0; i < opr_sz / 2; ++i) { 322 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 323 } 324 } 325 326 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 327 void *va, uint32_t desc) 328 { 329 intptr_t i, opr_sz = simd_oprsz(desc); 330 int16_t *d = vd, *n = vn, *m = vm, *a = va; 331 uint32_t discard; 332 333 for (i = 0; i < opr_sz / 2; ++i) { 334 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 335 } 336 } 337 338 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 339 { 340 intptr_t i, opr_sz = simd_oprsz(desc); 341 int16_t *d = vd, *n = vn, *m = vm; 342 uint32_t discard; 343 344 for (i = 0; i < opr_sz / 2; ++i) { 345 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 346 } 347 } 348 349 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 350 { 351 intptr_t i, opr_sz = simd_oprsz(desc); 352 int16_t *d = vd, *n = vn, *m = vm; 353 uint32_t discard; 354 355 for (i = 0; i < opr_sz / 2; ++i) { 356 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 357 } 358 } 359 360 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 361 { 362 intptr_t i, j, opr_sz = simd_oprsz(desc); 363 int idx = simd_data(desc); 364 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 365 uint32_t discard; 366 367 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 368 int16_t mm = m[i]; 369 for (j = 0; j < 16 / 2; ++j) { 370 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 371 } 372 } 373 } 374 375 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 376 { 377 intptr_t i, j, opr_sz = simd_oprsz(desc); 378 int idx = simd_data(desc); 379 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 380 uint32_t discard; 381 382 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 383 int16_t mm = m[i]; 384 for (j = 0; j < 16 / 2; ++j) { 385 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 386 } 387 } 388 } 389 390 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 391 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 392 bool neg, bool round, uint32_t *sat) 393 { 394 /* Simplify similarly to do_sqrdmlah_b above. */ 395 int64_t ret = (int64_t)src1 * src2; 396 if (neg) { 397 ret = -ret; 398 } 399 ret += ((int64_t)src3 << 31) + (round << 30); 400 ret >>= 31; 401 402 if (ret != (int32_t)ret) { 403 *sat = 1; 404 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 405 } 406 return ret; 407 } 408 409 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 410 int32_t src2, int32_t src3) 411 { 412 uint32_t *sat = &env->vfp.qc[0]; 413 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 414 } 415 416 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 417 void *vq, uint32_t desc) 418 { 419 uintptr_t opr_sz = simd_oprsz(desc); 420 int32_t *d = vd; 421 int32_t *n = vn; 422 int32_t *m = vm; 423 uintptr_t i; 424 425 for (i = 0; i < opr_sz / 4; ++i) { 426 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 427 } 428 clear_tail(d, opr_sz, simd_maxsz(desc)); 429 } 430 431 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 432 int32_t src2, int32_t src3) 433 { 434 uint32_t *sat = &env->vfp.qc[0]; 435 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 436 } 437 438 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 439 void *vq, uint32_t desc) 440 { 441 uintptr_t opr_sz = simd_oprsz(desc); 442 int32_t *d = vd; 443 int32_t *n = vn; 444 int32_t *m = vm; 445 uintptr_t i; 446 447 for (i = 0; i < opr_sz / 4; ++i) { 448 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 449 } 450 clear_tail(d, opr_sz, simd_maxsz(desc)); 451 } 452 453 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 454 void *vq, uint32_t desc) 455 { 456 intptr_t i, opr_sz = simd_oprsz(desc); 457 int32_t *d = vd, *n = vn, *m = vm; 458 459 for (i = 0; i < opr_sz / 4; ++i) { 460 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 461 } 462 clear_tail(d, opr_sz, simd_maxsz(desc)); 463 } 464 465 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 466 void *vq, uint32_t desc) 467 { 468 intptr_t i, opr_sz = simd_oprsz(desc); 469 int32_t *d = vd, *n = vn, *m = vm; 470 471 for (i = 0; i < opr_sz / 4; ++i) { 472 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 473 } 474 clear_tail(d, opr_sz, simd_maxsz(desc)); 475 } 476 477 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 478 void *va, uint32_t desc) 479 { 480 intptr_t i, opr_sz = simd_oprsz(desc); 481 int32_t *d = vd, *n = vn, *m = vm, *a = va; 482 uint32_t discard; 483 484 for (i = 0; i < opr_sz / 4; ++i) { 485 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 486 } 487 } 488 489 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 490 void *va, uint32_t desc) 491 { 492 intptr_t i, opr_sz = simd_oprsz(desc); 493 int32_t *d = vd, *n = vn, *m = vm, *a = va; 494 uint32_t discard; 495 496 for (i = 0; i < opr_sz / 4; ++i) { 497 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 498 } 499 } 500 501 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 502 { 503 intptr_t i, opr_sz = simd_oprsz(desc); 504 int32_t *d = vd, *n = vn, *m = vm; 505 uint32_t discard; 506 507 for (i = 0; i < opr_sz / 4; ++i) { 508 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 509 } 510 } 511 512 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 513 { 514 intptr_t i, opr_sz = simd_oprsz(desc); 515 int32_t *d = vd, *n = vn, *m = vm; 516 uint32_t discard; 517 518 for (i = 0; i < opr_sz / 4; ++i) { 519 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 520 } 521 } 522 523 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 524 { 525 intptr_t i, j, opr_sz = simd_oprsz(desc); 526 int idx = simd_data(desc); 527 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 528 uint32_t discard; 529 530 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 531 int32_t mm = m[i]; 532 for (j = 0; j < 16 / 4; ++j) { 533 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 534 } 535 } 536 } 537 538 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 539 { 540 intptr_t i, j, opr_sz = simd_oprsz(desc); 541 int idx = simd_data(desc); 542 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 543 uint32_t discard; 544 545 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 546 int32_t mm = m[i]; 547 for (j = 0; j < 16 / 4; ++j) { 548 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 549 } 550 } 551 } 552 553 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 554 static int64_t do_sat128_d(Int128 r) 555 { 556 int64_t ls = int128_getlo(r); 557 int64_t hs = int128_gethi(r); 558 559 if (unlikely(hs != (ls >> 63))) { 560 return hs < 0 ? INT64_MIN : INT64_MAX; 561 } 562 return ls; 563 } 564 565 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 566 { 567 uint64_t l, h; 568 Int128 r, t; 569 570 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 571 muls64(&l, &h, m, n); 572 r = int128_make128(l, h); 573 if (neg) { 574 r = int128_neg(r); 575 } 576 if (a) { 577 t = int128_exts64(a); 578 t = int128_lshift(t, 63); 579 r = int128_add(r, t); 580 } 581 if (round) { 582 t = int128_exts64(1ll << 62); 583 r = int128_add(r, t); 584 } 585 r = int128_rshift(r, 63); 586 587 return do_sat128_d(r); 588 } 589 590 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 591 void *va, uint32_t desc) 592 { 593 intptr_t i, opr_sz = simd_oprsz(desc); 594 int64_t *d = vd, *n = vn, *m = vm, *a = va; 595 596 for (i = 0; i < opr_sz / 8; ++i) { 597 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 598 } 599 } 600 601 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 602 void *va, uint32_t desc) 603 { 604 intptr_t i, opr_sz = simd_oprsz(desc); 605 int64_t *d = vd, *n = vn, *m = vm, *a = va; 606 607 for (i = 0; i < opr_sz / 8; ++i) { 608 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 609 } 610 } 611 612 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 613 { 614 intptr_t i, opr_sz = simd_oprsz(desc); 615 int64_t *d = vd, *n = vn, *m = vm; 616 617 for (i = 0; i < opr_sz / 8; ++i) { 618 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 619 } 620 } 621 622 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int64_t *d = vd, *n = vn, *m = vm; 626 627 for (i = 0; i < opr_sz / 8; ++i) { 628 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 629 } 630 } 631 632 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 633 { 634 intptr_t i, j, opr_sz = simd_oprsz(desc); 635 int idx = simd_data(desc); 636 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 637 638 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 639 int64_t mm = m[i]; 640 for (j = 0; j < 16 / 8; ++j) { 641 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 642 } 643 } 644 } 645 646 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 647 { 648 intptr_t i, j, opr_sz = simd_oprsz(desc); 649 int idx = simd_data(desc); 650 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 651 652 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 653 int64_t mm = m[i]; 654 for (j = 0; j < 16 / 8; ++j) { 655 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 656 } 657 } 658 } 659 660 /* Integer 8 and 16-bit dot-product. 661 * 662 * Note that for the loops herein, host endianness does not matter 663 * with respect to the ordering of data within the quad-width lanes. 664 * All elements are treated equally, no matter where they are. 665 */ 666 667 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 668 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 669 { \ 670 intptr_t i, opr_sz = simd_oprsz(desc); \ 671 TYPED *d = vd, *a = va; \ 672 TYPEN *n = vn; \ 673 TYPEM *m = vm; \ 674 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 675 d[i] = (a[i] + \ 676 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 677 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 678 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 679 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 680 } \ 681 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 682 } 683 684 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 685 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 686 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 687 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 688 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 689 690 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 691 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 692 { \ 693 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 694 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 695 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 696 intptr_t index = simd_data(desc); \ 697 TYPED *d = vd, *a = va; \ 698 TYPEN *n = vn; \ 699 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 700 do { \ 701 TYPED m0 = m_indexed[i * 4 + 0]; \ 702 TYPED m1 = m_indexed[i * 4 + 1]; \ 703 TYPED m2 = m_indexed[i * 4 + 2]; \ 704 TYPED m3 = m_indexed[i * 4 + 3]; \ 705 do { \ 706 d[i] = (a[i] + \ 707 n[i * 4 + 0] * m0 + \ 708 n[i * 4 + 1] * m1 + \ 709 n[i * 4 + 2] * m2 + \ 710 n[i * 4 + 3] * m3); \ 711 } while (++i < segend); \ 712 segend = i + 4; \ 713 } while (i < opr_sz_n); \ 714 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 715 } 716 717 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 718 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 719 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 720 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 721 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 722 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 723 724 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 725 void *vfpst, uint32_t desc) 726 { 727 uintptr_t opr_sz = simd_oprsz(desc); 728 float16 *d = vd; 729 float16 *n = vn; 730 float16 *m = vm; 731 float_status *fpst = vfpst; 732 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 733 uint32_t neg_imag = neg_real ^ 1; 734 uintptr_t i; 735 736 /* Shift boolean to the sign bit so we can xor to negate. */ 737 neg_real <<= 15; 738 neg_imag <<= 15; 739 740 for (i = 0; i < opr_sz / 2; i += 2) { 741 float16 e0 = n[H2(i)]; 742 float16 e1 = m[H2(i + 1)] ^ neg_imag; 743 float16 e2 = n[H2(i + 1)]; 744 float16 e3 = m[H2(i)] ^ neg_real; 745 746 d[H2(i)] = float16_add(e0, e1, fpst); 747 d[H2(i + 1)] = float16_add(e2, e3, fpst); 748 } 749 clear_tail(d, opr_sz, simd_maxsz(desc)); 750 } 751 752 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 753 void *vfpst, uint32_t desc) 754 { 755 uintptr_t opr_sz = simd_oprsz(desc); 756 float32 *d = vd; 757 float32 *n = vn; 758 float32 *m = vm; 759 float_status *fpst = vfpst; 760 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 761 uint32_t neg_imag = neg_real ^ 1; 762 uintptr_t i; 763 764 /* Shift boolean to the sign bit so we can xor to negate. */ 765 neg_real <<= 31; 766 neg_imag <<= 31; 767 768 for (i = 0; i < opr_sz / 4; i += 2) { 769 float32 e0 = n[H4(i)]; 770 float32 e1 = m[H4(i + 1)] ^ neg_imag; 771 float32 e2 = n[H4(i + 1)]; 772 float32 e3 = m[H4(i)] ^ neg_real; 773 774 d[H4(i)] = float32_add(e0, e1, fpst); 775 d[H4(i + 1)] = float32_add(e2, e3, fpst); 776 } 777 clear_tail(d, opr_sz, simd_maxsz(desc)); 778 } 779 780 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 781 void *vfpst, uint32_t desc) 782 { 783 uintptr_t opr_sz = simd_oprsz(desc); 784 float64 *d = vd; 785 float64 *n = vn; 786 float64 *m = vm; 787 float_status *fpst = vfpst; 788 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 789 uint64_t neg_imag = neg_real ^ 1; 790 uintptr_t i; 791 792 /* Shift boolean to the sign bit so we can xor to negate. */ 793 neg_real <<= 63; 794 neg_imag <<= 63; 795 796 for (i = 0; i < opr_sz / 8; i += 2) { 797 float64 e0 = n[i]; 798 float64 e1 = m[i + 1] ^ neg_imag; 799 float64 e2 = n[i + 1]; 800 float64 e3 = m[i] ^ neg_real; 801 802 d[i] = float64_add(e0, e1, fpst); 803 d[i + 1] = float64_add(e2, e3, fpst); 804 } 805 clear_tail(d, opr_sz, simd_maxsz(desc)); 806 } 807 808 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 809 void *vfpst, uint32_t desc) 810 { 811 uintptr_t opr_sz = simd_oprsz(desc); 812 float16 *d = vd, *n = vn, *m = vm, *a = va; 813 float_status *fpst = vfpst; 814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 816 uint32_t neg_real = flip ^ neg_imag; 817 uintptr_t i; 818 819 /* Shift boolean to the sign bit so we can xor to negate. */ 820 neg_real <<= 15; 821 neg_imag <<= 15; 822 823 for (i = 0; i < opr_sz / 2; i += 2) { 824 float16 e2 = n[H2(i + flip)]; 825 float16 e1 = m[H2(i + flip)] ^ neg_real; 826 float16 e4 = e2; 827 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 828 829 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 830 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 831 } 832 clear_tail(d, opr_sz, simd_maxsz(desc)); 833 } 834 835 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 836 void *vfpst, uint32_t desc) 837 { 838 uintptr_t opr_sz = simd_oprsz(desc); 839 float16 *d = vd, *n = vn, *m = vm, *a = va; 840 float_status *fpst = vfpst; 841 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 842 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 843 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 844 uint32_t neg_real = flip ^ neg_imag; 845 intptr_t elements = opr_sz / sizeof(float16); 846 intptr_t eltspersegment = 16 / sizeof(float16); 847 intptr_t i, j; 848 849 /* Shift boolean to the sign bit so we can xor to negate. */ 850 neg_real <<= 15; 851 neg_imag <<= 15; 852 853 for (i = 0; i < elements; i += eltspersegment) { 854 float16 mr = m[H2(i + 2 * index + 0)]; 855 float16 mi = m[H2(i + 2 * index + 1)]; 856 float16 e1 = neg_real ^ (flip ? mi : mr); 857 float16 e3 = neg_imag ^ (flip ? mr : mi); 858 859 for (j = i; j < i + eltspersegment; j += 2) { 860 float16 e2 = n[H2(j + flip)]; 861 float16 e4 = e2; 862 863 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 864 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 865 } 866 } 867 clear_tail(d, opr_sz, simd_maxsz(desc)); 868 } 869 870 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 871 void *vfpst, uint32_t desc) 872 { 873 uintptr_t opr_sz = simd_oprsz(desc); 874 float32 *d = vd, *n = vn, *m = vm, *a = va; 875 float_status *fpst = vfpst; 876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 878 uint32_t neg_real = flip ^ neg_imag; 879 uintptr_t i; 880 881 /* Shift boolean to the sign bit so we can xor to negate. */ 882 neg_real <<= 31; 883 neg_imag <<= 31; 884 885 for (i = 0; i < opr_sz / 4; i += 2) { 886 float32 e2 = n[H4(i + flip)]; 887 float32 e1 = m[H4(i + flip)] ^ neg_real; 888 float32 e4 = e2; 889 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 890 891 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 892 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 893 } 894 clear_tail(d, opr_sz, simd_maxsz(desc)); 895 } 896 897 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 898 void *vfpst, uint32_t desc) 899 { 900 uintptr_t opr_sz = simd_oprsz(desc); 901 float32 *d = vd, *n = vn, *m = vm, *a = va; 902 float_status *fpst = vfpst; 903 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 904 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 905 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 906 uint32_t neg_real = flip ^ neg_imag; 907 intptr_t elements = opr_sz / sizeof(float32); 908 intptr_t eltspersegment = 16 / sizeof(float32); 909 intptr_t i, j; 910 911 /* Shift boolean to the sign bit so we can xor to negate. */ 912 neg_real <<= 31; 913 neg_imag <<= 31; 914 915 for (i = 0; i < elements; i += eltspersegment) { 916 float32 mr = m[H4(i + 2 * index + 0)]; 917 float32 mi = m[H4(i + 2 * index + 1)]; 918 float32 e1 = neg_real ^ (flip ? mi : mr); 919 float32 e3 = neg_imag ^ (flip ? mr : mi); 920 921 for (j = i; j < i + eltspersegment; j += 2) { 922 float32 e2 = n[H4(j + flip)]; 923 float32 e4 = e2; 924 925 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 926 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 927 } 928 } 929 clear_tail(d, opr_sz, simd_maxsz(desc)); 930 } 931 932 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 933 void *vfpst, uint32_t desc) 934 { 935 uintptr_t opr_sz = simd_oprsz(desc); 936 float64 *d = vd, *n = vn, *m = vm, *a = va; 937 float_status *fpst = vfpst; 938 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 939 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 940 uint64_t neg_real = flip ^ neg_imag; 941 uintptr_t i; 942 943 /* Shift boolean to the sign bit so we can xor to negate. */ 944 neg_real <<= 63; 945 neg_imag <<= 63; 946 947 for (i = 0; i < opr_sz / 8; i += 2) { 948 float64 e2 = n[i + flip]; 949 float64 e1 = m[i + flip] ^ neg_real; 950 float64 e4 = e2; 951 float64 e3 = m[i + 1 - flip] ^ neg_imag; 952 953 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 954 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 955 } 956 clear_tail(d, opr_sz, simd_maxsz(desc)); 957 } 958 959 /* 960 * Floating point comparisons producing an integer result (all 1s or all 0s). 961 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 962 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 963 */ 964 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 965 { 966 return -float16_eq_quiet(op1, op2, stat); 967 } 968 969 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 970 { 971 return -float32_eq_quiet(op1, op2, stat); 972 } 973 974 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 975 { 976 return -float16_le(op2, op1, stat); 977 } 978 979 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 980 { 981 return -float32_le(op2, op1, stat); 982 } 983 984 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 985 { 986 return -float16_lt(op2, op1, stat); 987 } 988 989 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 990 { 991 return -float32_lt(op2, op1, stat); 992 } 993 994 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 995 { 996 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 997 } 998 999 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1000 { 1001 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1002 } 1003 1004 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1005 { 1006 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1007 } 1008 1009 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1010 { 1011 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1012 } 1013 1014 static int16_t vfp_tosszh(float16 x, void *fpstp) 1015 { 1016 float_status *fpst = fpstp; 1017 if (float16_is_any_nan(x)) { 1018 float_raise(float_flag_invalid, fpst); 1019 return 0; 1020 } 1021 return float16_to_int16_round_to_zero(x, fpst); 1022 } 1023 1024 static uint16_t vfp_touszh(float16 x, void *fpstp) 1025 { 1026 float_status *fpst = fpstp; 1027 if (float16_is_any_nan(x)) { 1028 float_raise(float_flag_invalid, fpst); 1029 return 0; 1030 } 1031 return float16_to_uint16_round_to_zero(x, fpst); 1032 } 1033 1034 #define DO_2OP(NAME, FUNC, TYPE) \ 1035 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1036 { \ 1037 intptr_t i, oprsz = simd_oprsz(desc); \ 1038 TYPE *d = vd, *n = vn; \ 1039 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1040 d[i] = FUNC(n[i], stat); \ 1041 } \ 1042 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1043 } 1044 1045 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1046 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1047 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1048 1049 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1050 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1051 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1052 1053 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1054 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1055 1056 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1057 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1058 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1059 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1060 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1061 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1062 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1063 DO_2OP(gvec_touszh, vfp_touszh, float16) 1064 1065 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1066 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1067 { \ 1068 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1069 } 1070 1071 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1072 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1073 { \ 1074 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1075 } 1076 1077 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1078 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1079 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1080 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1081 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1082 1083 DO_2OP_CMP0(cgt, cgt, FWD) 1084 DO_2OP_CMP0(cge, cge, FWD) 1085 DO_2OP_CMP0(ceq, ceq, FWD) 1086 DO_2OP_CMP0(clt, cgt, REV) 1087 DO_2OP_CMP0(cle, cge, REV) 1088 1089 #undef DO_2OP 1090 #undef DO_2OP_CMP0 1091 1092 /* Floating-point trigonometric starting value. 1093 * See the ARM ARM pseudocode function FPTrigSMul. 1094 */ 1095 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1096 { 1097 float16 result = float16_mul(op1, op1, stat); 1098 if (!float16_is_any_nan(result)) { 1099 result = float16_set_sign(result, op2 & 1); 1100 } 1101 return result; 1102 } 1103 1104 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1105 { 1106 float32 result = float32_mul(op1, op1, stat); 1107 if (!float32_is_any_nan(result)) { 1108 result = float32_set_sign(result, op2 & 1); 1109 } 1110 return result; 1111 } 1112 1113 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1114 { 1115 float64 result = float64_mul(op1, op1, stat); 1116 if (!float64_is_any_nan(result)) { 1117 result = float64_set_sign(result, op2 & 1); 1118 } 1119 return result; 1120 } 1121 1122 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1123 { 1124 return float16_abs(float16_sub(op1, op2, stat)); 1125 } 1126 1127 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1128 { 1129 return float32_abs(float32_sub(op1, op2, stat)); 1130 } 1131 1132 /* 1133 * Reciprocal step. These are the AArch32 version which uses a 1134 * non-fused multiply-and-subtract. 1135 */ 1136 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1137 { 1138 op1 = float16_squash_input_denormal(op1, stat); 1139 op2 = float16_squash_input_denormal(op2, stat); 1140 1141 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1142 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1143 return float16_two; 1144 } 1145 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1146 } 1147 1148 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1149 { 1150 op1 = float32_squash_input_denormal(op1, stat); 1151 op2 = float32_squash_input_denormal(op2, stat); 1152 1153 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1154 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1155 return float32_two; 1156 } 1157 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1158 } 1159 1160 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1161 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1162 { 1163 op1 = float16_squash_input_denormal(op1, stat); 1164 op2 = float16_squash_input_denormal(op2, stat); 1165 1166 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1167 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1168 return float16_one_point_five; 1169 } 1170 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1171 return float16_div(op1, float16_two, stat); 1172 } 1173 1174 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1175 { 1176 op1 = float32_squash_input_denormal(op1, stat); 1177 op2 = float32_squash_input_denormal(op2, stat); 1178 1179 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1180 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1181 return float32_one_point_five; 1182 } 1183 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1184 return float32_div(op1, float32_two, stat); 1185 } 1186 1187 #define DO_3OP(NAME, FUNC, TYPE) \ 1188 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1189 { \ 1190 intptr_t i, oprsz = simd_oprsz(desc); \ 1191 TYPE *d = vd, *n = vn, *m = vm; \ 1192 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1193 d[i] = FUNC(n[i], m[i], stat); \ 1194 } \ 1195 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1196 } 1197 1198 DO_3OP(gvec_fadd_h, float16_add, float16) 1199 DO_3OP(gvec_fadd_s, float32_add, float32) 1200 DO_3OP(gvec_fadd_d, float64_add, float64) 1201 1202 DO_3OP(gvec_fsub_h, float16_sub, float16) 1203 DO_3OP(gvec_fsub_s, float32_sub, float32) 1204 DO_3OP(gvec_fsub_d, float64_sub, float64) 1205 1206 DO_3OP(gvec_fmul_h, float16_mul, float16) 1207 DO_3OP(gvec_fmul_s, float32_mul, float32) 1208 DO_3OP(gvec_fmul_d, float64_mul, float64) 1209 1210 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1211 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1212 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1213 1214 DO_3OP(gvec_fabd_h, float16_abd, float16) 1215 DO_3OP(gvec_fabd_s, float32_abd, float32) 1216 1217 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1218 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1219 1220 DO_3OP(gvec_fcge_h, float16_cge, float16) 1221 DO_3OP(gvec_fcge_s, float32_cge, float32) 1222 1223 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1224 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1225 1226 DO_3OP(gvec_facge_h, float16_acge, float16) 1227 DO_3OP(gvec_facge_s, float32_acge, float32) 1228 1229 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1230 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1231 1232 DO_3OP(gvec_fmax_h, float16_max, float16) 1233 DO_3OP(gvec_fmax_s, float32_max, float32) 1234 1235 DO_3OP(gvec_fmin_h, float16_min, float16) 1236 DO_3OP(gvec_fmin_s, float32_min, float32) 1237 1238 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1239 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1240 1241 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1242 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1243 1244 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1245 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1246 1247 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1248 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1249 1250 #ifdef TARGET_AARCH64 1251 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1252 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1253 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1254 1255 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1256 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1257 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1258 1259 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1260 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1261 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1262 1263 #endif 1264 #undef DO_3OP 1265 1266 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1267 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1268 float_status *stat) 1269 { 1270 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1271 } 1272 1273 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1274 float_status *stat) 1275 { 1276 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1277 } 1278 1279 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1280 float_status *stat) 1281 { 1282 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1283 } 1284 1285 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1286 float_status *stat) 1287 { 1288 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1289 } 1290 1291 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1292 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1293 float_status *stat) 1294 { 1295 return float16_muladd(op1, op2, dest, 0, stat); 1296 } 1297 1298 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1299 float_status *stat) 1300 { 1301 return float32_muladd(op1, op2, dest, 0, stat); 1302 } 1303 1304 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1305 float_status *stat) 1306 { 1307 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1308 } 1309 1310 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1311 float_status *stat) 1312 { 1313 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1314 } 1315 1316 #define DO_MULADD(NAME, FUNC, TYPE) \ 1317 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1318 { \ 1319 intptr_t i, oprsz = simd_oprsz(desc); \ 1320 TYPE *d = vd, *n = vn, *m = vm; \ 1321 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1322 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1323 } \ 1324 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1325 } 1326 1327 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1328 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1329 1330 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1331 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1332 1333 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1334 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1335 1336 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1337 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1338 1339 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1340 * For AdvSIMD, there is of course only one such vector segment. 1341 */ 1342 1343 #define DO_MUL_IDX(NAME, TYPE, H) \ 1344 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1345 { \ 1346 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1347 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1348 intptr_t idx = simd_data(desc); \ 1349 TYPE *d = vd, *n = vn, *m = vm; \ 1350 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1351 TYPE mm = m[H(i + idx)]; \ 1352 for (j = 0; j < segment; j++) { \ 1353 d[i + j] = n[i + j] * mm; \ 1354 } \ 1355 } \ 1356 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1357 } 1358 1359 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1360 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1361 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1362 1363 #undef DO_MUL_IDX 1364 1365 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1366 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1367 { \ 1368 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1369 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1370 intptr_t idx = simd_data(desc); \ 1371 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1372 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1373 TYPE mm = m[H(i + idx)]; \ 1374 for (j = 0; j < segment; j++) { \ 1375 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1376 } \ 1377 } \ 1378 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1379 } 1380 1381 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1382 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1383 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1384 1385 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1386 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1387 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1388 1389 #undef DO_MLA_IDX 1390 1391 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1392 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1393 { \ 1394 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1395 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1396 intptr_t idx = simd_data(desc); \ 1397 TYPE *d = vd, *n = vn, *m = vm; \ 1398 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1399 TYPE mm = m[H(i + idx)]; \ 1400 for (j = 0; j < segment; j++) { \ 1401 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1402 } \ 1403 } \ 1404 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1405 } 1406 1407 #define nop(N, M, S) (M) 1408 1409 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1410 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1411 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1412 1413 #ifdef TARGET_AARCH64 1414 1415 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1416 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1417 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1418 1419 #endif 1420 1421 #undef nop 1422 1423 /* 1424 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1425 * the fused ops below they assume accumulate both from and into Vd. 1426 */ 1427 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1428 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1429 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1430 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1431 1432 #undef DO_FMUL_IDX 1433 1434 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1435 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1436 void *stat, uint32_t desc) \ 1437 { \ 1438 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1439 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1440 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1441 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1442 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1443 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1444 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1445 TYPE mm = m[H(i + idx)]; \ 1446 for (j = 0; j < segment; j++) { \ 1447 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1448 mm, a[i + j], 0, stat); \ 1449 } \ 1450 } \ 1451 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1452 } 1453 1454 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1455 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1456 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1457 1458 #undef DO_FMLA_IDX 1459 1460 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1461 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1462 { \ 1463 intptr_t i, oprsz = simd_oprsz(desc); \ 1464 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1465 bool q = false; \ 1466 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1467 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1468 if (dd < MIN) { \ 1469 dd = MIN; \ 1470 q = true; \ 1471 } else if (dd > MAX) { \ 1472 dd = MAX; \ 1473 q = true; \ 1474 } \ 1475 d[i] = dd; \ 1476 } \ 1477 if (q) { \ 1478 uint32_t *qc = vq; \ 1479 qc[0] = 1; \ 1480 } \ 1481 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1482 } 1483 1484 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1485 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1486 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1487 1488 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1489 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1490 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1491 1492 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1493 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1494 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1495 1496 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1497 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1498 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1499 1500 #undef DO_SAT 1501 1502 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1503 void *vm, uint32_t desc) 1504 { 1505 intptr_t i, oprsz = simd_oprsz(desc); 1506 uint64_t *d = vd, *n = vn, *m = vm; 1507 bool q = false; 1508 1509 for (i = 0; i < oprsz / 8; i++) { 1510 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1511 if (dd < nn) { 1512 dd = UINT64_MAX; 1513 q = true; 1514 } 1515 d[i] = dd; 1516 } 1517 if (q) { 1518 uint32_t *qc = vq; 1519 qc[0] = 1; 1520 } 1521 clear_tail(d, oprsz, simd_maxsz(desc)); 1522 } 1523 1524 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1525 void *vm, uint32_t desc) 1526 { 1527 intptr_t i, oprsz = simd_oprsz(desc); 1528 uint64_t *d = vd, *n = vn, *m = vm; 1529 bool q = false; 1530 1531 for (i = 0; i < oprsz / 8; i++) { 1532 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1533 if (nn < mm) { 1534 dd = 0; 1535 q = true; 1536 } 1537 d[i] = dd; 1538 } 1539 if (q) { 1540 uint32_t *qc = vq; 1541 qc[0] = 1; 1542 } 1543 clear_tail(d, oprsz, simd_maxsz(desc)); 1544 } 1545 1546 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1547 void *vm, uint32_t desc) 1548 { 1549 intptr_t i, oprsz = simd_oprsz(desc); 1550 int64_t *d = vd, *n = vn, *m = vm; 1551 bool q = false; 1552 1553 for (i = 0; i < oprsz / 8; i++) { 1554 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1555 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1556 dd = (nn >> 63) ^ ~INT64_MIN; 1557 q = true; 1558 } 1559 d[i] = dd; 1560 } 1561 if (q) { 1562 uint32_t *qc = vq; 1563 qc[0] = 1; 1564 } 1565 clear_tail(d, oprsz, simd_maxsz(desc)); 1566 } 1567 1568 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1569 void *vm, uint32_t desc) 1570 { 1571 intptr_t i, oprsz = simd_oprsz(desc); 1572 int64_t *d = vd, *n = vn, *m = vm; 1573 bool q = false; 1574 1575 for (i = 0; i < oprsz / 8; i++) { 1576 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1577 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1578 dd = (nn >> 63) ^ ~INT64_MIN; 1579 q = true; 1580 } 1581 d[i] = dd; 1582 } 1583 if (q) { 1584 uint32_t *qc = vq; 1585 qc[0] = 1; 1586 } 1587 clear_tail(d, oprsz, simd_maxsz(desc)); 1588 } 1589 1590 1591 #define DO_SRA(NAME, TYPE) \ 1592 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1593 { \ 1594 intptr_t i, oprsz = simd_oprsz(desc); \ 1595 int shift = simd_data(desc); \ 1596 TYPE *d = vd, *n = vn; \ 1597 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1598 d[i] += n[i] >> shift; \ 1599 } \ 1600 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1601 } 1602 1603 DO_SRA(gvec_ssra_b, int8_t) 1604 DO_SRA(gvec_ssra_h, int16_t) 1605 DO_SRA(gvec_ssra_s, int32_t) 1606 DO_SRA(gvec_ssra_d, int64_t) 1607 1608 DO_SRA(gvec_usra_b, uint8_t) 1609 DO_SRA(gvec_usra_h, uint16_t) 1610 DO_SRA(gvec_usra_s, uint32_t) 1611 DO_SRA(gvec_usra_d, uint64_t) 1612 1613 #undef DO_SRA 1614 1615 #define DO_RSHR(NAME, TYPE) \ 1616 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1617 { \ 1618 intptr_t i, oprsz = simd_oprsz(desc); \ 1619 int shift = simd_data(desc); \ 1620 TYPE *d = vd, *n = vn; \ 1621 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1622 TYPE tmp = n[i] >> (shift - 1); \ 1623 d[i] = (tmp >> 1) + (tmp & 1); \ 1624 } \ 1625 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1626 } 1627 1628 DO_RSHR(gvec_srshr_b, int8_t) 1629 DO_RSHR(gvec_srshr_h, int16_t) 1630 DO_RSHR(gvec_srshr_s, int32_t) 1631 DO_RSHR(gvec_srshr_d, int64_t) 1632 1633 DO_RSHR(gvec_urshr_b, uint8_t) 1634 DO_RSHR(gvec_urshr_h, uint16_t) 1635 DO_RSHR(gvec_urshr_s, uint32_t) 1636 DO_RSHR(gvec_urshr_d, uint64_t) 1637 1638 #undef DO_RSHR 1639 1640 #define DO_RSRA(NAME, TYPE) \ 1641 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1642 { \ 1643 intptr_t i, oprsz = simd_oprsz(desc); \ 1644 int shift = simd_data(desc); \ 1645 TYPE *d = vd, *n = vn; \ 1646 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1647 TYPE tmp = n[i] >> (shift - 1); \ 1648 d[i] += (tmp >> 1) + (tmp & 1); \ 1649 } \ 1650 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1651 } 1652 1653 DO_RSRA(gvec_srsra_b, int8_t) 1654 DO_RSRA(gvec_srsra_h, int16_t) 1655 DO_RSRA(gvec_srsra_s, int32_t) 1656 DO_RSRA(gvec_srsra_d, int64_t) 1657 1658 DO_RSRA(gvec_ursra_b, uint8_t) 1659 DO_RSRA(gvec_ursra_h, uint16_t) 1660 DO_RSRA(gvec_ursra_s, uint32_t) 1661 DO_RSRA(gvec_ursra_d, uint64_t) 1662 1663 #undef DO_RSRA 1664 1665 #define DO_SRI(NAME, TYPE) \ 1666 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1667 { \ 1668 intptr_t i, oprsz = simd_oprsz(desc); \ 1669 int shift = simd_data(desc); \ 1670 TYPE *d = vd, *n = vn; \ 1671 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1672 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1673 } \ 1674 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1675 } 1676 1677 DO_SRI(gvec_sri_b, uint8_t) 1678 DO_SRI(gvec_sri_h, uint16_t) 1679 DO_SRI(gvec_sri_s, uint32_t) 1680 DO_SRI(gvec_sri_d, uint64_t) 1681 1682 #undef DO_SRI 1683 1684 #define DO_SLI(NAME, TYPE) \ 1685 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1686 { \ 1687 intptr_t i, oprsz = simd_oprsz(desc); \ 1688 int shift = simd_data(desc); \ 1689 TYPE *d = vd, *n = vn; \ 1690 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1691 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1692 } \ 1693 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1694 } 1695 1696 DO_SLI(gvec_sli_b, uint8_t) 1697 DO_SLI(gvec_sli_h, uint16_t) 1698 DO_SLI(gvec_sli_s, uint32_t) 1699 DO_SLI(gvec_sli_d, uint64_t) 1700 1701 #undef DO_SLI 1702 1703 /* 1704 * Convert float16 to float32, raising no exceptions and 1705 * preserving exceptional values, including SNaN. 1706 * This is effectively an unpack+repack operation. 1707 */ 1708 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1709 { 1710 const int f16_bias = 15; 1711 const int f32_bias = 127; 1712 uint32_t sign = extract32(f16, 15, 1); 1713 uint32_t exp = extract32(f16, 10, 5); 1714 uint32_t frac = extract32(f16, 0, 10); 1715 1716 if (exp == 0x1f) { 1717 /* Inf or NaN */ 1718 exp = 0xff; 1719 } else if (exp == 0) { 1720 /* Zero or denormal. */ 1721 if (frac != 0) { 1722 if (fz16) { 1723 frac = 0; 1724 } else { 1725 /* 1726 * Denormal; these are all normal float32. 1727 * Shift the fraction so that the msb is at bit 11, 1728 * then remove bit 11 as the implicit bit of the 1729 * normalized float32. Note that we still go through 1730 * the shift for normal numbers below, to put the 1731 * float32 fraction at the right place. 1732 */ 1733 int shift = clz32(frac) - 21; 1734 frac = (frac << shift) & 0x3ff; 1735 exp = f32_bias - f16_bias - shift + 1; 1736 } 1737 } 1738 } else { 1739 /* Normal number; adjust the bias. */ 1740 exp += f32_bias - f16_bias; 1741 } 1742 sign <<= 31; 1743 exp <<= 23; 1744 frac <<= 23 - 10; 1745 1746 return sign | exp | frac; 1747 } 1748 1749 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 1750 { 1751 /* 1752 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 1753 * Load the 2nd qword iff is_q & is_2. 1754 * Shift to the 2nd dword iff !is_q & is_2. 1755 * For !is_q & !is_2, the upper bits of the result are garbage. 1756 */ 1757 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 1758 } 1759 1760 /* 1761 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 1762 * as there is not yet SVE versions that might use blocking. 1763 */ 1764 1765 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 1766 uint32_t desc, bool fz16) 1767 { 1768 intptr_t i, oprsz = simd_oprsz(desc); 1769 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1770 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1771 int is_q = oprsz == 16; 1772 uint64_t n_4, m_4; 1773 1774 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1775 n_4 = load4_f16(vn, is_q, is_2); 1776 m_4 = load4_f16(vm, is_q, is_2); 1777 1778 /* Negate all inputs for FMLSL at once. */ 1779 if (is_s) { 1780 n_4 ^= 0x8000800080008000ull; 1781 } 1782 1783 for (i = 0; i < oprsz / 4; i++) { 1784 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1785 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1786 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1787 } 1788 clear_tail(d, oprsz, simd_maxsz(desc)); 1789 } 1790 1791 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1792 void *venv, uint32_t desc) 1793 { 1794 CPUARMState *env = venv; 1795 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1796 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1797 } 1798 1799 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1800 void *venv, uint32_t desc) 1801 { 1802 CPUARMState *env = venv; 1803 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1804 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1805 } 1806 1807 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 1808 void *venv, uint32_t desc) 1809 { 1810 intptr_t i, oprsz = simd_oprsz(desc); 1811 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1812 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1813 CPUARMState *env = venv; 1814 float_status *status = &env->vfp.fp_status; 1815 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1816 1817 for (i = 0; i < oprsz; i += sizeof(float32)) { 1818 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 1819 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 1820 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1821 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1822 float32 aa = *(float32 *)(va + H1_4(i)); 1823 1824 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 1825 } 1826 } 1827 1828 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 1829 uint32_t desc, bool fz16) 1830 { 1831 intptr_t i, oprsz = simd_oprsz(desc); 1832 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1833 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1834 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 1835 int is_q = oprsz == 16; 1836 uint64_t n_4; 1837 float32 m_1; 1838 1839 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1840 n_4 = load4_f16(vn, is_q, is_2); 1841 1842 /* Negate all inputs for FMLSL at once. */ 1843 if (is_s) { 1844 n_4 ^= 0x8000800080008000ull; 1845 } 1846 1847 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 1848 1849 for (i = 0; i < oprsz / 4; i++) { 1850 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1851 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1852 } 1853 clear_tail(d, oprsz, simd_maxsz(desc)); 1854 } 1855 1856 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 1857 void *venv, uint32_t desc) 1858 { 1859 CPUARMState *env = venv; 1860 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1861 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1862 } 1863 1864 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 1865 void *venv, uint32_t desc) 1866 { 1867 CPUARMState *env = venv; 1868 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 1869 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1870 } 1871 1872 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 1873 void *venv, uint32_t desc) 1874 { 1875 intptr_t i, j, oprsz = simd_oprsz(desc); 1876 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1877 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1878 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 1879 CPUARMState *env = venv; 1880 float_status *status = &env->vfp.fp_status; 1881 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 1882 1883 for (i = 0; i < oprsz; i += 16) { 1884 float16 mm_16 = *(float16 *)(vm + i + idx); 1885 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 1886 1887 for (j = 0; j < 16; j += sizeof(float32)) { 1888 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 1889 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 1890 float32 aa = *(float32 *)(va + H1_4(i + j)); 1891 1892 *(float32 *)(vd + H1_4(i + j)) = 1893 float32_muladd(nn, mm, aa, 0, status); 1894 } 1895 } 1896 } 1897 1898 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1899 { 1900 intptr_t i, opr_sz = simd_oprsz(desc); 1901 int8_t *d = vd, *n = vn, *m = vm; 1902 1903 for (i = 0; i < opr_sz; ++i) { 1904 int8_t mm = m[i]; 1905 int8_t nn = n[i]; 1906 int8_t res = 0; 1907 if (mm >= 0) { 1908 if (mm < 8) { 1909 res = nn << mm; 1910 } 1911 } else { 1912 res = nn >> (mm > -8 ? -mm : 7); 1913 } 1914 d[i] = res; 1915 } 1916 clear_tail(d, opr_sz, simd_maxsz(desc)); 1917 } 1918 1919 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1920 { 1921 intptr_t i, opr_sz = simd_oprsz(desc); 1922 int16_t *d = vd, *n = vn, *m = vm; 1923 1924 for (i = 0; i < opr_sz / 2; ++i) { 1925 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1926 int16_t nn = n[i]; 1927 int16_t res = 0; 1928 if (mm >= 0) { 1929 if (mm < 16) { 1930 res = nn << mm; 1931 } 1932 } else { 1933 res = nn >> (mm > -16 ? -mm : 15); 1934 } 1935 d[i] = res; 1936 } 1937 clear_tail(d, opr_sz, simd_maxsz(desc)); 1938 } 1939 1940 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 1941 { 1942 intptr_t i, opr_sz = simd_oprsz(desc); 1943 uint8_t *d = vd, *n = vn, *m = vm; 1944 1945 for (i = 0; i < opr_sz; ++i) { 1946 int8_t mm = m[i]; 1947 uint8_t nn = n[i]; 1948 uint8_t res = 0; 1949 if (mm >= 0) { 1950 if (mm < 8) { 1951 res = nn << mm; 1952 } 1953 } else { 1954 if (mm > -8) { 1955 res = nn >> -mm; 1956 } 1957 } 1958 d[i] = res; 1959 } 1960 clear_tail(d, opr_sz, simd_maxsz(desc)); 1961 } 1962 1963 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 1964 { 1965 intptr_t i, opr_sz = simd_oprsz(desc); 1966 uint16_t *d = vd, *n = vn, *m = vm; 1967 1968 for (i = 0; i < opr_sz / 2; ++i) { 1969 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 1970 uint16_t nn = n[i]; 1971 uint16_t res = 0; 1972 if (mm >= 0) { 1973 if (mm < 16) { 1974 res = nn << mm; 1975 } 1976 } else { 1977 if (mm > -16) { 1978 res = nn >> -mm; 1979 } 1980 } 1981 d[i] = res; 1982 } 1983 clear_tail(d, opr_sz, simd_maxsz(desc)); 1984 } 1985 1986 /* 1987 * 8x8->8 polynomial multiply. 1988 * 1989 * Polynomial multiplication is like integer multiplication except the 1990 * partial products are XORed, not added. 1991 * 1992 * TODO: expose this as a generic vector operation, as it is a common 1993 * crypto building block. 1994 */ 1995 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 1996 { 1997 intptr_t i, opr_sz = simd_oprsz(desc); 1998 uint64_t *d = vd, *n = vn, *m = vm; 1999 2000 for (i = 0; i < opr_sz / 8; ++i) { 2001 d[i] = clmul_8x8_low(n[i], m[i]); 2002 } 2003 clear_tail(d, opr_sz, simd_maxsz(desc)); 2004 } 2005 2006 /* 2007 * 64x64->128 polynomial multiply. 2008 * Because of the lanes are not accessed in strict columns, 2009 * this probably cannot be turned into a generic helper. 2010 */ 2011 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2012 { 2013 intptr_t i, opr_sz = simd_oprsz(desc); 2014 intptr_t hi = simd_data(desc); 2015 uint64_t *d = vd, *n = vn, *m = vm; 2016 2017 for (i = 0; i < opr_sz / 8; i += 2) { 2018 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2019 d[i] = int128_getlo(r); 2020 d[i + 1] = int128_gethi(r); 2021 } 2022 clear_tail(d, opr_sz, simd_maxsz(desc)); 2023 } 2024 2025 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2026 { 2027 int hi = simd_data(desc); 2028 uint64_t *d = vd, *n = vn, *m = vm; 2029 uint64_t nn = n[hi], mm = m[hi]; 2030 2031 d[0] = clmul_8x4_packed(nn, mm); 2032 nn >>= 32; 2033 mm >>= 32; 2034 d[1] = clmul_8x4_packed(nn, mm); 2035 2036 clear_tail(d, 16, simd_maxsz(desc)); 2037 } 2038 2039 #ifdef TARGET_AARCH64 2040 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2041 { 2042 int shift = simd_data(desc) * 8; 2043 intptr_t i, opr_sz = simd_oprsz(desc); 2044 uint64_t *d = vd, *n = vn, *m = vm; 2045 2046 for (i = 0; i < opr_sz / 8; ++i) { 2047 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2048 } 2049 } 2050 2051 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2052 { 2053 intptr_t sel = H4(simd_data(desc)); 2054 intptr_t i, opr_sz = simd_oprsz(desc); 2055 uint32_t *n = vn, *m = vm; 2056 uint64_t *d = vd; 2057 2058 for (i = 0; i < opr_sz / 8; ++i) { 2059 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2060 } 2061 } 2062 #endif 2063 2064 #define DO_CMP0(NAME, TYPE, OP) \ 2065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2066 { \ 2067 intptr_t i, opr_sz = simd_oprsz(desc); \ 2068 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2069 TYPE nn = *(TYPE *)(vn + i); \ 2070 *(TYPE *)(vd + i) = -(nn OP 0); \ 2071 } \ 2072 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2073 } 2074 2075 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2076 DO_CMP0(gvec_clt0_b, int8_t, <) 2077 DO_CMP0(gvec_cle0_b, int8_t, <=) 2078 DO_CMP0(gvec_cgt0_b, int8_t, >) 2079 DO_CMP0(gvec_cge0_b, int8_t, >=) 2080 2081 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2082 DO_CMP0(gvec_clt0_h, int16_t, <) 2083 DO_CMP0(gvec_cle0_h, int16_t, <=) 2084 DO_CMP0(gvec_cgt0_h, int16_t, >) 2085 DO_CMP0(gvec_cge0_h, int16_t, >=) 2086 2087 #undef DO_CMP0 2088 2089 #define DO_ABD(NAME, TYPE) \ 2090 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2091 { \ 2092 intptr_t i, opr_sz = simd_oprsz(desc); \ 2093 TYPE *d = vd, *n = vn, *m = vm; \ 2094 \ 2095 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2096 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2097 } \ 2098 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2099 } 2100 2101 DO_ABD(gvec_sabd_b, int8_t) 2102 DO_ABD(gvec_sabd_h, int16_t) 2103 DO_ABD(gvec_sabd_s, int32_t) 2104 DO_ABD(gvec_sabd_d, int64_t) 2105 2106 DO_ABD(gvec_uabd_b, uint8_t) 2107 DO_ABD(gvec_uabd_h, uint16_t) 2108 DO_ABD(gvec_uabd_s, uint32_t) 2109 DO_ABD(gvec_uabd_d, uint64_t) 2110 2111 #undef DO_ABD 2112 2113 #define DO_ABA(NAME, TYPE) \ 2114 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2115 { \ 2116 intptr_t i, opr_sz = simd_oprsz(desc); \ 2117 TYPE *d = vd, *n = vn, *m = vm; \ 2118 \ 2119 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2120 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2121 } \ 2122 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2123 } 2124 2125 DO_ABA(gvec_saba_b, int8_t) 2126 DO_ABA(gvec_saba_h, int16_t) 2127 DO_ABA(gvec_saba_s, int32_t) 2128 DO_ABA(gvec_saba_d, int64_t) 2129 2130 DO_ABA(gvec_uaba_b, uint8_t) 2131 DO_ABA(gvec_uaba_h, uint16_t) 2132 DO_ABA(gvec_uaba_s, uint32_t) 2133 DO_ABA(gvec_uaba_d, uint64_t) 2134 2135 #undef DO_ABA 2136 2137 #define DO_NEON_PAIRWISE(NAME, OP) \ 2138 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \ 2139 void *stat, uint32_t oprsz) \ 2140 { \ 2141 float_status *fpst = stat; \ 2142 float32 *d = vd; \ 2143 float32 *n = vn; \ 2144 float32 *m = vm; \ 2145 float32 r0, r1; \ 2146 \ 2147 /* Read all inputs before writing outputs in case vm == vd */ \ 2148 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \ 2149 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \ 2150 \ 2151 d[H4(0)] = r0; \ 2152 d[H4(1)] = r1; \ 2153 } \ 2154 \ 2155 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \ 2156 void *stat, uint32_t oprsz) \ 2157 { \ 2158 float_status *fpst = stat; \ 2159 float16 *d = vd; \ 2160 float16 *n = vn; \ 2161 float16 *m = vm; \ 2162 float16 r0, r1, r2, r3; \ 2163 \ 2164 /* Read all inputs before writing outputs in case vm == vd */ \ 2165 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \ 2166 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \ 2167 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \ 2168 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \ 2169 \ 2170 d[H2(0)] = r0; \ 2171 d[H2(1)] = r1; \ 2172 d[H2(2)] = r2; \ 2173 d[H2(3)] = r3; \ 2174 } 2175 2176 DO_NEON_PAIRWISE(neon_padd, add) 2177 DO_NEON_PAIRWISE(neon_pmax, max) 2178 DO_NEON_PAIRWISE(neon_pmin, min) 2179 2180 #undef DO_NEON_PAIRWISE 2181 2182 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2183 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2184 { \ 2185 intptr_t i, oprsz = simd_oprsz(desc); \ 2186 int shift = simd_data(desc); \ 2187 TYPE *d = vd, *n = vn; \ 2188 float_status *fpst = stat; \ 2189 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2190 d[i] = FUNC(n[i], shift, fpst); \ 2191 } \ 2192 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2193 } 2194 2195 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2196 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2197 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2198 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2199 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2200 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2201 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2202 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2203 2204 #undef DO_VCVT_FIXED 2205 2206 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2207 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2208 { \ 2209 float_status *fpst = stat; \ 2210 intptr_t i, oprsz = simd_oprsz(desc); \ 2211 uint32_t rmode = simd_data(desc); \ 2212 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2213 TYPE *d = vd, *n = vn; \ 2214 set_float_rounding_mode(rmode, fpst); \ 2215 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2216 d[i] = FUNC(n[i], 0, fpst); \ 2217 } \ 2218 set_float_rounding_mode(prev_rmode, fpst); \ 2219 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2220 } 2221 2222 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2223 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2224 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2225 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2226 2227 #undef DO_VCVT_RMODE 2228 2229 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2230 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2231 { \ 2232 float_status *fpst = stat; \ 2233 intptr_t i, oprsz = simd_oprsz(desc); \ 2234 uint32_t rmode = simd_data(desc); \ 2235 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2236 TYPE *d = vd, *n = vn; \ 2237 set_float_rounding_mode(rmode, fpst); \ 2238 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2239 d[i] = FUNC(n[i], fpst); \ 2240 } \ 2241 set_float_rounding_mode(prev_rmode, fpst); \ 2242 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2243 } 2244 2245 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2246 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2247 2248 #undef DO_VRINT_RMODE 2249 2250 #ifdef TARGET_AARCH64 2251 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2252 { 2253 const uint8_t *indices = vm; 2254 CPUARMState *env = venv; 2255 size_t oprsz = simd_oprsz(desc); 2256 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2257 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2258 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2259 union { 2260 uint8_t b[16]; 2261 uint64_t d[2]; 2262 } result; 2263 2264 /* 2265 * We must construct the final result in a temp, lest the output 2266 * overlaps the input table. For TBL, begin with zero; for TBX, 2267 * begin with the original register contents. Note that we always 2268 * copy 16 bytes here to avoid an extra branch; clearing the high 2269 * bits of the register for oprsz == 8 is handled below. 2270 */ 2271 if (is_tbx) { 2272 memcpy(&result, vd, 16); 2273 } else { 2274 memset(&result, 0, 16); 2275 } 2276 2277 for (size_t i = 0; i < oprsz; ++i) { 2278 uint32_t index = indices[H1(i)]; 2279 2280 if (index < table_len) { 2281 /* 2282 * Convert index (a byte offset into the virtual table 2283 * which is a series of 128-bit vectors concatenated) 2284 * into the correct register element, bearing in mind 2285 * that the table can wrap around from V31 to V0. 2286 */ 2287 const uint8_t *table = (const uint8_t *) 2288 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2289 result.b[H1(i)] = table[H1(index % 16)]; 2290 } 2291 } 2292 2293 memcpy(vd, &result, 16); 2294 clear_tail(vd, oprsz, simd_maxsz(desc)); 2295 } 2296 #endif 2297 2298 /* 2299 * NxN -> N highpart multiply 2300 * 2301 * TODO: expose this as a generic vector operation. 2302 */ 2303 2304 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2305 { 2306 intptr_t i, opr_sz = simd_oprsz(desc); 2307 int8_t *d = vd, *n = vn, *m = vm; 2308 2309 for (i = 0; i < opr_sz; ++i) { 2310 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2311 } 2312 clear_tail(d, opr_sz, simd_maxsz(desc)); 2313 } 2314 2315 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2316 { 2317 intptr_t i, opr_sz = simd_oprsz(desc); 2318 int16_t *d = vd, *n = vn, *m = vm; 2319 2320 for (i = 0; i < opr_sz / 2; ++i) { 2321 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2322 } 2323 clear_tail(d, opr_sz, simd_maxsz(desc)); 2324 } 2325 2326 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2327 { 2328 intptr_t i, opr_sz = simd_oprsz(desc); 2329 int32_t *d = vd, *n = vn, *m = vm; 2330 2331 for (i = 0; i < opr_sz / 4; ++i) { 2332 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2333 } 2334 clear_tail(d, opr_sz, simd_maxsz(desc)); 2335 } 2336 2337 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2338 { 2339 intptr_t i, opr_sz = simd_oprsz(desc); 2340 uint64_t *d = vd, *n = vn, *m = vm; 2341 uint64_t discard; 2342 2343 for (i = 0; i < opr_sz / 8; ++i) { 2344 muls64(&discard, &d[i], n[i], m[i]); 2345 } 2346 clear_tail(d, opr_sz, simd_maxsz(desc)); 2347 } 2348 2349 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2350 { 2351 intptr_t i, opr_sz = simd_oprsz(desc); 2352 uint8_t *d = vd, *n = vn, *m = vm; 2353 2354 for (i = 0; i < opr_sz; ++i) { 2355 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2356 } 2357 clear_tail(d, opr_sz, simd_maxsz(desc)); 2358 } 2359 2360 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2361 { 2362 intptr_t i, opr_sz = simd_oprsz(desc); 2363 uint16_t *d = vd, *n = vn, *m = vm; 2364 2365 for (i = 0; i < opr_sz / 2; ++i) { 2366 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2367 } 2368 clear_tail(d, opr_sz, simd_maxsz(desc)); 2369 } 2370 2371 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2372 { 2373 intptr_t i, opr_sz = simd_oprsz(desc); 2374 uint32_t *d = vd, *n = vn, *m = vm; 2375 2376 for (i = 0; i < opr_sz / 4; ++i) { 2377 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2378 } 2379 clear_tail(d, opr_sz, simd_maxsz(desc)); 2380 } 2381 2382 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2383 { 2384 intptr_t i, opr_sz = simd_oprsz(desc); 2385 uint64_t *d = vd, *n = vn, *m = vm; 2386 uint64_t discard; 2387 2388 for (i = 0; i < opr_sz / 8; ++i) { 2389 mulu64(&discard, &d[i], n[i], m[i]); 2390 } 2391 clear_tail(d, opr_sz, simd_maxsz(desc)); 2392 } 2393 2394 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2395 { 2396 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2397 int shr = simd_data(desc); 2398 uint64_t *d = vd, *n = vn, *m = vm; 2399 2400 for (i = 0; i < opr_sz; ++i) { 2401 d[i] = ror64(n[i] ^ m[i], shr); 2402 } 2403 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2404 } 2405 2406 /* 2407 * Integer matrix-multiply accumulate 2408 */ 2409 2410 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2411 { 2412 int8_t *n = vn, *m = vm; 2413 2414 for (intptr_t k = 0; k < 8; ++k) { 2415 sum += n[H1(k)] * m[H1(k)]; 2416 } 2417 return sum; 2418 } 2419 2420 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2421 { 2422 uint8_t *n = vn, *m = vm; 2423 2424 for (intptr_t k = 0; k < 8; ++k) { 2425 sum += n[H1(k)] * m[H1(k)]; 2426 } 2427 return sum; 2428 } 2429 2430 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2431 { 2432 uint8_t *n = vn; 2433 int8_t *m = vm; 2434 2435 for (intptr_t k = 0; k < 8; ++k) { 2436 sum += n[H1(k)] * m[H1(k)]; 2437 } 2438 return sum; 2439 } 2440 2441 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2442 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2443 { 2444 intptr_t seg, opr_sz = simd_oprsz(desc); 2445 2446 for (seg = 0; seg < opr_sz; seg += 16) { 2447 uint32_t *d = vd + seg; 2448 uint32_t *a = va + seg; 2449 uint32_t sum0, sum1, sum2, sum3; 2450 2451 /* 2452 * Process the entire segment at once, writing back the 2453 * results only after we've consumed all of the inputs. 2454 * 2455 * Key to indices by column: 2456 * i j i j 2457 */ 2458 sum0 = a[H4(0 + 0)]; 2459 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2460 sum1 = a[H4(0 + 1)]; 2461 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2462 sum2 = a[H4(2 + 0)]; 2463 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2464 sum3 = a[H4(2 + 1)]; 2465 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2466 2467 d[H4(0)] = sum0; 2468 d[H4(1)] = sum1; 2469 d[H4(2)] = sum2; 2470 d[H4(3)] = sum3; 2471 } 2472 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2473 } 2474 2475 #define DO_MMLA_B(NAME, INNER) \ 2476 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2477 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2478 2479 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2480 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2481 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2482 2483 /* 2484 * BFloat16 Dot Product 2485 */ 2486 2487 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2488 { 2489 /* FPCR is ignored for BFDOT and BFMMLA. */ 2490 float_status bf_status = { 2491 .tininess_before_rounding = float_tininess_before_rounding, 2492 .float_rounding_mode = float_round_to_odd_inf, 2493 .flush_to_zero = true, 2494 .flush_inputs_to_zero = true, 2495 .default_nan_mode = true, 2496 }; 2497 float32 t1, t2; 2498 2499 /* 2500 * Extract each BFloat16 from the element pair, and shift 2501 * them such that they become float32. 2502 */ 2503 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2504 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2505 t1 = float32_add(t1, t2, &bf_status); 2506 t1 = float32_add(sum, t1, &bf_status); 2507 2508 return t1; 2509 } 2510 2511 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2512 { 2513 intptr_t i, opr_sz = simd_oprsz(desc); 2514 float32 *d = vd, *a = va; 2515 uint32_t *n = vn, *m = vm; 2516 2517 for (i = 0; i < opr_sz / 4; ++i) { 2518 d[i] = bfdotadd(a[i], n[i], m[i]); 2519 } 2520 clear_tail(d, opr_sz, simd_maxsz(desc)); 2521 } 2522 2523 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2524 void *va, uint32_t desc) 2525 { 2526 intptr_t i, j, opr_sz = simd_oprsz(desc); 2527 intptr_t index = simd_data(desc); 2528 intptr_t elements = opr_sz / 4; 2529 intptr_t eltspersegment = MIN(16 / 4, elements); 2530 float32 *d = vd, *a = va; 2531 uint32_t *n = vn, *m = vm; 2532 2533 for (i = 0; i < elements; i += eltspersegment) { 2534 uint32_t m_idx = m[i + H4(index)]; 2535 2536 for (j = i; j < i + eltspersegment; j++) { 2537 d[j] = bfdotadd(a[j], n[j], m_idx); 2538 } 2539 } 2540 clear_tail(d, opr_sz, simd_maxsz(desc)); 2541 } 2542 2543 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2544 { 2545 intptr_t s, opr_sz = simd_oprsz(desc); 2546 float32 *d = vd, *a = va; 2547 uint32_t *n = vn, *m = vm; 2548 2549 for (s = 0; s < opr_sz / 4; s += 4) { 2550 float32 sum00, sum01, sum10, sum11; 2551 2552 /* 2553 * Process the entire segment at once, writing back the 2554 * results only after we've consumed all of the inputs. 2555 * 2556 * Key to indices by column: 2557 * i j i k j k 2558 */ 2559 sum00 = a[s + H4(0 + 0)]; 2560 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2561 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2562 2563 sum01 = a[s + H4(0 + 1)]; 2564 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2565 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2566 2567 sum10 = a[s + H4(2 + 0)]; 2568 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2569 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2570 2571 sum11 = a[s + H4(2 + 1)]; 2572 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2573 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2574 2575 d[s + H4(0 + 0)] = sum00; 2576 d[s + H4(0 + 1)] = sum01; 2577 d[s + H4(2 + 0)] = sum10; 2578 d[s + H4(2 + 1)] = sum11; 2579 } 2580 clear_tail(d, opr_sz, simd_maxsz(desc)); 2581 } 2582 2583 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2584 void *stat, uint32_t desc) 2585 { 2586 intptr_t i, opr_sz = simd_oprsz(desc); 2587 intptr_t sel = simd_data(desc); 2588 float32 *d = vd, *a = va; 2589 bfloat16 *n = vn, *m = vm; 2590 2591 for (i = 0; i < opr_sz / 4; ++i) { 2592 float32 nn = n[H2(i * 2 + sel)] << 16; 2593 float32 mm = m[H2(i * 2 + sel)] << 16; 2594 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2595 } 2596 clear_tail(d, opr_sz, simd_maxsz(desc)); 2597 } 2598 2599 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2600 void *va, void *stat, uint32_t desc) 2601 { 2602 intptr_t i, j, opr_sz = simd_oprsz(desc); 2603 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2604 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2605 intptr_t elements = opr_sz / 4; 2606 intptr_t eltspersegment = MIN(16 / 4, elements); 2607 float32 *d = vd, *a = va; 2608 bfloat16 *n = vn, *m = vm; 2609 2610 for (i = 0; i < elements; i += eltspersegment) { 2611 float32 m_idx = m[H2(2 * i + index)] << 16; 2612 2613 for (j = i; j < i + eltspersegment; j++) { 2614 float32 n_j = n[H2(2 * j + sel)] << 16; 2615 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2616 } 2617 } 2618 clear_tail(d, opr_sz, simd_maxsz(desc)); 2619 } 2620 2621 #define DO_CLAMP(NAME, TYPE) \ 2622 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2623 { \ 2624 intptr_t i, opr_sz = simd_oprsz(desc); \ 2625 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2626 TYPE aa = *(TYPE *)(a + i); \ 2627 TYPE nn = *(TYPE *)(n + i); \ 2628 TYPE mm = *(TYPE *)(m + i); \ 2629 TYPE dd = MIN(MAX(aa, nn), mm); \ 2630 *(TYPE *)(d + i) = dd; \ 2631 } \ 2632 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2633 } 2634 2635 DO_CLAMP(gvec_sclamp_b, int8_t) 2636 DO_CLAMP(gvec_sclamp_h, int16_t) 2637 DO_CLAMP(gvec_sclamp_s, int32_t) 2638 DO_CLAMP(gvec_sclamp_d, int64_t) 2639 2640 DO_CLAMP(gvec_uclamp_b, uint8_t) 2641 DO_CLAMP(gvec_uclamp_h, uint16_t) 2642 DO_CLAMP(gvec_uclamp_s, uint32_t) 2643 DO_CLAMP(gvec_uclamp_d, uint64_t) 2644