1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 
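 * Only the even-numbered predicate bits are significant for 16-bit
 * elements, so an index with any odd bit set can never occur; those
 * entries are skipped by the generator below, which is why the table
 * is sparse (designated initializers over indices 0x00..0x55).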
133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 321 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 322 int16_t mm = m[i]; 323 for (j = 0; j < 16 / 2; ++j) { 324 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 325 } 326 } 327 clear_tail(d, opr_sz, simd_maxsz(desc)); 328 } 329 330 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 331 void *vq, uint32_t desc) 332 { 333 intptr_t i, j, opr_sz = simd_oprsz(desc); 334 int idx = simd_data(desc); 335 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 336 337 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 338 int16_t mm = m[i]; 339 for (j = 0; j < 16 / 2; ++j) { 340 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 341 } 342 } 343 clear_tail(d, opr_sz, simd_maxsz(desc)); 344 } 345 346 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 347 void *va, uint32_t desc) 348 { 349 intptr_t i, opr_sz = simd_oprsz(desc); 350 int16_t *d = vd, *n = vn, *m = vm, *a = va; 351 uint32_t discard; 352 353 for (i = 0; i < opr_sz / 2; ++i) { 354 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 355 } 356 } 357 358 void HELPER(sve2_sqrdmlsh_h)(void 
*vd, void *vn, void *vm, 359 void *va, uint32_t desc) 360 { 361 intptr_t i, opr_sz = simd_oprsz(desc); 362 int16_t *d = vd, *n = vn, *m = vm, *a = va; 363 uint32_t discard; 364 365 for (i = 0; i < opr_sz / 2; ++i) { 366 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 367 } 368 } 369 370 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 371 { 372 intptr_t i, opr_sz = simd_oprsz(desc); 373 int16_t *d = vd, *n = vn, *m = vm; 374 uint32_t discard; 375 376 for (i = 0; i < opr_sz / 2; ++i) { 377 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 378 } 379 } 380 381 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 382 { 383 intptr_t i, opr_sz = simd_oprsz(desc); 384 int16_t *d = vd, *n = vn, *m = vm; 385 uint32_t discard; 386 387 for (i = 0; i < opr_sz / 2; ++i) { 388 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 389 } 390 } 391 392 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 393 { 394 intptr_t i, j, opr_sz = simd_oprsz(desc); 395 int idx = simd_data(desc); 396 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 397 uint32_t discard; 398 399 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 400 int16_t mm = m[i]; 401 for (j = 0; j < 16 / 2; ++j) { 402 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 403 } 404 } 405 } 406 407 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 408 { 409 intptr_t i, j, opr_sz = simd_oprsz(desc); 410 int idx = simd_data(desc); 411 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 412 uint32_t discard; 413 414 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 415 int16_t mm = m[i]; 416 for (j = 0; j < 16 / 2; ++j) { 417 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 418 } 419 } 420 } 421 422 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 423 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 424 bool neg, bool round, uint32_t *sat) 425 { 426 /* Simplify similarly to do_sqrdmlah_b above. */ 427 int64_t ret = (int64_t)src1 * src2; 428 if (neg) { 429 ret = -ret; 430 } 431 ret += ((int64_t)src3 << 31) + (round << 30); 432 ret >>= 31; 433 434 if (ret != (int32_t)ret) { 435 *sat = 1; 436 ret = (ret < 0 ? 
INT32_MIN : INT32_MAX); 437 } 438 return ret; 439 } 440 441 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 442 int32_t src2, int32_t src3) 443 { 444 uint32_t *sat = &env->vfp.qc[0]; 445 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 446 } 447 448 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 449 void *vq, uint32_t desc) 450 { 451 uintptr_t opr_sz = simd_oprsz(desc); 452 int32_t *d = vd; 453 int32_t *n = vn; 454 int32_t *m = vm; 455 uintptr_t i; 456 457 for (i = 0; i < opr_sz / 4; ++i) { 458 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 459 } 460 clear_tail(d, opr_sz, simd_maxsz(desc)); 461 } 462 463 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 464 int32_t src2, int32_t src3) 465 { 466 uint32_t *sat = &env->vfp.qc[0]; 467 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 468 } 469 470 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 471 void *vq, uint32_t desc) 472 { 473 uintptr_t opr_sz = simd_oprsz(desc); 474 int32_t *d = vd; 475 int32_t *n = vn; 476 int32_t *m = vm; 477 uintptr_t i; 478 479 for (i = 0; i < opr_sz / 4; ++i) { 480 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 481 } 482 clear_tail(d, opr_sz, simd_maxsz(desc)); 483 } 484 485 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 486 void *vq, uint32_t desc) 487 { 488 intptr_t i, opr_sz = simd_oprsz(desc); 489 int32_t *d = vd, *n = vn, *m = vm; 490 491 for (i = 0; i < opr_sz / 4; ++i) { 492 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 493 } 494 clear_tail(d, opr_sz, simd_maxsz(desc)); 495 } 496 497 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 498 void *vq, uint32_t desc) 499 { 500 intptr_t i, opr_sz = simd_oprsz(desc); 501 int32_t *d = vd, *n = vn, *m = vm; 502 503 for (i = 0; i < opr_sz / 4; ++i) { 504 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 505 } 506 clear_tail(d, opr_sz, simd_maxsz(desc)); 507 } 508 509 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 510 void *vq, uint32_t desc) 511 { 512 intptr_t i, j, opr_sz = simd_oprsz(desc); 513 int idx = simd_data(desc); 514 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 515 516 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 517 int32_t mm = m[i]; 518 for (j = 0; j < 16 / 4; ++j) { 519 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 520 } 521 } 522 clear_tail(d, opr_sz, simd_maxsz(desc)); 523 } 524 525 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 526 void *vq, uint32_t desc) 527 { 528 intptr_t i, j, opr_sz = simd_oprsz(desc); 529 int idx = simd_data(desc); 530 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 531 532 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 533 int32_t mm = m[i]; 534 for (j = 0; j < 16 / 4; ++j) { 535 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 536 } 537 } 538 clear_tail(d, opr_sz, simd_maxsz(desc)); 539 } 540 541 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 542 void *va, uint32_t desc) 543 { 544 intptr_t i, opr_sz = simd_oprsz(desc); 545 int32_t *d = vd, *n = vn, *m = vm, *a = va; 546 uint32_t discard; 547 548 for (i = 0; i < opr_sz / 4; ++i) { 549 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 550 } 551 } 552 553 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 554 void *va, uint32_t desc) 555 { 556 intptr_t i, opr_sz = simd_oprsz(desc); 557 int32_t *d = vd, *n = vn, *m = vm, *a = va; 558 uint32_t discard; 559 560 for (i = 0; i < opr_sz / 4; ++i) { 561 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, 
&discard); 562 } 563 } 564 565 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 566 { 567 intptr_t i, opr_sz = simd_oprsz(desc); 568 int32_t *d = vd, *n = vn, *m = vm; 569 uint32_t discard; 570 571 for (i = 0; i < opr_sz / 4; ++i) { 572 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 573 } 574 } 575 576 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 577 { 578 intptr_t i, opr_sz = simd_oprsz(desc); 579 int32_t *d = vd, *n = vn, *m = vm; 580 uint32_t discard; 581 582 for (i = 0; i < opr_sz / 4; ++i) { 583 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 584 } 585 } 586 587 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 588 { 589 intptr_t i, j, opr_sz = simd_oprsz(desc); 590 int idx = simd_data(desc); 591 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 592 uint32_t discard; 593 594 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 595 int32_t mm = m[i]; 596 for (j = 0; j < 16 / 4; ++j) { 597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 598 } 599 } 600 } 601 602 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 603 { 604 intptr_t i, j, opr_sz = simd_oprsz(desc); 605 int idx = simd_data(desc); 606 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 607 uint32_t discard; 608 609 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 610 int32_t mm = m[i]; 611 for (j = 0; j < 16 / 4; ++j) { 612 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 613 } 614 } 615 } 616 617 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 618 static int64_t do_sat128_d(Int128 r) 619 { 620 int64_t ls = int128_getlo(r); 621 int64_t hs = int128_gethi(r); 622 623 if (unlikely(hs != (ls >> 63))) { 624 return hs < 0 ? INT64_MIN : INT64_MAX; 625 } 626 return ls; 627 } 628 629 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 630 { 631 uint64_t l, h; 632 Int128 r, t; 633 634 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. 
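     * The steps mirror the byte version: form the full 128-bit product,
     * optionally negate it, add the sign-extended accumulator shifted
     * left by 63 plus the rounding constant (1 << 62), shift right by 63,
     * then saturate the 128-bit result back to 64 bits.  For example,
     * multiplying INT64_MIN by itself doubles to 2^127, which is not
     * representable, so do_sat128_d() clamps the result to INT64_MAX.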
*/ 635 muls64(&l, &h, m, n); 636 r = int128_make128(l, h); 637 if (neg) { 638 r = int128_neg(r); 639 } 640 if (a) { 641 t = int128_exts64(a); 642 t = int128_lshift(t, 63); 643 r = int128_add(r, t); 644 } 645 if (round) { 646 t = int128_exts64(1ll << 62); 647 r = int128_add(r, t); 648 } 649 r = int128_rshift(r, 63); 650 651 return do_sat128_d(r); 652 } 653 654 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 655 void *va, uint32_t desc) 656 { 657 intptr_t i, opr_sz = simd_oprsz(desc); 658 int64_t *d = vd, *n = vn, *m = vm, *a = va; 659 660 for (i = 0; i < opr_sz / 8; ++i) { 661 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 662 } 663 } 664 665 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 666 void *va, uint32_t desc) 667 { 668 intptr_t i, opr_sz = simd_oprsz(desc); 669 int64_t *d = vd, *n = vn, *m = vm, *a = va; 670 671 for (i = 0; i < opr_sz / 8; ++i) { 672 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 673 } 674 } 675 676 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 677 { 678 intptr_t i, opr_sz = simd_oprsz(desc); 679 int64_t *d = vd, *n = vn, *m = vm; 680 681 for (i = 0; i < opr_sz / 8; ++i) { 682 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 683 } 684 } 685 686 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 687 { 688 intptr_t i, opr_sz = simd_oprsz(desc); 689 int64_t *d = vd, *n = vn, *m = vm; 690 691 for (i = 0; i < opr_sz / 8; ++i) { 692 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 693 } 694 } 695 696 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 697 { 698 intptr_t i, j, opr_sz = simd_oprsz(desc); 699 int idx = simd_data(desc); 700 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 701 702 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 703 int64_t mm = m[i]; 704 for (j = 0; j < 16 / 8; ++j) { 705 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 706 } 707 } 708 } 709 710 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 711 { 712 intptr_t i, j, opr_sz = simd_oprsz(desc); 713 int idx = simd_data(desc); 714 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 715 716 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 717 int64_t mm = m[i]; 718 for (j = 0; j < 16 / 8; ++j) { 719 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 720 } 721 } 722 } 723 724 /* Integer 8 and 16-bit dot-product. 725 * 726 * Note that for the loops herein, host endianness does not matter 727 * with respect to the ordering of data within the quad-width lanes. 728 * All elements are treated equally, no matter where they are. 
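 *
 * Each wide destination lane i accumulates the existing a[i] plus the
 * sum of four products of consecutive narrow elements:
 *     d[i] = a[i] + n[4i+0]*m[4i+0] + ... + n[4i+3]*m[4i+3]
 * e.g. gvec_sdot_b sums four int8_t * int8_t products into each int32_t.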
729 */ 730 731 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 732 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 733 { \ 734 intptr_t i, opr_sz = simd_oprsz(desc); \ 735 TYPED *d = vd, *a = va; \ 736 TYPEN *n = vn; \ 737 TYPEM *m = vm; \ 738 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 739 d[i] = (a[i] + \ 740 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 741 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 742 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 743 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 744 } \ 745 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 746 } 747 748 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 749 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 750 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 751 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 752 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 753 754 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 755 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 756 { \ 757 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 758 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 759 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 760 intptr_t index = simd_data(desc); \ 761 TYPED *d = vd, *a = va; \ 762 TYPEN *n = vn; \ 763 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 764 do { \ 765 TYPED m0 = m_indexed[i * 4 + 0]; \ 766 TYPED m1 = m_indexed[i * 4 + 1]; \ 767 TYPED m2 = m_indexed[i * 4 + 2]; \ 768 TYPED m3 = m_indexed[i * 4 + 3]; \ 769 do { \ 770 d[i] = (a[i] + \ 771 n[i * 4 + 0] * m0 + \ 772 n[i * 4 + 1] * m1 + \ 773 n[i * 4 + 2] * m2 + \ 774 n[i * 4 + 3] * m3); \ 775 } while (++i < segend); \ 776 segend = i + 4; \ 777 } while (i < opr_sz_n); \ 778 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 779 } 780 781 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 782 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 783 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 784 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 785 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 786 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 787 788 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 789 void *vfpst, uint32_t desc) 790 { 791 uintptr_t opr_sz = simd_oprsz(desc); 792 float16 *d = vd; 793 float16 *n = vn; 794 float16 *m = vm; 795 float_status *fpst = vfpst; 796 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 797 uint32_t neg_imag = neg_real ^ 1; 798 uintptr_t i; 799 800 /* Shift boolean to the sign bit so we can xor to negate. */ 801 neg_real <<= 15; 802 neg_imag <<= 15; 803 804 for (i = 0; i < opr_sz / 2; i += 2) { 805 float16 e0 = n[H2(i)]; 806 float16 e1 = m[H2(i + 1)] ^ neg_imag; 807 float16 e2 = n[H2(i + 1)]; 808 float16 e3 = m[H2(i)] ^ neg_real; 809 810 d[H2(i)] = float16_add(e0, e1, fpst); 811 d[H2(i + 1)] = float16_add(e2, e3, fpst); 812 } 813 clear_tail(d, opr_sz, simd_maxsz(desc)); 814 } 815 816 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 817 void *vfpst, uint32_t desc) 818 { 819 uintptr_t opr_sz = simd_oprsz(desc); 820 float32 *d = vd; 821 float32 *n = vn; 822 float32 *m = vm; 823 float_status *fpst = vfpst; 824 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 825 uint32_t neg_imag = neg_real ^ 1; 826 uintptr_t i; 827 828 /* Shift boolean to the sign bit so we can xor to negate. 
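     * Flipping only the sign bit negates an IEEE value without raising
     * any exception or disturbing a NaN payload.  One element of each
     * pair gets the negated addend, giving the two FCADD rotations:
     *     (d.re, d.im) = (n.re - m.im, n.im + m.re)   or
     *     (d.re, d.im) = (n.re + m.im, n.im - m.re)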
*/ 829 neg_real <<= 31; 830 neg_imag <<= 31; 831 832 for (i = 0; i < opr_sz / 4; i += 2) { 833 float32 e0 = n[H4(i)]; 834 float32 e1 = m[H4(i + 1)] ^ neg_imag; 835 float32 e2 = n[H4(i + 1)]; 836 float32 e3 = m[H4(i)] ^ neg_real; 837 838 d[H4(i)] = float32_add(e0, e1, fpst); 839 d[H4(i + 1)] = float32_add(e2, e3, fpst); 840 } 841 clear_tail(d, opr_sz, simd_maxsz(desc)); 842 } 843 844 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 845 void *vfpst, uint32_t desc) 846 { 847 uintptr_t opr_sz = simd_oprsz(desc); 848 float64 *d = vd; 849 float64 *n = vn; 850 float64 *m = vm; 851 float_status *fpst = vfpst; 852 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 853 uint64_t neg_imag = neg_real ^ 1; 854 uintptr_t i; 855 856 /* Shift boolean to the sign bit so we can xor to negate. */ 857 neg_real <<= 63; 858 neg_imag <<= 63; 859 860 for (i = 0; i < opr_sz / 8; i += 2) { 861 float64 e0 = n[i]; 862 float64 e1 = m[i + 1] ^ neg_imag; 863 float64 e2 = n[i + 1]; 864 float64 e3 = m[i] ^ neg_real; 865 866 d[i] = float64_add(e0, e1, fpst); 867 d[i + 1] = float64_add(e2, e3, fpst); 868 } 869 clear_tail(d, opr_sz, simd_maxsz(desc)); 870 } 871 872 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 873 void *vfpst, uint32_t desc) 874 { 875 uintptr_t opr_sz = simd_oprsz(desc); 876 float16 *d = vd, *n = vn, *m = vm, *a = va; 877 float_status *fpst = vfpst; 878 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 879 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 880 uint32_t neg_real = flip ^ neg_imag; 881 uintptr_t i; 882 883 /* Shift boolean to the sign bit so we can xor to negate. */ 884 neg_real <<= 15; 885 neg_imag <<= 15; 886 887 for (i = 0; i < opr_sz / 2; i += 2) { 888 float16 e2 = n[H2(i + flip)]; 889 float16 e1 = m[H2(i + flip)] ^ neg_real; 890 float16 e4 = e2; 891 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 892 893 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 894 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 895 } 896 clear_tail(d, opr_sz, simd_maxsz(desc)); 897 } 898 899 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 900 void *vfpst, uint32_t desc) 901 { 902 uintptr_t opr_sz = simd_oprsz(desc); 903 float16 *d = vd, *n = vn, *m = vm, *a = va; 904 float_status *fpst = vfpst; 905 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 906 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 907 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 908 uint32_t neg_real = flip ^ neg_imag; 909 intptr_t elements = opr_sz / sizeof(float16); 910 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 911 intptr_t i, j; 912 913 /* Shift boolean to the sign bit so we can xor to negate. */ 914 neg_real <<= 15; 915 neg_imag <<= 15; 916 917 for (i = 0; i < elements; i += eltspersegment) { 918 float16 mr = m[H2(i + 2 * index + 0)]; 919 float16 mi = m[H2(i + 2 * index + 1)]; 920 float16 e1 = neg_real ^ (flip ? mi : mr); 921 float16 e3 = neg_imag ^ (flip ? 
mr : mi); 922 923 for (j = i; j < i + eltspersegment; j += 2) { 924 float16 e2 = n[H2(j + flip)]; 925 float16 e4 = e2; 926 927 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 928 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 929 } 930 } 931 clear_tail(d, opr_sz, simd_maxsz(desc)); 932 } 933 934 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 935 void *vfpst, uint32_t desc) 936 { 937 uintptr_t opr_sz = simd_oprsz(desc); 938 float32 *d = vd, *n = vn, *m = vm, *a = va; 939 float_status *fpst = vfpst; 940 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 941 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 942 uint32_t neg_real = flip ^ neg_imag; 943 uintptr_t i; 944 945 /* Shift boolean to the sign bit so we can xor to negate. */ 946 neg_real <<= 31; 947 neg_imag <<= 31; 948 949 for (i = 0; i < opr_sz / 4; i += 2) { 950 float32 e2 = n[H4(i + flip)]; 951 float32 e1 = m[H4(i + flip)] ^ neg_real; 952 float32 e4 = e2; 953 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 954 955 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 956 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 957 } 958 clear_tail(d, opr_sz, simd_maxsz(desc)); 959 } 960 961 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 962 void *vfpst, uint32_t desc) 963 { 964 uintptr_t opr_sz = simd_oprsz(desc); 965 float32 *d = vd, *n = vn, *m = vm, *a = va; 966 float_status *fpst = vfpst; 967 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 968 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 969 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 970 uint32_t neg_real = flip ^ neg_imag; 971 intptr_t elements = opr_sz / sizeof(float32); 972 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 973 intptr_t i, j; 974 975 /* Shift boolean to the sign bit so we can xor to negate. */ 976 neg_real <<= 31; 977 neg_imag <<= 31; 978 979 for (i = 0; i < elements; i += eltspersegment) { 980 float32 mr = m[H4(i + 2 * index + 0)]; 981 float32 mi = m[H4(i + 2 * index + 1)]; 982 float32 e1 = neg_real ^ (flip ? mi : mr); 983 float32 e3 = neg_imag ^ (flip ? mr : mi); 984 985 for (j = i; j < i + eltspersegment; j += 2) { 986 float32 e2 = n[H4(j + flip)]; 987 float32 e4 = e2; 988 989 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 990 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 991 } 992 } 993 clear_tail(d, opr_sz, simd_maxsz(desc)); 994 } 995 996 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 997 void *vfpst, uint32_t desc) 998 { 999 uintptr_t opr_sz = simd_oprsz(desc); 1000 float64 *d = vd, *n = vn, *m = vm, *a = va; 1001 float_status *fpst = vfpst; 1002 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1003 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1004 uint64_t neg_real = flip ^ neg_imag; 1005 uintptr_t i; 1006 1007 /* Shift boolean to the sign bit so we can xor to negate. */ 1008 neg_real <<= 63; 1009 neg_imag <<= 63; 1010 1011 for (i = 0; i < opr_sz / 8; i += 2) { 1012 float64 e2 = n[i + flip]; 1013 float64 e1 = m[i + flip] ^ neg_real; 1014 float64 e4 = e2; 1015 float64 e3 = m[i + 1 - flip] ^ neg_imag; 1016 1017 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 1018 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 1019 } 1020 clear_tail(d, opr_sz, simd_maxsz(desc)); 1021 } 1022 1023 /* 1024 * Floating point comparisons producing an integer result (all 1s or all 0s). 1025 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 
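 * GE and GT are therefore implemented with the signaling le/lt
 * comparisons, with the operands swapped (a >= b  <=>  b <= a).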
1026 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1027 */ 1028 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1029 { 1030 return -float16_eq_quiet(op1, op2, stat); 1031 } 1032 1033 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1034 { 1035 return -float32_eq_quiet(op1, op2, stat); 1036 } 1037 1038 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1039 { 1040 return -float64_eq_quiet(op1, op2, stat); 1041 } 1042 1043 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1044 { 1045 return -float16_le(op2, op1, stat); 1046 } 1047 1048 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1049 { 1050 return -float32_le(op2, op1, stat); 1051 } 1052 1053 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1054 { 1055 return -float64_le(op2, op1, stat); 1056 } 1057 1058 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1059 { 1060 return -float16_lt(op2, op1, stat); 1061 } 1062 1063 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1064 { 1065 return -float32_lt(op2, op1, stat); 1066 } 1067 1068 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1069 { 1070 return -float64_lt(op2, op1, stat); 1071 } 1072 1073 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1074 { 1075 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1076 } 1077 1078 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1079 { 1080 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1081 } 1082 1083 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1084 { 1085 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1086 } 1087 1088 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1089 { 1090 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1091 } 1092 1093 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1094 { 1095 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1096 } 1097 1098 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1099 { 1100 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1101 } 1102 1103 static int16_t vfp_tosszh(float16 x, void *fpstp) 1104 { 1105 float_status *fpst = fpstp; 1106 if (float16_is_any_nan(x)) { 1107 float_raise(float_flag_invalid, fpst); 1108 return 0; 1109 } 1110 return float16_to_int16_round_to_zero(x, fpst); 1111 } 1112 1113 static uint16_t vfp_touszh(float16 x, void *fpstp) 1114 { 1115 float_status *fpst = fpstp; 1116 if (float16_is_any_nan(x)) { 1117 float_raise(float_flag_invalid, fpst); 1118 return 0; 1119 } 1120 return float16_to_uint16_round_to_zero(x, fpst); 1121 } 1122 1123 #define DO_2OP(NAME, FUNC, TYPE) \ 1124 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1125 { \ 1126 intptr_t i, oprsz = simd_oprsz(desc); \ 1127 TYPE *d = vd, *n = vn; \ 1128 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1129 d[i] = FUNC(n[i], stat); \ 1130 } \ 1131 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1132 } 1133 1134 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1135 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1136 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1137 1138 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1139 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1140 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, 
float64) 1141 1142 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1143 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1144 1145 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1146 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1147 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1148 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1149 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1150 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1151 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1152 DO_2OP(gvec_touszh, vfp_touszh, float16) 1153 1154 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1155 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1156 { \ 1157 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1158 } 1159 1160 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1161 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1162 { \ 1163 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1164 } 1165 1166 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1167 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1168 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1169 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1170 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1171 1172 DO_2OP_CMP0(cgt, cgt, FWD) 1173 DO_2OP_CMP0(cge, cge, FWD) 1174 DO_2OP_CMP0(ceq, ceq, FWD) 1175 DO_2OP_CMP0(clt, cgt, REV) 1176 DO_2OP_CMP0(cle, cge, REV) 1177 1178 #undef DO_2OP 1179 #undef DO_2OP_CMP0 1180 1181 /* Floating-point trigonometric starting value. 1182 * See the ARM ARM pseudocode function FPTrigSMul. 1183 */ 1184 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1185 { 1186 float16 result = float16_mul(op1, op1, stat); 1187 if (!float16_is_any_nan(result)) { 1188 result = float16_set_sign(result, op2 & 1); 1189 } 1190 return result; 1191 } 1192 1193 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1194 { 1195 float32 result = float32_mul(op1, op1, stat); 1196 if (!float32_is_any_nan(result)) { 1197 result = float32_set_sign(result, op2 & 1); 1198 } 1199 return result; 1200 } 1201 1202 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1203 { 1204 float64 result = float64_mul(op1, op1, stat); 1205 if (!float64_is_any_nan(result)) { 1206 result = float64_set_sign(result, op2 & 1); 1207 } 1208 return result; 1209 } 1210 1211 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1212 { 1213 return float16_abs(float16_sub(op1, op2, stat)); 1214 } 1215 1216 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1217 { 1218 return float32_abs(float32_sub(op1, op2, stat)); 1219 } 1220 1221 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1222 { 1223 return float64_abs(float64_sub(op1, op2, stat)); 1224 } 1225 1226 /* 1227 * Reciprocal step. These are the AArch32 version which uses a 1228 * non-fused multiply-and-subtract. 
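 * The step itself is 2.0 - (op1 * op2), with the ARM-mandated special
 * case that infinity * zero (in either order) returns exactly 2.0
 * rather than propagating the default NaN.  The rsqrts variants below
 * compute (3.0 - op1 * op2) / 2.0, returning 1.5 for the same special case.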
1229 */ 1230 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1231 { 1232 op1 = float16_squash_input_denormal(op1, stat); 1233 op2 = float16_squash_input_denormal(op2, stat); 1234 1235 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1236 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1237 return float16_two; 1238 } 1239 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1240 } 1241 1242 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1243 { 1244 op1 = float32_squash_input_denormal(op1, stat); 1245 op2 = float32_squash_input_denormal(op2, stat); 1246 1247 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1248 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1249 return float32_two; 1250 } 1251 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1252 } 1253 1254 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1255 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1256 { 1257 op1 = float16_squash_input_denormal(op1, stat); 1258 op2 = float16_squash_input_denormal(op2, stat); 1259 1260 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1261 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1262 return float16_one_point_five; 1263 } 1264 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1265 return float16_div(op1, float16_two, stat); 1266 } 1267 1268 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1269 { 1270 op1 = float32_squash_input_denormal(op1, stat); 1271 op2 = float32_squash_input_denormal(op2, stat); 1272 1273 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1274 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1275 return float32_one_point_five; 1276 } 1277 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1278 return float32_div(op1, float32_two, stat); 1279 } 1280 1281 #define DO_3OP(NAME, FUNC, TYPE) \ 1282 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1283 { \ 1284 intptr_t i, oprsz = simd_oprsz(desc); \ 1285 TYPE *d = vd, *n = vn, *m = vm; \ 1286 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1287 d[i] = FUNC(n[i], m[i], stat); \ 1288 } \ 1289 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1290 } 1291 1292 DO_3OP(gvec_fadd_h, float16_add, float16) 1293 DO_3OP(gvec_fadd_s, float32_add, float32) 1294 DO_3OP(gvec_fadd_d, float64_add, float64) 1295 1296 DO_3OP(gvec_fsub_h, float16_sub, float16) 1297 DO_3OP(gvec_fsub_s, float32_sub, float32) 1298 DO_3OP(gvec_fsub_d, float64_sub, float64) 1299 1300 DO_3OP(gvec_fmul_h, float16_mul, float16) 1301 DO_3OP(gvec_fmul_s, float32_mul, float32) 1302 DO_3OP(gvec_fmul_d, float64_mul, float64) 1303 1304 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1305 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1306 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1307 1308 DO_3OP(gvec_fabd_h, float16_abd, float16) 1309 DO_3OP(gvec_fabd_s, float32_abd, float32) 1310 DO_3OP(gvec_fabd_d, float64_abd, float64) 1311 1312 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1313 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1314 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1315 1316 DO_3OP(gvec_fcge_h, float16_cge, float16) 1317 DO_3OP(gvec_fcge_s, float32_cge, float32) 1318 DO_3OP(gvec_fcge_d, float64_cge, float64) 1319 1320 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1321 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1322 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1323 1324 DO_3OP(gvec_facge_h, 
float16_acge, float16) 1325 DO_3OP(gvec_facge_s, float32_acge, float32) 1326 DO_3OP(gvec_facge_d, float64_acge, float64) 1327 1328 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1329 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1330 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1331 1332 DO_3OP(gvec_fmax_h, float16_max, float16) 1333 DO_3OP(gvec_fmax_s, float32_max, float32) 1334 DO_3OP(gvec_fmax_d, float64_max, float64) 1335 1336 DO_3OP(gvec_fmin_h, float16_min, float16) 1337 DO_3OP(gvec_fmin_s, float32_min, float32) 1338 DO_3OP(gvec_fmin_d, float64_min, float64) 1339 1340 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1341 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1342 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1343 1344 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1345 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1346 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1347 1348 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1349 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1350 1351 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1352 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1353 1354 #ifdef TARGET_AARCH64 1355 DO_3OP(gvec_fdiv_h, float16_div, float16) 1356 DO_3OP(gvec_fdiv_s, float32_div, float32) 1357 DO_3OP(gvec_fdiv_d, float64_div, float64) 1358 1359 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1360 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1361 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1362 1363 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1364 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1365 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1366 1367 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1368 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1369 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1370 1371 #endif 1372 #undef DO_3OP 1373 1374 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1375 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1376 float_status *stat) 1377 { 1378 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1379 } 1380 1381 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1382 float_status *stat) 1383 { 1384 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1385 } 1386 1387 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1388 float_status *stat) 1389 { 1390 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1391 } 1392 1393 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1394 float_status *stat) 1395 { 1396 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1397 } 1398 1399 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1400 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1401 float_status *stat) 1402 { 1403 return float16_muladd(op1, op2, dest, 0, stat); 1404 } 1405 1406 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1407 float_status *stat) 1408 { 1409 return float32_muladd(op1, op2, dest, 0, stat); 1410 } 1411 1412 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1413 float_status *stat) 1414 { 1415 return float64_muladd(op1, op2, dest, 0, stat); 1416 } 1417 1418 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1419 float_status *stat) 1420 { 1421 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1422 } 1423 1424 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1425 
float_status *stat) 1426 { 1427 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1428 } 1429 1430 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1431 float_status *stat) 1432 { 1433 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1434 } 1435 1436 #define DO_MULADD(NAME, FUNC, TYPE) \ 1437 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1438 { \ 1439 intptr_t i, oprsz = simd_oprsz(desc); \ 1440 TYPE *d = vd, *n = vn, *m = vm; \ 1441 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1442 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1443 } \ 1444 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1445 } 1446 1447 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1448 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1449 1450 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1451 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1452 1453 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1454 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1455 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1456 1457 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1458 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1459 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1460 1461 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1462 * For AdvSIMD, there is of course only one such vector segment. 1463 */ 1464 1465 #define DO_MUL_IDX(NAME, TYPE, H) \ 1466 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1467 { \ 1468 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1469 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1470 intptr_t idx = simd_data(desc); \ 1471 TYPE *d = vd, *n = vn, *m = vm; \ 1472 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1473 TYPE mm = m[H(i + idx)]; \ 1474 for (j = 0; j < segment; j++) { \ 1475 d[i + j] = n[i + j] * mm; \ 1476 } \ 1477 } \ 1478 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1479 } 1480 1481 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1482 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1483 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1484 1485 #undef DO_MUL_IDX 1486 1487 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1488 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1489 { \ 1490 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1491 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1492 intptr_t idx = simd_data(desc); \ 1493 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1494 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1495 TYPE mm = m[H(i + idx)]; \ 1496 for (j = 0; j < segment; j++) { \ 1497 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1498 } \ 1499 } \ 1500 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1501 } 1502 1503 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1504 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1505 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1506 1507 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1508 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1509 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1510 1511 #undef DO_MLA_IDX 1512 1513 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1514 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1515 { \ 1516 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1517 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1518 intptr_t idx = simd_data(desc); \ 1519 TYPE *d = vd, *n = vn, *m = vm; \ 1520 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1521 TYPE mm = m[H(i + idx)]; \ 1522 for (j = 0; j < segment; j++) { \ 1523 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), 
stat); \ 1524 } \ 1525 } \ 1526 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1527 } 1528 1529 #define nop(N, M, S) (M) 1530 1531 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1532 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1533 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1534 1535 #ifdef TARGET_AARCH64 1536 1537 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1538 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1539 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1540 1541 #endif 1542 1543 #undef nop 1544 1545 /* 1546 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1547 * the fused ops below they assume accumulate both from and into Vd. 1548 */ 1549 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1550 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1551 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1552 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1553 1554 #undef DO_FMUL_IDX 1555 1556 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1557 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1558 void *stat, uint32_t desc) \ 1559 { \ 1560 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1561 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1562 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1563 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1564 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1565 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1566 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1567 TYPE mm = m[H(i + idx)]; \ 1568 for (j = 0; j < segment; j++) { \ 1569 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1570 mm, a[i + j], 0, stat); \ 1571 } \ 1572 } \ 1573 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1574 } 1575 1576 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1577 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1578 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1579 1580 #undef DO_FMLA_IDX 1581 1582 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1583 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1584 { \ 1585 intptr_t i, oprsz = simd_oprsz(desc); \ 1586 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1587 bool q = false; \ 1588 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1589 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1590 if (dd < MIN) { \ 1591 dd = MIN; \ 1592 q = true; \ 1593 } else if (dd > MAX) { \ 1594 dd = MAX; \ 1595 q = true; \ 1596 } \ 1597 d[i] = dd; \ 1598 } \ 1599 if (q) { \ 1600 uint32_t *qc = vq; \ 1601 qc[0] = 1; \ 1602 } \ 1603 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1604 } 1605 1606 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1607 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1608 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1609 1610 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1611 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1612 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1613 1614 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1615 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1616 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1617 1618 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1619 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1620 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, 
INT32_MAX) 1621 1622 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1623 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1624 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1625 1626 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1627 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1628 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1629 1630 #undef DO_SAT 1631 1632 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1633 void *vm, uint32_t desc) 1634 { 1635 intptr_t i, oprsz = simd_oprsz(desc); 1636 uint64_t *d = vd, *n = vn, *m = vm; 1637 bool q = false; 1638 1639 for (i = 0; i < oprsz / 8; i++) { 1640 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1641 if (dd < nn) { 1642 dd = UINT64_MAX; 1643 q = true; 1644 } 1645 d[i] = dd; 1646 } 1647 if (q) { 1648 uint32_t *qc = vq; 1649 qc[0] = 1; 1650 } 1651 clear_tail(d, oprsz, simd_maxsz(desc)); 1652 } 1653 1654 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1655 void *vm, uint32_t desc) 1656 { 1657 intptr_t i, oprsz = simd_oprsz(desc); 1658 uint64_t *d = vd, *n = vn, *m = vm; 1659 bool q = false; 1660 1661 for (i = 0; i < oprsz / 8; i++) { 1662 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1663 if (nn < mm) { 1664 dd = 0; 1665 q = true; 1666 } 1667 d[i] = dd; 1668 } 1669 if (q) { 1670 uint32_t *qc = vq; 1671 qc[0] = 1; 1672 } 1673 clear_tail(d, oprsz, simd_maxsz(desc)); 1674 } 1675 1676 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1677 void *vm, uint32_t desc) 1678 { 1679 intptr_t i, oprsz = simd_oprsz(desc); 1680 int64_t *d = vd, *n = vn, *m = vm; 1681 bool q = false; 1682 1683 for (i = 0; i < oprsz / 8; i++) { 1684 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1685 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1686 dd = (nn >> 63) ^ ~INT64_MIN; 1687 q = true; 1688 } 1689 d[i] = dd; 1690 } 1691 if (q) { 1692 uint32_t *qc = vq; 1693 qc[0] = 1; 1694 } 1695 clear_tail(d, oprsz, simd_maxsz(desc)); 1696 } 1697 1698 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1699 void *vm, uint32_t desc) 1700 { 1701 intptr_t i, oprsz = simd_oprsz(desc); 1702 int64_t *d = vd, *n = vn, *m = vm; 1703 bool q = false; 1704 1705 for (i = 0; i < oprsz / 8; i++) { 1706 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1707 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1708 dd = (nn >> 63) ^ ~INT64_MIN; 1709 q = true; 1710 } 1711 d[i] = dd; 1712 } 1713 if (q) { 1714 uint32_t *qc = vq; 1715 qc[0] = 1; 1716 } 1717 clear_tail(d, oprsz, simd_maxsz(desc)); 1718 } 1719 1720 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1721 void *vm, uint32_t desc) 1722 { 1723 intptr_t i, oprsz = simd_oprsz(desc); 1724 uint64_t *d = vd, *n = vn, *m = vm; 1725 bool q = false; 1726 1727 for (i = 0; i < oprsz / 8; i++) { 1728 uint64_t nn = n[i]; 1729 int64_t mm = m[i]; 1730 uint64_t dd = nn + mm; 1731 1732 if (mm < 0) { 1733 if (nn < (uint64_t)-mm) { 1734 dd = 0; 1735 q = true; 1736 } 1737 } else { 1738 if (dd < nn) { 1739 dd = UINT64_MAX; 1740 q = true; 1741 } 1742 } 1743 d[i] = dd; 1744 } 1745 if (q) { 1746 uint32_t *qc = vq; 1747 qc[0] = 1; 1748 } 1749 clear_tail(d, oprsz, simd_maxsz(desc)); 1750 } 1751 1752 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1753 void *vm, uint32_t desc) 1754 { 1755 intptr_t i, oprsz = simd_oprsz(desc); 1756 uint64_t *d = vd, *n = vn, *m = vm; 1757 bool q = false; 1758 1759 for (i = 0; i < oprsz / 8; i++) { 1760 int64_t nn = n[i]; 1761 uint64_t mm = m[i]; 1762 int64_t dd = nn + mm; 1763 1764 if (mm > 
(uint64_t)(INT64_MAX - nn)) { 1765 dd = INT64_MAX; 1766 q = true; 1767 } 1768 d[i] = dd; 1769 } 1770 if (q) { 1771 uint32_t *qc = vq; 1772 qc[0] = 1; 1773 } 1774 clear_tail(d, oprsz, simd_maxsz(desc)); 1775 } 1776 1777 #define DO_SRA(NAME, TYPE) \ 1778 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1779 { \ 1780 intptr_t i, oprsz = simd_oprsz(desc); \ 1781 int shift = simd_data(desc); \ 1782 TYPE *d = vd, *n = vn; \ 1783 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1784 d[i] += n[i] >> shift; \ 1785 } \ 1786 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1787 } 1788 1789 DO_SRA(gvec_ssra_b, int8_t) 1790 DO_SRA(gvec_ssra_h, int16_t) 1791 DO_SRA(gvec_ssra_s, int32_t) 1792 DO_SRA(gvec_ssra_d, int64_t) 1793 1794 DO_SRA(gvec_usra_b, uint8_t) 1795 DO_SRA(gvec_usra_h, uint16_t) 1796 DO_SRA(gvec_usra_s, uint32_t) 1797 DO_SRA(gvec_usra_d, uint64_t) 1798 1799 #undef DO_SRA 1800 1801 #define DO_RSHR(NAME, TYPE) \ 1802 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1803 { \ 1804 intptr_t i, oprsz = simd_oprsz(desc); \ 1805 int shift = simd_data(desc); \ 1806 TYPE *d = vd, *n = vn; \ 1807 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1808 TYPE tmp = n[i] >> (shift - 1); \ 1809 d[i] = (tmp >> 1) + (tmp & 1); \ 1810 } \ 1811 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1812 } 1813 1814 DO_RSHR(gvec_srshr_b, int8_t) 1815 DO_RSHR(gvec_srshr_h, int16_t) 1816 DO_RSHR(gvec_srshr_s, int32_t) 1817 DO_RSHR(gvec_srshr_d, int64_t) 1818 1819 DO_RSHR(gvec_urshr_b, uint8_t) 1820 DO_RSHR(gvec_urshr_h, uint16_t) 1821 DO_RSHR(gvec_urshr_s, uint32_t) 1822 DO_RSHR(gvec_urshr_d, uint64_t) 1823 1824 #undef DO_RSHR 1825 1826 #define DO_RSRA(NAME, TYPE) \ 1827 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1828 { \ 1829 intptr_t i, oprsz = simd_oprsz(desc); \ 1830 int shift = simd_data(desc); \ 1831 TYPE *d = vd, *n = vn; \ 1832 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1833 TYPE tmp = n[i] >> (shift - 1); \ 1834 d[i] += (tmp >> 1) + (tmp & 1); \ 1835 } \ 1836 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1837 } 1838 1839 DO_RSRA(gvec_srsra_b, int8_t) 1840 DO_RSRA(gvec_srsra_h, int16_t) 1841 DO_RSRA(gvec_srsra_s, int32_t) 1842 DO_RSRA(gvec_srsra_d, int64_t) 1843 1844 DO_RSRA(gvec_ursra_b, uint8_t) 1845 DO_RSRA(gvec_ursra_h, uint16_t) 1846 DO_RSRA(gvec_ursra_s, uint32_t) 1847 DO_RSRA(gvec_ursra_d, uint64_t) 1848 1849 #undef DO_RSRA 1850 1851 #define DO_SRI(NAME, TYPE) \ 1852 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1853 { \ 1854 intptr_t i, oprsz = simd_oprsz(desc); \ 1855 int shift = simd_data(desc); \ 1856 TYPE *d = vd, *n = vn; \ 1857 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1858 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1859 } \ 1860 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1861 } 1862 1863 DO_SRI(gvec_sri_b, uint8_t) 1864 DO_SRI(gvec_sri_h, uint16_t) 1865 DO_SRI(gvec_sri_s, uint32_t) 1866 DO_SRI(gvec_sri_d, uint64_t) 1867 1868 #undef DO_SRI 1869 1870 #define DO_SLI(NAME, TYPE) \ 1871 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1872 { \ 1873 intptr_t i, oprsz = simd_oprsz(desc); \ 1874 int shift = simd_data(desc); \ 1875 TYPE *d = vd, *n = vn; \ 1876 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1877 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1878 } \ 1879 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1880 } 1881 1882 DO_SLI(gvec_sli_b, uint8_t) 1883 DO_SLI(gvec_sli_h, uint16_t) 1884 DO_SLI(gvec_sli_s, uint32_t) 1885 DO_SLI(gvec_sli_d, uint64_t) 1886 1887 #undef DO_SLI 1888 1889 /* 1890 * Convert float16 to float32, raising 
no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
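     * (Each float16 sign bit sits at bit 15 of its 16-bit lane, so the
     *  single XOR with 0x8000800080008000 below flips the sign of all
     *  four packed inputs in one integer operation.)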
*/ 1965 if (is_s) { 1966 n_4 ^= 0x8000800080008000ull; 1967 } 1968 1969 for (i = 0; i < oprsz / 4; i++) { 1970 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1971 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1972 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1973 } 1974 clear_tail(d, oprsz, simd_maxsz(desc)); 1975 } 1976 1977 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1978 void *venv, uint32_t desc) 1979 { 1980 CPUARMState *env = venv; 1981 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1982 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1983 } 1984 1985 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1986 void *venv, uint32_t desc) 1987 { 1988 CPUARMState *env = venv; 1989 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1990 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1991 } 1992 1993 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 1994 void *venv, uint32_t desc) 1995 { 1996 intptr_t i, oprsz = simd_oprsz(desc); 1997 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 1998 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 1999 CPUARMState *env = venv; 2000 float_status *status = &env->vfp.fp_status; 2001 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2002 2003 for (i = 0; i < oprsz; i += sizeof(float32)) { 2004 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2005 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2006 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2007 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2008 float32 aa = *(float32 *)(va + H1_4(i)); 2009 2010 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2011 } 2012 } 2013 2014 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2015 uint32_t desc, bool fz16) 2016 { 2017 intptr_t i, oprsz = simd_oprsz(desc); 2018 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2019 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2020 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2021 int is_q = oprsz == 16; 2022 uint64_t n_4; 2023 float32 m_1; 2024 2025 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2026 n_4 = load4_f16(vn, is_q, is_2); 2027 2028 /* Negate all inputs for FMLSL at once. 
*/ 2029 if (is_s) { 2030 n_4 ^= 0x8000800080008000ull; 2031 } 2032 2033 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2034 2035 for (i = 0; i < oprsz / 4; i++) { 2036 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2037 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2038 } 2039 clear_tail(d, oprsz, simd_maxsz(desc)); 2040 } 2041 2042 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2043 void *venv, uint32_t desc) 2044 { 2045 CPUARMState *env = venv; 2046 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2047 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2048 } 2049 2050 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2051 void *venv, uint32_t desc) 2052 { 2053 CPUARMState *env = venv; 2054 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 2055 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2056 } 2057 2058 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2059 void *venv, uint32_t desc) 2060 { 2061 intptr_t i, j, oprsz = simd_oprsz(desc); 2062 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2063 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2064 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2065 CPUARMState *env = venv; 2066 float_status *status = &env->vfp.fp_status; 2067 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2068 2069 for (i = 0; i < oprsz; i += 16) { 2070 float16 mm_16 = *(float16 *)(vm + i + idx); 2071 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2072 2073 for (j = 0; j < 16; j += sizeof(float32)) { 2074 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2075 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2076 float32 aa = *(float32 *)(va + H1_4(i + j)); 2077 2078 *(float32 *)(vd + H1_4(i + j)) = 2079 float32_muladd(nn, mm, aa, 0, status); 2080 } 2081 } 2082 } 2083 2084 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2085 { 2086 intptr_t i, opr_sz = simd_oprsz(desc); 2087 int8_t *d = vd, *n = vn, *m = vm; 2088 2089 for (i = 0; i < opr_sz; ++i) { 2090 int8_t mm = m[i]; 2091 int8_t nn = n[i]; 2092 int8_t res = 0; 2093 if (mm >= 0) { 2094 if (mm < 8) { 2095 res = nn << mm; 2096 } 2097 } else { 2098 res = nn >> (mm > -8 ? -mm : 7); 2099 } 2100 d[i] = res; 2101 } 2102 clear_tail(d, opr_sz, simd_maxsz(desc)); 2103 } 2104 2105 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2106 { 2107 intptr_t i, opr_sz = simd_oprsz(desc); 2108 int16_t *d = vd, *n = vn, *m = vm; 2109 2110 for (i = 0; i < opr_sz / 2; ++i) { 2111 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2112 int16_t nn = n[i]; 2113 int16_t res = 0; 2114 if (mm >= 0) { 2115 if (mm < 16) { 2116 res = nn << mm; 2117 } 2118 } else { 2119 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i]; /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
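 * As with the 8x8 form above, partial products are XORed rather than
 * added (e.g. 0b11 * 0b11 = 0b101, not 0b1001).  Each selected pair
 * n[i + hi], m[i + hi] yields a 128-bit carry-less product, stored
 * across two adjacent output lanes: d[i] (low half) and d[i + 1]
 * (high half).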
2196 */ 2197 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2198 { 2199 intptr_t i, opr_sz = simd_oprsz(desc); 2200 intptr_t hi = simd_data(desc); 2201 uint64_t *d = vd, *n = vn, *m = vm; 2202 2203 for (i = 0; i < opr_sz / 8; i += 2) { 2204 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2205 d[i] = int128_getlo(r); 2206 d[i + 1] = int128_gethi(r); 2207 } 2208 clear_tail(d, opr_sz, simd_maxsz(desc)); 2209 } 2210 2211 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2212 { 2213 int hi = simd_data(desc); 2214 uint64_t *d = vd, *n = vn, *m = vm; 2215 uint64_t nn = n[hi], mm = m[hi]; 2216 2217 d[0] = clmul_8x4_packed(nn, mm); 2218 nn >>= 32; 2219 mm >>= 32; 2220 d[1] = clmul_8x4_packed(nn, mm); 2221 2222 clear_tail(d, 16, simd_maxsz(desc)); 2223 } 2224 2225 #ifdef TARGET_AARCH64 2226 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2227 { 2228 int shift = simd_data(desc) * 8; 2229 intptr_t i, opr_sz = simd_oprsz(desc); 2230 uint64_t *d = vd, *n = vn, *m = vm; 2231 2232 for (i = 0; i < opr_sz / 8; ++i) { 2233 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2234 } 2235 } 2236 2237 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2238 { 2239 intptr_t sel = H4(simd_data(desc)); 2240 intptr_t i, opr_sz = simd_oprsz(desc); 2241 uint32_t *n = vn, *m = vm; 2242 uint64_t *d = vd; 2243 2244 for (i = 0; i < opr_sz / 8; ++i) { 2245 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2246 } 2247 } 2248 #endif 2249 2250 #define DO_CMP0(NAME, TYPE, OP) \ 2251 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2252 { \ 2253 intptr_t i, opr_sz = simd_oprsz(desc); \ 2254 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2255 TYPE nn = *(TYPE *)(vn + i); \ 2256 *(TYPE *)(vd + i) = -(nn OP 0); \ 2257 } \ 2258 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2259 } 2260 2261 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2262 DO_CMP0(gvec_clt0_b, int8_t, <) 2263 DO_CMP0(gvec_cle0_b, int8_t, <=) 2264 DO_CMP0(gvec_cgt0_b, int8_t, >) 2265 DO_CMP0(gvec_cge0_b, int8_t, >=) 2266 2267 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2268 DO_CMP0(gvec_clt0_h, int16_t, <) 2269 DO_CMP0(gvec_cle0_h, int16_t, <=) 2270 DO_CMP0(gvec_cgt0_h, int16_t, >) 2271 DO_CMP0(gvec_cge0_h, int16_t, >=) 2272 2273 #undef DO_CMP0 2274 2275 #define DO_ABD(NAME, TYPE) \ 2276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2277 { \ 2278 intptr_t i, opr_sz = simd_oprsz(desc); \ 2279 TYPE *d = vd, *n = vn, *m = vm; \ 2280 \ 2281 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2282 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2283 } \ 2284 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2285 } 2286 2287 DO_ABD(gvec_sabd_b, int8_t) 2288 DO_ABD(gvec_sabd_h, int16_t) 2289 DO_ABD(gvec_sabd_s, int32_t) 2290 DO_ABD(gvec_sabd_d, int64_t) 2291 2292 DO_ABD(gvec_uabd_b, uint8_t) 2293 DO_ABD(gvec_uabd_h, uint16_t) 2294 DO_ABD(gvec_uabd_s, uint32_t) 2295 DO_ABD(gvec_uabd_d, uint64_t) 2296 2297 #undef DO_ABD 2298 2299 #define DO_ABA(NAME, TYPE) \ 2300 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2301 { \ 2302 intptr_t i, opr_sz = simd_oprsz(desc); \ 2303 TYPE *d = vd, *n = vn, *m = vm; \ 2304 \ 2305 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2306 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2307 } \ 2308 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2309 } 2310 2311 DO_ABA(gvec_saba_b, int8_t) 2312 DO_ABA(gvec_saba_h, int16_t) 2313 DO_ABA(gvec_saba_s, int32_t) 2314 DO_ABA(gvec_saba_d, int64_t) 2315 2316 DO_ABA(gvec_uaba_b, uint8_t) 2317 DO_ABA(gvec_uaba_h, uint16_t) 2318 DO_ABA(gvec_uaba_s, uint32_t) 2319 DO_ABA(gvec_uaba_d, uint64_t) 2320 2321 #undef DO_ABA 2322 2323 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2324 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2325 { \ 2326 ARMVectorReg scratch; \ 2327 intptr_t oprsz = simd_oprsz(desc); \ 2328 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2329 TYPE *d = vd, *n = vn, *m = vm; \ 2330 if (unlikely(d == m)) { \ 2331 m = memcpy(&scratch, m, oprsz); \ 2332 } \ 2333 for (intptr_t i = 0; i < half; ++i) { \ 2334 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2335 } \ 2336 for (intptr_t i = 0; i < half; ++i) { \ 2337 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2338 } \ 2339 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2340 } 2341 2342 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2343 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2344 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2345 2346 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2347 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2348 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2349 2350 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2351 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2352 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2353 2354 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2355 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2356 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2357 2358 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2359 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2360 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2361 2362 #undef DO_3OP_PAIR 2363 2364 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2366 { \ 2367 ARMVectorReg scratch; \ 2368 intptr_t oprsz = simd_oprsz(desc); \ 2369 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2370 TYPE *d = vd, *n = vn, *m = vm; \ 2371 if (unlikely(d == m)) { \ 2372 m = memcpy(&scratch, m, oprsz); \ 2373 } \ 2374 for (intptr_t i = 0; i < half; ++i) { \ 2375 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2376 } \ 2377 for (intptr_t i = 0; i < half; ++i) { \ 2378 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2379 } \ 2380 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2381 } 2382 2383 #define ADD(A, B) (A + B) 2384 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2385 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2386 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2387 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2388 #undef ADD 2389 2390 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2391 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2392 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2393 2394 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2395 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2396 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2397 2398 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2399 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2400 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2401 2402 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2403 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2404 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2405 2406 #undef DO_3OP_PAIR 2407 2408 #define 
DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2409 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2410 { \ 2411 intptr_t i, oprsz = simd_oprsz(desc); \ 2412 int shift = simd_data(desc); \ 2413 TYPE *d = vd, *n = vn; \ 2414 float_status *fpst = stat; \ 2415 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2416 d[i] = FUNC(n[i], shift, fpst); \ 2417 } \ 2418 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2419 } 2420 2421 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2422 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2423 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2424 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2425 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2426 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2427 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2428 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2429 2430 #undef DO_VCVT_FIXED 2431 2432 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2433 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2434 { \ 2435 float_status *fpst = stat; \ 2436 intptr_t i, oprsz = simd_oprsz(desc); \ 2437 uint32_t rmode = simd_data(desc); \ 2438 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2439 TYPE *d = vd, *n = vn; \ 2440 set_float_rounding_mode(rmode, fpst); \ 2441 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2442 d[i] = FUNC(n[i], 0, fpst); \ 2443 } \ 2444 set_float_rounding_mode(prev_rmode, fpst); \ 2445 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2446 } 2447 2448 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2449 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2450 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2451 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2452 2453 #undef DO_VCVT_RMODE 2454 2455 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2456 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2457 { \ 2458 float_status *fpst = stat; \ 2459 intptr_t i, oprsz = simd_oprsz(desc); \ 2460 uint32_t rmode = simd_data(desc); \ 2461 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2462 TYPE *d = vd, *n = vn; \ 2463 set_float_rounding_mode(rmode, fpst); \ 2464 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2465 d[i] = FUNC(n[i], fpst); \ 2466 } \ 2467 set_float_rounding_mode(prev_rmode, fpst); \ 2468 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2469 } 2470 2471 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2472 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2473 2474 #undef DO_VRINT_RMODE 2475 2476 #ifdef TARGET_AARCH64 2477 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2478 { 2479 const uint8_t *indices = vm; 2480 CPUARMState *env = venv; 2481 size_t oprsz = simd_oprsz(desc); 2482 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2483 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2484 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2485 union { 2486 uint8_t b[16]; 2487 uint64_t d[2]; 2488 } result; 2489 2490 /* 2491 * We must construct the final result in a temp, lest the output 2492 * overlaps the input table. For TBL, begin with zero; for TBX, 2493 * begin with the original register contents. Note that we always 2494 * copy 16 bytes here to avoid an extra branch; clearing the high 2495 * bits of the register for oprsz == 8 is handled below. 
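     * (With oprsz == 8 the loop below only fills result.b[0..7]; the
     *  16-byte copy back to vd is still unconditional, and clear_tail()
     *  then zeroes the destination bytes beyond oprsz.)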
2496 */ 2497 if (is_tbx) { 2498 memcpy(&result, vd, 16); 2499 } else { 2500 memset(&result, 0, 16); 2501 } 2502 2503 for (size_t i = 0; i < oprsz; ++i) { 2504 uint32_t index = indices[H1(i)]; 2505 2506 if (index < table_len) { 2507 /* 2508 * Convert index (a byte offset into the virtual table 2509 * which is a series of 128-bit vectors concatenated) 2510 * into the correct register element, bearing in mind 2511 * that the table can wrap around from V31 to V0. 2512 */ 2513 const uint8_t *table = (const uint8_t *) 2514 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2515 result.b[H1(i)] = table[H1(index % 16)]; 2516 } 2517 } 2518 2519 memcpy(vd, &result, 16); 2520 clear_tail(vd, oprsz, simd_maxsz(desc)); 2521 } 2522 #endif 2523 2524 /* 2525 * NxN -> N highpart multiply 2526 * 2527 * TODO: expose this as a generic vector operation. 2528 */ 2529 2530 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2531 { 2532 intptr_t i, opr_sz = simd_oprsz(desc); 2533 int8_t *d = vd, *n = vn, *m = vm; 2534 2535 for (i = 0; i < opr_sz; ++i) { 2536 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2537 } 2538 clear_tail(d, opr_sz, simd_maxsz(desc)); 2539 } 2540 2541 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2542 { 2543 intptr_t i, opr_sz = simd_oprsz(desc); 2544 int16_t *d = vd, *n = vn, *m = vm; 2545 2546 for (i = 0; i < opr_sz / 2; ++i) { 2547 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2548 } 2549 clear_tail(d, opr_sz, simd_maxsz(desc)); 2550 } 2551 2552 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2553 { 2554 intptr_t i, opr_sz = simd_oprsz(desc); 2555 int32_t *d = vd, *n = vn, *m = vm; 2556 2557 for (i = 0; i < opr_sz / 4; ++i) { 2558 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2559 } 2560 clear_tail(d, opr_sz, simd_maxsz(desc)); 2561 } 2562 2563 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2564 { 2565 intptr_t i, opr_sz = simd_oprsz(desc); 2566 uint64_t *d = vd, *n = vn, *m = vm; 2567 uint64_t discard; 2568 2569 for (i = 0; i < opr_sz / 8; ++i) { 2570 muls64(&discard, &d[i], n[i], m[i]); 2571 } 2572 clear_tail(d, opr_sz, simd_maxsz(desc)); 2573 } 2574 2575 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2576 { 2577 intptr_t i, opr_sz = simd_oprsz(desc); 2578 uint8_t *d = vd, *n = vn, *m = vm; 2579 2580 for (i = 0; i < opr_sz; ++i) { 2581 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2582 } 2583 clear_tail(d, opr_sz, simd_maxsz(desc)); 2584 } 2585 2586 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2587 { 2588 intptr_t i, opr_sz = simd_oprsz(desc); 2589 uint16_t *d = vd, *n = vn, *m = vm; 2590 2591 for (i = 0; i < opr_sz / 2; ++i) { 2592 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2593 } 2594 clear_tail(d, opr_sz, simd_maxsz(desc)); 2595 } 2596 2597 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2598 { 2599 intptr_t i, opr_sz = simd_oprsz(desc); 2600 uint32_t *d = vd, *n = vn, *m = vm; 2601 2602 for (i = 0; i < opr_sz / 4; ++i) { 2603 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2604 } 2605 clear_tail(d, opr_sz, simd_maxsz(desc)); 2606 } 2607 2608 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2609 { 2610 intptr_t i, opr_sz = simd_oprsz(desc); 2611 uint64_t *d = vd, *n = vn, *m = vm; 2612 uint64_t discard; 2613 2614 for (i = 0; i < opr_sz / 8; ++i) { 2615 mulu64(&discard, &d[i], n[i], m[i]); 2616 } 2617 clear_tail(d, opr_sz, simd_maxsz(desc)); 2618 } 2619 2620 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2621 { 2622 intptr_t i, opr_sz = 
simd_oprsz(desc) / 8; 2623 int shr = simd_data(desc); 2624 uint64_t *d = vd, *n = vn, *m = vm; 2625 2626 for (i = 0; i < opr_sz; ++i) { 2627 d[i] = ror64(n[i] ^ m[i], shr); 2628 } 2629 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2630 } 2631 2632 /* 2633 * Integer matrix-multiply accumulate 2634 */ 2635 2636 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2637 { 2638 int8_t *n = vn, *m = vm; 2639 2640 for (intptr_t k = 0; k < 8; ++k) { 2641 sum += n[H1(k)] * m[H1(k)]; 2642 } 2643 return sum; 2644 } 2645 2646 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2647 { 2648 uint8_t *n = vn, *m = vm; 2649 2650 for (intptr_t k = 0; k < 8; ++k) { 2651 sum += n[H1(k)] * m[H1(k)]; 2652 } 2653 return sum; 2654 } 2655 2656 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2657 { 2658 uint8_t *n = vn; 2659 int8_t *m = vm; 2660 2661 for (intptr_t k = 0; k < 8; ++k) { 2662 sum += n[H1(k)] * m[H1(k)]; 2663 } 2664 return sum; 2665 } 2666 2667 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2668 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2669 { 2670 intptr_t seg, opr_sz = simd_oprsz(desc); 2671 2672 for (seg = 0; seg < opr_sz; seg += 16) { 2673 uint32_t *d = vd + seg; 2674 uint32_t *a = va + seg; 2675 uint32_t sum0, sum1, sum2, sum3; 2676 2677 /* 2678 * Process the entire segment at once, writing back the 2679 * results only after we've consumed all of the inputs. 2680 * 2681 * Key to indices by column: 2682 * i j i j 2683 */ 2684 sum0 = a[H4(0 + 0)]; 2685 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2686 sum1 = a[H4(0 + 1)]; 2687 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2688 sum2 = a[H4(2 + 0)]; 2689 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2690 sum3 = a[H4(2 + 1)]; 2691 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2692 2693 d[H4(0)] = sum0; 2694 d[H4(1)] = sum1; 2695 d[H4(2)] = sum2; 2696 d[H4(3)] = sum3; 2697 } 2698 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2699 } 2700 2701 #define DO_MMLA_B(NAME, INNER) \ 2702 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2703 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2704 2705 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2706 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2707 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2708 2709 /* 2710 * BFloat16 Dot Product 2711 */ 2712 2713 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2714 { 2715 /* FPCR is ignored for BFDOT and BFMMLA. */ 2716 float_status bf_status = { 2717 .tininess_before_rounding = float_tininess_before_rounding, 2718 .float_rounding_mode = float_round_to_odd_inf, 2719 .flush_to_zero = true, 2720 .flush_inputs_to_zero = true, 2721 .default_nan_mode = true, 2722 }; 2723 float32 t1, t2; 2724 2725 /* 2726 * Extract each BFloat16 from the element pair, and shift 2727 * them such that they become float32. 
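     * (A bfloat16 value is the top 16 bits of the corresponding
     *  float32, so e1 << 16 converts the low element while
     *  e1 & 0xffff0000u reuses the high element in place; likewise
     *  for e2.)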
2728 */ 2729 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2730 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2731 t1 = float32_add(t1, t2, &bf_status); 2732 t1 = float32_add(sum, t1, &bf_status); 2733 2734 return t1; 2735 } 2736 2737 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2738 { 2739 intptr_t i, opr_sz = simd_oprsz(desc); 2740 float32 *d = vd, *a = va; 2741 uint32_t *n = vn, *m = vm; 2742 2743 for (i = 0; i < opr_sz / 4; ++i) { 2744 d[i] = bfdotadd(a[i], n[i], m[i]); 2745 } 2746 clear_tail(d, opr_sz, simd_maxsz(desc)); 2747 } 2748 2749 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2750 void *va, uint32_t desc) 2751 { 2752 intptr_t i, j, opr_sz = simd_oprsz(desc); 2753 intptr_t index = simd_data(desc); 2754 intptr_t elements = opr_sz / 4; 2755 intptr_t eltspersegment = MIN(16 / 4, elements); 2756 float32 *d = vd, *a = va; 2757 uint32_t *n = vn, *m = vm; 2758 2759 for (i = 0; i < elements; i += eltspersegment) { 2760 uint32_t m_idx = m[i + H4(index)]; 2761 2762 for (j = i; j < i + eltspersegment; j++) { 2763 d[j] = bfdotadd(a[j], n[j], m_idx); 2764 } 2765 } 2766 clear_tail(d, opr_sz, simd_maxsz(desc)); 2767 } 2768 2769 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2770 { 2771 intptr_t s, opr_sz = simd_oprsz(desc); 2772 float32 *d = vd, *a = va; 2773 uint32_t *n = vn, *m = vm; 2774 2775 for (s = 0; s < opr_sz / 4; s += 4) { 2776 float32 sum00, sum01, sum10, sum11; 2777 2778 /* 2779 * Process the entire segment at once, writing back the 2780 * results only after we've consumed all of the inputs. 2781 * 2782 * Key to indices by column: 2783 * i j i k j k 2784 */ 2785 sum00 = a[s + H4(0 + 0)]; 2786 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2787 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2788 2789 sum01 = a[s + H4(0 + 1)]; 2790 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2791 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2792 2793 sum10 = a[s + H4(2 + 0)]; 2794 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2795 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2796 2797 sum11 = a[s + H4(2 + 1)]; 2798 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2799 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2800 2801 d[s + H4(0 + 0)] = sum00; 2802 d[s + H4(0 + 1)] = sum01; 2803 d[s + H4(2 + 0)] = sum10; 2804 d[s + H4(2 + 1)] = sum11; 2805 } 2806 clear_tail(d, opr_sz, simd_maxsz(desc)); 2807 } 2808 2809 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2810 void *stat, uint32_t desc) 2811 { 2812 intptr_t i, opr_sz = simd_oprsz(desc); 2813 intptr_t sel = simd_data(desc); 2814 float32 *d = vd, *a = va; 2815 bfloat16 *n = vn, *m = vm; 2816 2817 for (i = 0; i < opr_sz / 4; ++i) { 2818 float32 nn = n[H2(i * 2 + sel)] << 16; 2819 float32 mm = m[H2(i * 2 + sel)] << 16; 2820 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2821 } 2822 clear_tail(d, opr_sz, simd_maxsz(desc)); 2823 } 2824 2825 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2826 void *va, void *stat, uint32_t desc) 2827 { 2828 intptr_t i, j, opr_sz = simd_oprsz(desc); 2829 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2830 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2831 intptr_t elements = opr_sz / 4; 2832 intptr_t eltspersegment = MIN(16 / 4, elements); 2833 float32 *d = vd, *a = va; 2834 bfloat16 *n = vn, *m = vm; 2835 2836 for (i = 0; i < elements; i += 
eltspersegment) { 2837 float32 m_idx = m[H2(2 * i + index)] << 16; 2838 2839 for (j = i; j < i + eltspersegment; j++) { 2840 float32 n_j = n[H2(2 * j + sel)] << 16; 2841 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2842 } 2843 } 2844 clear_tail(d, opr_sz, simd_maxsz(desc)); 2845 } 2846 2847 #define DO_CLAMP(NAME, TYPE) \ 2848 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2849 { \ 2850 intptr_t i, opr_sz = simd_oprsz(desc); \ 2851 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2852 TYPE aa = *(TYPE *)(a + i); \ 2853 TYPE nn = *(TYPE *)(n + i); \ 2854 TYPE mm = *(TYPE *)(m + i); \ 2855 TYPE dd = MIN(MAX(aa, nn), mm); \ 2856 *(TYPE *)(d + i) = dd; \ 2857 } \ 2858 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2859 } 2860 2861 DO_CLAMP(gvec_sclamp_b, int8_t) 2862 DO_CLAMP(gvec_sclamp_h, int16_t) 2863 DO_CLAMP(gvec_sclamp_s, int32_t) 2864 DO_CLAMP(gvec_sclamp_d, int64_t) 2865 2866 DO_CLAMP(gvec_uclamp_b, uint8_t) 2867 DO_CLAMP(gvec_uclamp_h, uint16_t) 2868 DO_CLAMP(gvec_uclamp_s, uint32_t) 2869 DO_CLAMP(gvec_uclamp_d, uint64_t) 2870
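
/*
 * Illustrative sketch (editorial addition, not part of the original
 * helpers): per element, each DO_CLAMP expansion above computes
 * MIN(MAX(aa, nn), mm), i.e. aa clamped to the inclusive range
 * [nn, mm] when nn <= mm.  A scalar reference for the signed 32-bit
 * case would simply be:
 *
 *     static inline int32_t sclamp32_ref(int32_t aa, int32_t nn, int32_t mm)
 *     {
 *         return MIN(MAX(aa, nn), mm);
 *     }
 *
 * The other variants differ only in the element type.
 */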