1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 intptr_t elements = opr_sz / 2; 321 intptr_t eltspersegment = MIN(16 / 2, elements); 322 323 for (i = 0; i < elements; i += 16 / 2) { 324 int16_t mm = m[i]; 325 for (j = 0; j < eltspersegment; ++j) { 326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 327 } 328 } 329 clear_tail(d, opr_sz, simd_maxsz(desc)); 330 } 331 332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 333 void *vq, uint32_t desc) 334 { 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 intptr_t elements = opr_sz / 2; 339 intptr_t eltspersegment = MIN(16 / 2, elements); 340 341 for (i = 0; i < elements; i += 16 / 2) { 342 int16_t mm = m[i]; 343 for (j = 0; j < eltspersegment; ++j) { 344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 345 } 346 } 347 clear_tail(d, opr_sz, simd_maxsz(desc)); 348 } 349 350 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 351 void *va, uint32_t desc) 352 { 353 intptr_t i, opr_sz = simd_oprsz(desc); 354 int16_t *d = vd, *n = vn, *m = vm, *a = va; 355 uint32_t discard; 356 357 for (i = 0; i < opr_sz / 2; ++i) { 358 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 359 } 360 } 361 362 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 363 void *va, uint32_t desc) 364 { 365 intptr_t i, opr_sz = simd_oprsz(desc); 366 int16_t *d = vd, *n = vn, *m = vm, *a = va; 367 uint32_t discard; 368 369 for (i = 0; i < opr_sz / 2; ++i) { 370 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 371 } 372 } 373 374 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 375 { 376 intptr_t i, opr_sz = simd_oprsz(desc); 377 int16_t *d = vd, *n = vn, *m = vm; 378 uint32_t discard; 379 380 for (i = 0; i < opr_sz / 2; ++i) { 381 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 382 } 383 } 384 385 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 386 { 387 intptr_t i, opr_sz = simd_oprsz(desc); 388 int16_t *d = vd, *n = vn, *m = vm; 389 uint32_t discard; 390 391 for (i = 0; i < opr_sz / 2; ++i) { 392 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 393 } 394 } 395 396 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 397 { 398 intptr_t i, j, opr_sz = simd_oprsz(desc); 399 int idx = simd_data(desc); 400 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 401 uint32_t discard; 402 403 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 404 int16_t mm = m[i]; 405 for (j = 0; j < 16 / 2; ++j) { 406 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 407 } 408 } 409 } 410 411 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 412 { 413 intptr_t i, j, opr_sz = simd_oprsz(desc); 414 int idx = simd_data(desc); 415 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 416 uint32_t discard; 417 418 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 419 int16_t mm = m[i]; 420 for (j = 0; j < 16 / 2; ++j) { 421 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 422 } 423 } 424 } 425 426 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 427 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 428 bool neg, bool round, uint32_t *sat) 429 { 430 /* Simplify similarly to do_sqrdmlah_b above. */ 431 int64_t ret = (int64_t)src1 * src2; 432 if (neg) { 433 ret = -ret; 434 } 435 ret += ((int64_t)src3 << 31) + (round << 30); 436 ret >>= 31; 437 438 if (ret != (int32_t)ret) { 439 *sat = 1; 440 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 441 } 442 return ret; 443 } 444 445 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 446 int32_t src2, int32_t src3) 447 { 448 uint32_t *sat = &env->vfp.qc[0]; 449 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 450 } 451 452 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 453 void *vq, uint32_t desc) 454 { 455 uintptr_t opr_sz = simd_oprsz(desc); 456 int32_t *d = vd; 457 int32_t *n = vn; 458 int32_t *m = vm; 459 uintptr_t i; 460 461 for (i = 0; i < opr_sz / 4; ++i) { 462 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 463 } 464 clear_tail(d, opr_sz, simd_maxsz(desc)); 465 } 466 467 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 468 int32_t src2, int32_t src3) 469 { 470 uint32_t *sat = &env->vfp.qc[0]; 471 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 472 } 473 474 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 475 void *vq, uint32_t desc) 476 { 477 uintptr_t opr_sz = simd_oprsz(desc); 478 int32_t *d = vd; 479 int32_t *n = vn; 480 int32_t *m = vm; 481 uintptr_t i; 482 483 for (i = 0; i < opr_sz / 4; ++i) { 484 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 485 } 486 clear_tail(d, opr_sz, simd_maxsz(desc)); 487 } 488 489 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 490 void *vq, uint32_t desc) 491 { 492 intptr_t i, opr_sz = simd_oprsz(desc); 493 int32_t *d = vd, *n = vn, *m = vm; 494 495 for (i = 0; i < opr_sz / 4; ++i) { 496 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 497 } 498 clear_tail(d, opr_sz, simd_maxsz(desc)); 499 } 500 501 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 502 void *vq, uint32_t desc) 503 { 504 intptr_t i, opr_sz = simd_oprsz(desc); 505 int32_t *d = vd, *n = vn, *m = vm; 506 507 for (i = 0; i < opr_sz / 4; ++i) { 508 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 509 } 510 clear_tail(d, opr_sz, simd_maxsz(desc)); 511 } 512 513 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 514 void *vq, uint32_t desc) 515 { 516 intptr_t i, j, opr_sz = simd_oprsz(desc); 517 int idx = simd_data(desc); 518 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 519 intptr_t elements = opr_sz / 4; 520 intptr_t eltspersegment = MIN(16 / 4, elements); 521 522 for (i = 0; i < elements; i += 16 / 4) { 523 int32_t mm = m[i]; 524 for (j = 0; j < eltspersegment; ++j) { 525 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 526 } 527 } 528 clear_tail(d, opr_sz, simd_maxsz(desc)); 529 } 530 531 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 532 void *vq, uint32_t desc) 533 { 534 intptr_t i, j, opr_sz = simd_oprsz(desc); 535 int idx = simd_data(desc); 536 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 537 intptr_t elements = opr_sz / 4; 538 intptr_t eltspersegment = MIN(16 / 4, elements); 539 540 for (i = 0; i < elements; i += 16 / 4) { 541 int32_t mm = m[i]; 542 for (j = 0; j < eltspersegment; ++j) { 543 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 544 } 545 } 546 clear_tail(d, opr_sz, simd_maxsz(desc)); 547 } 548 549 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 550 void *va, uint32_t desc) 551 { 552 intptr_t i, opr_sz = simd_oprsz(desc); 553 int32_t *d = vd, *n = vn, *m = vm, *a = va; 554 uint32_t discard; 555 556 for (i = 0; i < opr_sz / 4; ++i) { 557 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 558 } 559 } 560 561 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 562 void *va, uint32_t desc) 563 { 564 intptr_t i, opr_sz = simd_oprsz(desc); 565 int32_t *d = vd, *n = vn, *m = vm, *a = va; 566 uint32_t discard; 567 568 for (i = 0; i < opr_sz / 4; ++i) { 569 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 570 } 571 } 572 573 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 574 { 575 intptr_t i, opr_sz = simd_oprsz(desc); 576 int32_t *d = vd, *n = vn, *m = vm; 577 uint32_t discard; 578 579 for (i = 0; i < opr_sz / 4; ++i) { 580 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 581 } 582 } 583 584 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 585 { 586 intptr_t i, opr_sz = simd_oprsz(desc); 587 int32_t *d = vd, *n = vn, *m = vm; 588 uint32_t discard; 589 590 for (i = 0; i < opr_sz / 4; ++i) { 591 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 592 } 593 } 594 595 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 596 { 597 intptr_t i, j, opr_sz = simd_oprsz(desc); 598 int idx = simd_data(desc); 599 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 600 uint32_t discard; 601 602 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 603 int32_t mm = m[i]; 604 for (j = 0; j < 16 / 4; ++j) { 605 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 606 } 607 } 608 } 609 610 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 611 { 612 intptr_t i, j, opr_sz = simd_oprsz(desc); 613 int idx = simd_data(desc); 614 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 615 uint32_t discard; 616 617 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 618 int32_t mm = m[i]; 619 for (j = 0; j < 16 / 4; ++j) { 620 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 621 } 622 } 623 } 624 625 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 626 static int64_t do_sat128_d(Int128 r) 627 { 628 int64_t ls = int128_getlo(r); 629 int64_t hs = int128_gethi(r); 630 631 if (unlikely(hs != (ls >> 63))) { 632 return hs < 0 ? INT64_MIN : INT64_MAX; 633 } 634 return ls; 635 } 636 637 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 638 { 639 uint64_t l, h; 640 Int128 r, t; 641 642 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 643 muls64(&l, &h, m, n); 644 r = int128_make128(l, h); 645 if (neg) { 646 r = int128_neg(r); 647 } 648 if (a) { 649 t = int128_exts64(a); 650 t = int128_lshift(t, 63); 651 r = int128_add(r, t); 652 } 653 if (round) { 654 t = int128_exts64(1ll << 62); 655 r = int128_add(r, t); 656 } 657 r = int128_rshift(r, 63); 658 659 return do_sat128_d(r); 660 } 661 662 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 663 void *va, uint32_t desc) 664 { 665 intptr_t i, opr_sz = simd_oprsz(desc); 666 int64_t *d = vd, *n = vn, *m = vm, *a = va; 667 668 for (i = 0; i < opr_sz / 8; ++i) { 669 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 670 } 671 } 672 673 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 674 void *va, uint32_t desc) 675 { 676 intptr_t i, opr_sz = simd_oprsz(desc); 677 int64_t *d = vd, *n = vn, *m = vm, *a = va; 678 679 for (i = 0; i < opr_sz / 8; ++i) { 680 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 681 } 682 } 683 684 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 685 { 686 intptr_t i, opr_sz = simd_oprsz(desc); 687 int64_t *d = vd, *n = vn, *m = vm; 688 689 for (i = 0; i < opr_sz / 8; ++i) { 690 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 691 } 692 } 693 694 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 695 { 696 intptr_t i, opr_sz = simd_oprsz(desc); 697 int64_t *d = vd, *n = vn, *m = vm; 698 699 for (i = 0; i < opr_sz / 8; ++i) { 700 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 701 } 702 } 703 704 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 705 { 706 intptr_t i, j, opr_sz = simd_oprsz(desc); 707 int idx = simd_data(desc); 708 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 709 710 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 711 int64_t mm = m[i]; 712 for (j = 0; j < 16 / 8; ++j) { 713 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 714 } 715 } 716 } 717 718 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 719 { 720 intptr_t i, j, opr_sz = simd_oprsz(desc); 721 int idx = simd_data(desc); 722 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 723 724 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 725 int64_t mm = m[i]; 726 for (j = 0; j < 16 / 8; ++j) { 727 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 728 } 729 } 730 } 731 732 /* Integer 8 and 16-bit dot-product. 733 * 734 * Note that for the loops herein, host endianness does not matter 735 * with respect to the ordering of data within the quad-width lanes. 736 * All elements are treated equally, no matter where they are. 737 */ 738 739 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 740 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 741 { \ 742 intptr_t i, opr_sz = simd_oprsz(desc); \ 743 TYPED *d = vd, *a = va; \ 744 TYPEN *n = vn; \ 745 TYPEM *m = vm; \ 746 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 747 d[i] = (a[i] + \ 748 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 749 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 750 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 751 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 752 } \ 753 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 754 } 755 756 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 757 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 758 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 759 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 760 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 761 762 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 763 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 764 { \ 765 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 766 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 767 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 768 intptr_t index = simd_data(desc); \ 769 TYPED *d = vd, *a = va; \ 770 TYPEN *n = vn; \ 771 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 772 do { \ 773 TYPED m0 = m_indexed[i * 4 + 0]; \ 774 TYPED m1 = m_indexed[i * 4 + 1]; \ 775 TYPED m2 = m_indexed[i * 4 + 2]; \ 776 TYPED m3 = m_indexed[i * 4 + 3]; \ 777 do { \ 778 d[i] = (a[i] + \ 779 n[i * 4 + 0] * m0 + \ 780 n[i * 4 + 1] * m1 + \ 781 n[i * 4 + 2] * m2 + \ 782 n[i * 4 + 3] * m3); \ 783 } while (++i < segend); \ 784 segend = i + 4; \ 785 } while (i < opr_sz_n); \ 786 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 787 } 788 789 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 790 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 791 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 792 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 793 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 794 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 795 796 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 797 void *vfpst, uint32_t desc) 798 { 799 uintptr_t opr_sz = simd_oprsz(desc); 800 float16 *d = vd; 801 float16 *n = vn; 802 float16 *m = vm; 803 float_status *fpst = vfpst; 804 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 805 uint32_t neg_imag = neg_real ^ 1; 806 uintptr_t i; 807 808 /* Shift boolean to the sign bit so we can xor to negate. */ 809 neg_real <<= 15; 810 neg_imag <<= 15; 811 812 for (i = 0; i < opr_sz / 2; i += 2) { 813 float16 e0 = n[H2(i)]; 814 float16 e1 = m[H2(i + 1)] ^ neg_imag; 815 float16 e2 = n[H2(i + 1)]; 816 float16 e3 = m[H2(i)] ^ neg_real; 817 818 d[H2(i)] = float16_add(e0, e1, fpst); 819 d[H2(i + 1)] = float16_add(e2, e3, fpst); 820 } 821 clear_tail(d, opr_sz, simd_maxsz(desc)); 822 } 823 824 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 825 void *vfpst, uint32_t desc) 826 { 827 uintptr_t opr_sz = simd_oprsz(desc); 828 float32 *d = vd; 829 float32 *n = vn; 830 float32 *m = vm; 831 float_status *fpst = vfpst; 832 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 833 uint32_t neg_imag = neg_real ^ 1; 834 uintptr_t i; 835 836 /* Shift boolean to the sign bit so we can xor to negate. */ 837 neg_real <<= 31; 838 neg_imag <<= 31; 839 840 for (i = 0; i < opr_sz / 4; i += 2) { 841 float32 e0 = n[H4(i)]; 842 float32 e1 = m[H4(i + 1)] ^ neg_imag; 843 float32 e2 = n[H4(i + 1)]; 844 float32 e3 = m[H4(i)] ^ neg_real; 845 846 d[H4(i)] = float32_add(e0, e1, fpst); 847 d[H4(i + 1)] = float32_add(e2, e3, fpst); 848 } 849 clear_tail(d, opr_sz, simd_maxsz(desc)); 850 } 851 852 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 853 void *vfpst, uint32_t desc) 854 { 855 uintptr_t opr_sz = simd_oprsz(desc); 856 float64 *d = vd; 857 float64 *n = vn; 858 float64 *m = vm; 859 float_status *fpst = vfpst; 860 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 861 uint64_t neg_imag = neg_real ^ 1; 862 uintptr_t i; 863 864 /* Shift boolean to the sign bit so we can xor to negate. */ 865 neg_real <<= 63; 866 neg_imag <<= 63; 867 868 for (i = 0; i < opr_sz / 8; i += 2) { 869 float64 e0 = n[i]; 870 float64 e1 = m[i + 1] ^ neg_imag; 871 float64 e2 = n[i + 1]; 872 float64 e3 = m[i] ^ neg_real; 873 874 d[i] = float64_add(e0, e1, fpst); 875 d[i + 1] = float64_add(e2, e3, fpst); 876 } 877 clear_tail(d, opr_sz, simd_maxsz(desc)); 878 } 879 880 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 881 void *vfpst, uint32_t desc) 882 { 883 uintptr_t opr_sz = simd_oprsz(desc); 884 float16 *d = vd, *n = vn, *m = vm, *a = va; 885 float_status *fpst = vfpst; 886 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 887 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 888 uint32_t neg_real = flip ^ neg_imag; 889 uintptr_t i; 890 891 /* Shift boolean to the sign bit so we can xor to negate. */ 892 neg_real <<= 15; 893 neg_imag <<= 15; 894 895 for (i = 0; i < opr_sz / 2; i += 2) { 896 float16 e2 = n[H2(i + flip)]; 897 float16 e1 = m[H2(i + flip)] ^ neg_real; 898 float16 e4 = e2; 899 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 900 901 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 902 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 903 } 904 clear_tail(d, opr_sz, simd_maxsz(desc)); 905 } 906 907 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 908 void *vfpst, uint32_t desc) 909 { 910 uintptr_t opr_sz = simd_oprsz(desc); 911 float16 *d = vd, *n = vn, *m = vm, *a = va; 912 float_status *fpst = vfpst; 913 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 914 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 915 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 916 uint32_t neg_real = flip ^ neg_imag; 917 intptr_t elements = opr_sz / sizeof(float16); 918 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 919 intptr_t i, j; 920 921 /* Shift boolean to the sign bit so we can xor to negate. */ 922 neg_real <<= 15; 923 neg_imag <<= 15; 924 925 for (i = 0; i < elements; i += eltspersegment) { 926 float16 mr = m[H2(i + 2 * index + 0)]; 927 float16 mi = m[H2(i + 2 * index + 1)]; 928 float16 e1 = neg_real ^ (flip ? mi : mr); 929 float16 e3 = neg_imag ^ (flip ? mr : mi); 930 931 for (j = i; j < i + eltspersegment; j += 2) { 932 float16 e2 = n[H2(j + flip)]; 933 float16 e4 = e2; 934 935 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 936 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 937 } 938 } 939 clear_tail(d, opr_sz, simd_maxsz(desc)); 940 } 941 942 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 943 void *vfpst, uint32_t desc) 944 { 945 uintptr_t opr_sz = simd_oprsz(desc); 946 float32 *d = vd, *n = vn, *m = vm, *a = va; 947 float_status *fpst = vfpst; 948 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 949 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 950 uint32_t neg_real = flip ^ neg_imag; 951 uintptr_t i; 952 953 /* Shift boolean to the sign bit so we can xor to negate. */ 954 neg_real <<= 31; 955 neg_imag <<= 31; 956 957 for (i = 0; i < opr_sz / 4; i += 2) { 958 float32 e2 = n[H4(i + flip)]; 959 float32 e1 = m[H4(i + flip)] ^ neg_real; 960 float32 e4 = e2; 961 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 962 963 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 964 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 965 } 966 clear_tail(d, opr_sz, simd_maxsz(desc)); 967 } 968 969 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 970 void *vfpst, uint32_t desc) 971 { 972 uintptr_t opr_sz = simd_oprsz(desc); 973 float32 *d = vd, *n = vn, *m = vm, *a = va; 974 float_status *fpst = vfpst; 975 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 976 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 977 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 978 uint32_t neg_real = flip ^ neg_imag; 979 intptr_t elements = opr_sz / sizeof(float32); 980 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 981 intptr_t i, j; 982 983 /* Shift boolean to the sign bit so we can xor to negate. */ 984 neg_real <<= 31; 985 neg_imag <<= 31; 986 987 for (i = 0; i < elements; i += eltspersegment) { 988 float32 mr = m[H4(i + 2 * index + 0)]; 989 float32 mi = m[H4(i + 2 * index + 1)]; 990 float32 e1 = neg_real ^ (flip ? mi : mr); 991 float32 e3 = neg_imag ^ (flip ? mr : mi); 992 993 for (j = i; j < i + eltspersegment; j += 2) { 994 float32 e2 = n[H4(j + flip)]; 995 float32 e4 = e2; 996 997 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 998 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 999 } 1000 } 1001 clear_tail(d, opr_sz, simd_maxsz(desc)); 1002 } 1003 1004 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 1005 void *vfpst, uint32_t desc) 1006 { 1007 uintptr_t opr_sz = simd_oprsz(desc); 1008 float64 *d = vd, *n = vn, *m = vm, *a = va; 1009 float_status *fpst = vfpst; 1010 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1011 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1012 uint64_t neg_real = flip ^ neg_imag; 1013 uintptr_t i; 1014 1015 /* Shift boolean to the sign bit so we can xor to negate. */ 1016 neg_real <<= 63; 1017 neg_imag <<= 63; 1018 1019 for (i = 0; i < opr_sz / 8; i += 2) { 1020 float64 e2 = n[i + flip]; 1021 float64 e1 = m[i + flip] ^ neg_real; 1022 float64 e4 = e2; 1023 float64 e3 = m[i + 1 - flip] ^ neg_imag; 1024 1025 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 1026 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 1027 } 1028 clear_tail(d, opr_sz, simd_maxsz(desc)); 1029 } 1030 1031 /* 1032 * Floating point comparisons producing an integer result (all 1s or all 0s). 1033 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1034 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1035 */ 1036 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1037 { 1038 return -float16_eq_quiet(op1, op2, stat); 1039 } 1040 1041 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1042 { 1043 return -float32_eq_quiet(op1, op2, stat); 1044 } 1045 1046 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1047 { 1048 return -float64_eq_quiet(op1, op2, stat); 1049 } 1050 1051 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1052 { 1053 return -float16_le(op2, op1, stat); 1054 } 1055 1056 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1057 { 1058 return -float32_le(op2, op1, stat); 1059 } 1060 1061 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1062 { 1063 return -float64_le(op2, op1, stat); 1064 } 1065 1066 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1067 { 1068 return -float16_lt(op2, op1, stat); 1069 } 1070 1071 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1072 { 1073 return -float32_lt(op2, op1, stat); 1074 } 1075 1076 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1077 { 1078 return -float64_lt(op2, op1, stat); 1079 } 1080 1081 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1082 { 1083 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1084 } 1085 1086 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1087 { 1088 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1089 } 1090 1091 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1092 { 1093 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1094 } 1095 1096 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1097 { 1098 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1099 } 1100 1101 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1102 { 1103 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1104 } 1105 1106 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1107 { 1108 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1109 } 1110 1111 static int16_t vfp_tosszh(float16 x, void *fpstp) 1112 { 1113 float_status *fpst = fpstp; 1114 if (float16_is_any_nan(x)) { 1115 float_raise(float_flag_invalid, fpst); 1116 return 0; 1117 } 1118 return float16_to_int16_round_to_zero(x, fpst); 1119 } 1120 1121 static uint16_t vfp_touszh(float16 x, void *fpstp) 1122 { 1123 float_status *fpst = fpstp; 1124 if (float16_is_any_nan(x)) { 1125 float_raise(float_flag_invalid, fpst); 1126 return 0; 1127 } 1128 return float16_to_uint16_round_to_zero(x, fpst); 1129 } 1130 1131 #define DO_2OP(NAME, FUNC, TYPE) \ 1132 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 1133 { \ 1134 intptr_t i, oprsz = simd_oprsz(desc); \ 1135 TYPE *d = vd, *n = vn; \ 1136 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1137 d[i] = FUNC(n[i], stat); \ 1138 } \ 1139 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1140 } 1141 1142 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1143 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1144 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1145 1146 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1147 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1148 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1149 1150 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1151 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1152 1153 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1154 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1155 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1156 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1157 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1158 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1159 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1160 DO_2OP(gvec_touszh, vfp_touszh, float16) 1161 1162 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1163 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1164 { \ 1165 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1166 } 1167 1168 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1169 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1170 { \ 1171 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1172 } 1173 1174 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1175 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1176 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1177 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1178 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1179 1180 DO_2OP_CMP0(cgt, cgt, FWD) 1181 DO_2OP_CMP0(cge, cge, FWD) 1182 DO_2OP_CMP0(ceq, ceq, FWD) 1183 DO_2OP_CMP0(clt, cgt, REV) 1184 DO_2OP_CMP0(cle, cge, REV) 1185 1186 #undef DO_2OP 1187 #undef DO_2OP_CMP0 1188 1189 /* Floating-point trigonometric starting value. 1190 * See the ARM ARM pseudocode function FPTrigSMul. 1191 */ 1192 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1193 { 1194 float16 result = float16_mul(op1, op1, stat); 1195 if (!float16_is_any_nan(result)) { 1196 result = float16_set_sign(result, op2 & 1); 1197 } 1198 return result; 1199 } 1200 1201 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1202 { 1203 float32 result = float32_mul(op1, op1, stat); 1204 if (!float32_is_any_nan(result)) { 1205 result = float32_set_sign(result, op2 & 1); 1206 } 1207 return result; 1208 } 1209 1210 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1211 { 1212 float64 result = float64_mul(op1, op1, stat); 1213 if (!float64_is_any_nan(result)) { 1214 result = float64_set_sign(result, op2 & 1); 1215 } 1216 return result; 1217 } 1218 1219 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1220 { 1221 return float16_abs(float16_sub(op1, op2, stat)); 1222 } 1223 1224 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1225 { 1226 return float32_abs(float32_sub(op1, op2, stat)); 1227 } 1228 1229 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1230 { 1231 return float64_abs(float64_sub(op1, op2, stat)); 1232 } 1233 1234 /* 1235 * Reciprocal step. These are the AArch32 version which uses a 1236 * non-fused multiply-and-subtract. 1237 */ 1238 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1239 { 1240 op1 = float16_squash_input_denormal(op1, stat); 1241 op2 = float16_squash_input_denormal(op2, stat); 1242 1243 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1244 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1245 return float16_two; 1246 } 1247 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1248 } 1249 1250 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1251 { 1252 op1 = float32_squash_input_denormal(op1, stat); 1253 op2 = float32_squash_input_denormal(op2, stat); 1254 1255 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1256 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1257 return float32_two; 1258 } 1259 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1260 } 1261 1262 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1263 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1264 { 1265 op1 = float16_squash_input_denormal(op1, stat); 1266 op2 = float16_squash_input_denormal(op2, stat); 1267 1268 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1269 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1270 return float16_one_point_five; 1271 } 1272 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1273 return float16_div(op1, float16_two, stat); 1274 } 1275 1276 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1277 { 1278 op1 = float32_squash_input_denormal(op1, stat); 1279 op2 = float32_squash_input_denormal(op2, stat); 1280 1281 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1282 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1283 return float32_one_point_five; 1284 } 1285 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1286 return float32_div(op1, float32_two, stat); 1287 } 1288 1289 #define DO_3OP(NAME, FUNC, TYPE) \ 1290 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1291 { \ 1292 intptr_t i, oprsz = simd_oprsz(desc); \ 1293 TYPE *d = vd, *n = vn, *m = vm; \ 1294 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1295 d[i] = FUNC(n[i], m[i], stat); \ 1296 } \ 1297 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1298 } 1299 1300 DO_3OP(gvec_fadd_h, float16_add, float16) 1301 DO_3OP(gvec_fadd_s, float32_add, float32) 1302 DO_3OP(gvec_fadd_d, float64_add, float64) 1303 1304 DO_3OP(gvec_fsub_h, float16_sub, float16) 1305 DO_3OP(gvec_fsub_s, float32_sub, float32) 1306 DO_3OP(gvec_fsub_d, float64_sub, float64) 1307 1308 DO_3OP(gvec_fmul_h, float16_mul, float16) 1309 DO_3OP(gvec_fmul_s, float32_mul, float32) 1310 DO_3OP(gvec_fmul_d, float64_mul, float64) 1311 1312 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1313 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1314 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1315 1316 DO_3OP(gvec_fabd_h, float16_abd, float16) 1317 DO_3OP(gvec_fabd_s, float32_abd, float32) 1318 DO_3OP(gvec_fabd_d, float64_abd, float64) 1319 1320 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1321 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1322 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1323 1324 DO_3OP(gvec_fcge_h, float16_cge, float16) 1325 DO_3OP(gvec_fcge_s, float32_cge, float32) 1326 DO_3OP(gvec_fcge_d, float64_cge, float64) 1327 1328 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1329 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1330 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1331 1332 DO_3OP(gvec_facge_h, float16_acge, float16) 1333 DO_3OP(gvec_facge_s, float32_acge, float32) 1334 DO_3OP(gvec_facge_d, float64_acge, float64) 1335 1336 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1337 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1338 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1339 1340 DO_3OP(gvec_fmax_h, float16_max, float16) 1341 DO_3OP(gvec_fmax_s, float32_max, float32) 1342 DO_3OP(gvec_fmax_d, float64_max, float64) 1343 1344 DO_3OP(gvec_fmin_h, float16_min, float16) 1345 DO_3OP(gvec_fmin_s, float32_min, float32) 1346 DO_3OP(gvec_fmin_d, float64_min, float64) 1347 1348 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1349 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1350 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1351 1352 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1353 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1354 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1355 1356 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1357 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1358 1359 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1360 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1361 1362 #ifdef TARGET_AARCH64 1363 DO_3OP(gvec_fdiv_h, float16_div, float16) 1364 DO_3OP(gvec_fdiv_s, float32_div, float32) 1365 DO_3OP(gvec_fdiv_d, float64_div, float64) 1366 1367 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1368 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1369 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1370 1371 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1372 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1373 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1374 1375 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1376 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1377 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1378 1379 #endif 1380 #undef DO_3OP 1381 1382 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1383 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1384 float_status *stat) 1385 { 1386 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1387 } 1388 1389 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1390 float_status *stat) 1391 { 1392 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1393 } 1394 1395 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1396 float_status *stat) 1397 { 1398 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1399 } 1400 1401 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1402 float_status *stat) 1403 { 1404 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1405 } 1406 1407 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1408 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1409 float_status *stat) 1410 { 1411 return float16_muladd(op1, op2, dest, 0, stat); 1412 } 1413 1414 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1415 float_status *stat) 1416 { 1417 return float32_muladd(op1, op2, dest, 0, stat); 1418 } 1419 1420 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1421 float_status *stat) 1422 { 1423 return float64_muladd(op1, op2, dest, 0, stat); 1424 } 1425 1426 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1427 float_status *stat) 1428 { 1429 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1430 } 1431 1432 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1433 float_status *stat) 1434 { 1435 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1436 } 1437 1438 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1439 float_status *stat) 1440 { 1441 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1442 } 1443 1444 #define DO_MULADD(NAME, FUNC, TYPE) \ 1445 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1446 { \ 1447 intptr_t i, oprsz = simd_oprsz(desc); \ 1448 TYPE *d = vd, *n = vn, *m = vm; \ 1449 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1450 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1451 } \ 1452 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1453 } 1454 1455 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1456 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1457 1458 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1459 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1460 1461 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1462 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1463 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1464 1465 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1466 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1467 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1468 1469 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1470 * For AdvSIMD, there is of course only one such vector segment. 1471 */ 1472 1473 #define DO_MUL_IDX(NAME, TYPE, H) \ 1474 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1475 { \ 1476 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1477 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1478 intptr_t idx = simd_data(desc); \ 1479 TYPE *d = vd, *n = vn, *m = vm; \ 1480 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1481 TYPE mm = m[H(i + idx)]; \ 1482 for (j = 0; j < segment; j++) { \ 1483 d[i + j] = n[i + j] * mm; \ 1484 } \ 1485 } \ 1486 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1487 } 1488 1489 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1490 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1491 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1492 1493 #undef DO_MUL_IDX 1494 1495 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1496 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1497 { \ 1498 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1499 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1500 intptr_t idx = simd_data(desc); \ 1501 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1502 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1503 TYPE mm = m[H(i + idx)]; \ 1504 for (j = 0; j < segment; j++) { \ 1505 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1506 } \ 1507 } \ 1508 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1509 } 1510 1511 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1512 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1513 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1514 1515 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1516 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1517 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1518 1519 #undef DO_MLA_IDX 1520 1521 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1522 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1523 { \ 1524 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1525 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1526 intptr_t idx = simd_data(desc); \ 1527 TYPE *d = vd, *n = vn, *m = vm; \ 1528 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1529 TYPE mm = m[H(i + idx)]; \ 1530 for (j = 0; j < segment; j++) { \ 1531 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1532 } \ 1533 } \ 1534 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1535 } 1536 1537 #define nop(N, M, S) (M) 1538 1539 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1540 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1541 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1542 1543 #ifdef TARGET_AARCH64 1544 1545 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1546 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1547 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1548 1549 #endif 1550 1551 #undef nop 1552 1553 /* 1554 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1555 * the fused ops below they assume accumulate both from and into Vd. 1556 */ 1557 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1558 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1559 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1560 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1561 1562 #undef DO_FMUL_IDX 1563 1564 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1565 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1566 void *stat, uint32_t desc) \ 1567 { \ 1568 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1569 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1570 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1571 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1572 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1573 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1574 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1575 TYPE mm = m[H(i + idx)]; \ 1576 for (j = 0; j < segment; j++) { \ 1577 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1578 mm, a[i + j], 0, stat); \ 1579 } \ 1580 } \ 1581 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1582 } 1583 1584 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1585 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1586 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1587 1588 #undef DO_FMLA_IDX 1589 1590 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1591 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1592 { \ 1593 intptr_t i, oprsz = simd_oprsz(desc); \ 1594 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1595 bool q = false; \ 1596 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1597 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1598 if (dd < MIN) { \ 1599 dd = MIN; \ 1600 q = true; \ 1601 } else if (dd > MAX) { \ 1602 dd = MAX; \ 1603 q = true; \ 1604 } \ 1605 d[i] = dd; \ 1606 } \ 1607 if (q) { \ 1608 uint32_t *qc = vq; \ 1609 qc[0] = 1; \ 1610 } \ 1611 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1612 } 1613 1614 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1615 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1616 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1617 1618 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1619 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1620 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1621 1622 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1623 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1624 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1625 1626 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1627 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1628 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1629 1630 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1631 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1632 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1633 1634 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1635 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1636 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1637 1638 #undef DO_SAT 1639 1640 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1641 void *vm, uint32_t desc) 1642 { 1643 intptr_t i, oprsz = simd_oprsz(desc); 1644 uint64_t *d = vd, *n = vn, *m = vm; 1645 bool q = false; 1646 1647 for (i = 0; i < oprsz / 8; i++) { 1648 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1649 if (dd < nn) { 1650 dd = UINT64_MAX; 1651 q = true; 1652 } 1653 d[i] = dd; 1654 } 1655 if (q) { 1656 uint32_t *qc = vq; 1657 qc[0] = 1; 1658 } 1659 clear_tail(d, oprsz, simd_maxsz(desc)); 1660 } 1661 1662 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1663 void *vm, uint32_t desc) 1664 { 1665 intptr_t i, oprsz = simd_oprsz(desc); 1666 uint64_t *d = vd, *n = vn, *m = vm; 1667 bool q = false; 1668 1669 for (i = 0; i < oprsz / 8; i++) { 1670 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1671 if (nn < mm) { 1672 dd = 0; 1673 q = true; 1674 } 1675 d[i] = dd; 1676 } 1677 if (q) { 1678 uint32_t *qc = vq; 1679 qc[0] = 1; 1680 } 1681 clear_tail(d, oprsz, simd_maxsz(desc)); 1682 } 1683 1684 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1685 void *vm, uint32_t desc) 1686 { 1687 intptr_t i, oprsz = simd_oprsz(desc); 1688 int64_t *d = vd, *n = vn, *m = vm; 1689 bool q = false; 1690 1691 for (i = 0; i < oprsz / 8; i++) { 1692 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1693 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1694 dd = (nn >> 63) ^ ~INT64_MIN; 1695 q = true; 1696 } 1697 d[i] = dd; 1698 } 1699 if (q) { 1700 uint32_t *qc = vq; 1701 qc[0] = 1; 1702 } 1703 clear_tail(d, oprsz, simd_maxsz(desc)); 1704 } 1705 1706 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1707 void *vm, uint32_t desc) 1708 { 1709 intptr_t i, oprsz = simd_oprsz(desc); 1710 int64_t *d = vd, *n = vn, *m = vm; 1711 bool q = false; 1712 1713 for (i = 0; i < oprsz / 8; i++) { 1714 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1715 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1716 dd = (nn >> 63) ^ ~INT64_MIN; 1717 q = true; 1718 } 1719 d[i] = dd; 1720 } 1721 if (q) { 1722 uint32_t *qc = vq; 1723 qc[0] = 1; 1724 } 1725 clear_tail(d, oprsz, simd_maxsz(desc)); 1726 } 1727 1728 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1729 void *vm, uint32_t desc) 1730 { 1731 intptr_t i, oprsz = simd_oprsz(desc); 1732 uint64_t *d = vd, *n = vn, *m = vm; 1733 bool q = false; 1734 1735 for (i = 0; i < oprsz / 8; i++) { 1736 uint64_t nn = n[i]; 1737 int64_t mm = m[i]; 1738 uint64_t dd = nn + mm; 1739 1740 if (mm < 0) { 1741 if (nn < (uint64_t)-mm) { 1742 dd = 0; 1743 q = true; 1744 } 1745 } else { 1746 if (dd < nn) { 1747 dd = UINT64_MAX; 1748 q = true; 1749 } 1750 } 1751 d[i] = dd; 1752 } 1753 if (q) { 1754 uint32_t *qc = vq; 1755 qc[0] = 1; 1756 } 1757 clear_tail(d, oprsz, simd_maxsz(desc)); 1758 } 1759 1760 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1761 void *vm, uint32_t desc) 1762 { 1763 intptr_t i, oprsz = simd_oprsz(desc); 1764 uint64_t *d = vd, *n = vn, *m = vm; 1765 bool q = false; 1766 1767 for (i = 0; i < oprsz / 8; i++) { 1768 int64_t nn = n[i]; 1769 uint64_t mm = m[i]; 1770 int64_t dd = nn + mm; 1771 1772 if (mm > (uint64_t)(INT64_MAX - nn)) { 1773 dd = INT64_MAX; 1774 q = true; 1775 } 1776 d[i] = dd; 1777 } 1778 if (q) { 1779 uint32_t *qc = vq; 1780 qc[0] = 1; 1781 } 1782 clear_tail(d, oprsz, simd_maxsz(desc)); 1783 } 1784 1785 #define DO_SRA(NAME, TYPE) \ 1786 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1787 { \ 1788 intptr_t i, oprsz = simd_oprsz(desc); \ 1789 int shift = simd_data(desc); \ 1790 TYPE *d = vd, *n = vn; \ 1791 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1792 d[i] += n[i] >> shift; \ 1793 } \ 1794 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1795 } 1796 1797 DO_SRA(gvec_ssra_b, int8_t) 1798 DO_SRA(gvec_ssra_h, int16_t) 1799 DO_SRA(gvec_ssra_s, int32_t) 1800 DO_SRA(gvec_ssra_d, int64_t) 1801 1802 DO_SRA(gvec_usra_b, uint8_t) 1803 DO_SRA(gvec_usra_h, uint16_t) 1804 DO_SRA(gvec_usra_s, uint32_t) 1805 DO_SRA(gvec_usra_d, uint64_t) 1806 1807 #undef DO_SRA 1808 1809 #define DO_RSHR(NAME, TYPE) \ 1810 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1811 { \ 1812 intptr_t i, oprsz = simd_oprsz(desc); \ 1813 int shift = simd_data(desc); \ 1814 TYPE *d = vd, *n = vn; \ 1815 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1816 TYPE tmp = n[i] >> (shift - 1); \ 1817 d[i] = (tmp >> 1) + (tmp & 1); \ 1818 } \ 1819 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1820 } 1821 1822 DO_RSHR(gvec_srshr_b, int8_t) 1823 DO_RSHR(gvec_srshr_h, int16_t) 1824 DO_RSHR(gvec_srshr_s, int32_t) 1825 DO_RSHR(gvec_srshr_d, int64_t) 1826 1827 DO_RSHR(gvec_urshr_b, uint8_t) 1828 DO_RSHR(gvec_urshr_h, uint16_t) 1829 DO_RSHR(gvec_urshr_s, uint32_t) 1830 DO_RSHR(gvec_urshr_d, uint64_t) 1831 1832 #undef DO_RSHR 1833 1834 #define DO_RSRA(NAME, TYPE) \ 1835 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1836 { \ 1837 intptr_t i, oprsz = simd_oprsz(desc); \ 1838 int shift = simd_data(desc); \ 1839 TYPE *d = vd, *n = vn; \ 1840 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1841 TYPE tmp = n[i] >> (shift - 1); \ 1842 d[i] += (tmp >> 1) + (tmp & 1); \ 1843 } \ 1844 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1845 } 1846 1847 DO_RSRA(gvec_srsra_b, int8_t) 1848 DO_RSRA(gvec_srsra_h, int16_t) 1849 DO_RSRA(gvec_srsra_s, int32_t) 1850 DO_RSRA(gvec_srsra_d, int64_t) 1851 1852 DO_RSRA(gvec_ursra_b, uint8_t) 1853 DO_RSRA(gvec_ursra_h, uint16_t) 1854 DO_RSRA(gvec_ursra_s, uint32_t) 1855 DO_RSRA(gvec_ursra_d, uint64_t) 1856 1857 #undef DO_RSRA 1858 1859 #define DO_SRI(NAME, TYPE) \ 1860 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1861 { \ 1862 intptr_t i, oprsz = simd_oprsz(desc); \ 1863 int shift = simd_data(desc); \ 1864 TYPE *d = vd, *n = vn; \ 1865 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1866 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1867 } \ 1868 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1869 } 1870 1871 DO_SRI(gvec_sri_b, uint8_t) 1872 DO_SRI(gvec_sri_h, uint16_t) 1873 DO_SRI(gvec_sri_s, uint32_t) 1874 DO_SRI(gvec_sri_d, uint64_t) 1875 1876 #undef DO_SRI 1877 1878 #define DO_SLI(NAME, TYPE) \ 1879 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1880 { \ 1881 intptr_t i, oprsz = simd_oprsz(desc); \ 1882 int shift = simd_data(desc); \ 1883 TYPE *d = vd, *n = vn; \ 1884 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1885 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1886 } \ 1887 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1888 } 1889 1890 DO_SLI(gvec_sli_b, uint8_t) 1891 DO_SLI(gvec_sli_h, uint16_t) 1892 DO_SLI(gvec_sli_s, uint32_t) 1893 DO_SLI(gvec_sli_d, uint64_t) 1894 1895 #undef DO_SLI 1896 1897 /* 1898 * Convert float16 to float32, raising no exceptions and 1899 * preserving exceptional values, including SNaN. 1900 * This is effectively an unpack+repack operation. 1901 */ 1902 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1903 { 1904 const int f16_bias = 15; 1905 const int f32_bias = 127; 1906 uint32_t sign = extract32(f16, 15, 1); 1907 uint32_t exp = extract32(f16, 10, 5); 1908 uint32_t frac = extract32(f16, 0, 10); 1909 1910 if (exp == 0x1f) { 1911 /* Inf or NaN */ 1912 exp = 0xff; 1913 } else if (exp == 0) { 1914 /* Zero or denormal. */ 1915 if (frac != 0) { 1916 if (fz16) { 1917 frac = 0; 1918 } else { 1919 /* 1920 * Denormal; these are all normal float32. 1921 * Shift the fraction so that the msb is at bit 11, 1922 * then remove bit 11 as the implicit bit of the 1923 * normalized float32. Note that we still go through 1924 * the shift for normal numbers below, to put the 1925 * float32 fraction at the right place. 1926 */ 1927 int shift = clz32(frac) - 21; 1928 frac = (frac << shift) & 0x3ff; 1929 exp = f32_bias - f16_bias - shift + 1; 1930 } 1931 } 1932 } else { 1933 /* Normal number; adjust the bias. */ 1934 exp += f32_bias - f16_bias; 1935 } 1936 sign <<= 31; 1937 exp <<= 23; 1938 frac <<= 23 - 10; 1939 1940 return sign | exp | frac; 1941 } 1942 1943 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 1944 { 1945 /* 1946 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 1947 * Load the 2nd qword iff is_q & is_2. 1948 * Shift to the 2nd dword iff !is_q & is_2. 1949 * For !is_q & !is_2, the upper bits of the result are garbage. 1950 */ 1951 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 1952 } 1953 1954 /* 1955 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 1956 * as there is not yet SVE versions that might use blocking. 1957 */ 1958 1959 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 1960 uint32_t desc, bool fz16) 1961 { 1962 intptr_t i, oprsz = simd_oprsz(desc); 1963 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 1964 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1965 int is_q = oprsz == 16; 1966 uint64_t n_4, m_4; 1967 1968 /* Pre-load all of the f16 data, avoiding overlap issues. */ 1969 n_4 = load4_f16(vn, is_q, is_2); 1970 m_4 = load4_f16(vm, is_q, is_2); 1971 1972 /* Negate all inputs for FMLSL at once. */ 1973 if (is_s) { 1974 n_4 ^= 0x8000800080008000ull; 1975 } 1976 1977 for (i = 0; i < oprsz / 4; i++) { 1978 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 1979 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 1980 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 1981 } 1982 clear_tail(d, oprsz, simd_maxsz(desc)); 1983 } 1984 1985 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 1986 void *venv, uint32_t desc) 1987 { 1988 CPUARMState *env = venv; 1989 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 1990 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1991 } 1992 1993 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 1994 void *venv, uint32_t desc) 1995 { 1996 CPUARMState *env = venv; 1997 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 1998 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 1999 } 2000 2001 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2002 void *venv, uint32_t desc) 2003 { 2004 intptr_t i, oprsz = simd_oprsz(desc); 2005 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2006 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2007 CPUARMState *env = venv; 2008 float_status *status = &env->vfp.fp_status; 2009 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2010 2011 for (i = 0; i < oprsz; i += sizeof(float32)) { 2012 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2013 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2014 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2015 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2016 float32 aa = *(float32 *)(va + H1_4(i)); 2017 2018 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2019 } 2020 } 2021 2022 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2023 uint32_t desc, bool fz16) 2024 { 2025 intptr_t i, oprsz = simd_oprsz(desc); 2026 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2027 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2028 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2029 int is_q = oprsz == 16; 2030 uint64_t n_4; 2031 float32 m_1; 2032 2033 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2034 n_4 = load4_f16(vn, is_q, is_2); 2035 2036 /* Negate all inputs for FMLSL at once. */ 2037 if (is_s) { 2038 n_4 ^= 0x8000800080008000ull; 2039 } 2040 2041 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2042 2043 for (i = 0; i < oprsz / 4; i++) { 2044 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2045 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2046 } 2047 clear_tail(d, oprsz, simd_maxsz(desc)); 2048 } 2049 2050 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2051 void *venv, uint32_t desc) 2052 { 2053 CPUARMState *env = venv; 2054 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2055 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2056 } 2057 2058 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2059 void *venv, uint32_t desc) 2060 { 2061 CPUARMState *env = venv; 2062 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 2063 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2064 } 2065 2066 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2067 void *venv, uint32_t desc) 2068 { 2069 intptr_t i, j, oprsz = simd_oprsz(desc); 2070 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2071 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2072 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2073 CPUARMState *env = venv; 2074 float_status *status = &env->vfp.fp_status; 2075 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2076 2077 for (i = 0; i < oprsz; i += 16) { 2078 float16 mm_16 = *(float16 *)(vm + i + idx); 2079 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2080 2081 for (j = 0; j < 16; j += sizeof(float32)) { 2082 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2083 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2084 float32 aa = *(float32 *)(va + H1_4(i + j)); 2085 2086 *(float32 *)(vd + H1_4(i + j)) = 2087 float32_muladd(nn, mm, aa, 0, status); 2088 } 2089 } 2090 } 2091 2092 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2093 { 2094 intptr_t i, opr_sz = simd_oprsz(desc); 2095 int8_t *d = vd, *n = vn, *m = vm; 2096 2097 for (i = 0; i < opr_sz; ++i) { 2098 int8_t mm = m[i]; 2099 int8_t nn = n[i]; 2100 int8_t res = 0; 2101 if (mm >= 0) { 2102 if (mm < 8) { 2103 res = nn << mm; 2104 } 2105 } else { 2106 res = nn >> (mm > -8 ? -mm : 7); 2107 } 2108 d[i] = res; 2109 } 2110 clear_tail(d, opr_sz, simd_maxsz(desc)); 2111 } 2112 2113 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2114 { 2115 intptr_t i, opr_sz = simd_oprsz(desc); 2116 int16_t *d = vd, *n = vn, *m = vm; 2117 2118 for (i = 0; i < opr_sz / 2; ++i) { 2119 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2120 int16_t nn = n[i]; 2121 int16_t res = 0; 2122 if (mm >= 0) { 2123 if (mm < 16) { 2124 res = nn << mm; 2125 } 2126 } else { 2127 res = nn >> (mm > -16 ? -mm : 15); 2128 } 2129 d[i] = res; 2130 } 2131 clear_tail(d, opr_sz, simd_maxsz(desc)); 2132 } 2133 2134 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2135 { 2136 intptr_t i, opr_sz = simd_oprsz(desc); 2137 uint8_t *d = vd, *n = vn, *m = vm; 2138 2139 for (i = 0; i < opr_sz; ++i) { 2140 int8_t mm = m[i]; 2141 uint8_t nn = n[i]; 2142 uint8_t res = 0; 2143 if (mm >= 0) { 2144 if (mm < 8) { 2145 res = nn << mm; 2146 } 2147 } else { 2148 if (mm > -8) { 2149 res = nn >> -mm; 2150 } 2151 } 2152 d[i] = res; 2153 } 2154 clear_tail(d, opr_sz, simd_maxsz(desc)); 2155 } 2156 2157 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2158 { 2159 intptr_t i, opr_sz = simd_oprsz(desc); 2160 uint16_t *d = vd, *n = vn, *m = vm; 2161 2162 for (i = 0; i < opr_sz / 2; ++i) { 2163 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2164 uint16_t nn = n[i]; 2165 uint16_t res = 0; 2166 if (mm >= 0) { 2167 if (mm < 16) { 2168 res = nn << mm; 2169 } 2170 } else { 2171 if (mm > -16) { 2172 res = nn >> -mm; 2173 } 2174 } 2175 d[i] = res; 2176 } 2177 clear_tail(d, opr_sz, simd_maxsz(desc)); 2178 } 2179 2180 /* 2181 * 8x8->8 polynomial multiply. 2182 * 2183 * Polynomial multiplication is like integer multiplication except the 2184 * partial products are XORed, not added. 2185 * 2186 * TODO: expose this as a generic vector operation, as it is a common 2187 * crypto building block. 2188 */ 2189 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2190 { 2191 intptr_t i, opr_sz = simd_oprsz(desc); 2192 uint64_t *d = vd, *n = vn, *m = vm; 2193 2194 for (i = 0; i < opr_sz / 8; ++i) { 2195 d[i] = clmul_8x8_low(n[i], m[i]); 2196 } 2197 clear_tail(d, opr_sz, simd_maxsz(desc)); 2198 } 2199 2200 /* 2201 * 64x64->128 polynomial multiply. 2202 * Because of the lanes are not accessed in strict columns, 2203 * this probably cannot be turned into a generic helper. 2204 */ 2205 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2206 { 2207 intptr_t i, opr_sz = simd_oprsz(desc); 2208 intptr_t hi = simd_data(desc); 2209 uint64_t *d = vd, *n = vn, *m = vm; 2210 2211 for (i = 0; i < opr_sz / 8; i += 2) { 2212 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2213 d[i] = int128_getlo(r); 2214 d[i + 1] = int128_gethi(r); 2215 } 2216 clear_tail(d, opr_sz, simd_maxsz(desc)); 2217 } 2218 2219 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2220 { 2221 int hi = simd_data(desc); 2222 uint64_t *d = vd, *n = vn, *m = vm; 2223 uint64_t nn = n[hi], mm = m[hi]; 2224 2225 d[0] = clmul_8x4_packed(nn, mm); 2226 nn >>= 32; 2227 mm >>= 32; 2228 d[1] = clmul_8x4_packed(nn, mm); 2229 2230 clear_tail(d, 16, simd_maxsz(desc)); 2231 } 2232 2233 #ifdef TARGET_AARCH64 2234 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2235 { 2236 int shift = simd_data(desc) * 8; 2237 intptr_t i, opr_sz = simd_oprsz(desc); 2238 uint64_t *d = vd, *n = vn, *m = vm; 2239 2240 for (i = 0; i < opr_sz / 8; ++i) { 2241 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2242 } 2243 } 2244 2245 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2246 { 2247 intptr_t sel = H4(simd_data(desc)); 2248 intptr_t i, opr_sz = simd_oprsz(desc); 2249 uint32_t *n = vn, *m = vm; 2250 uint64_t *d = vd; 2251 2252 for (i = 0; i < opr_sz / 8; ++i) { 2253 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2254 } 2255 } 2256 #endif 2257 2258 #define DO_CMP0(NAME, TYPE, OP) \ 2259 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2260 { \ 2261 intptr_t i, opr_sz = simd_oprsz(desc); \ 2262 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2263 TYPE nn = *(TYPE *)(vn + i); \ 2264 *(TYPE *)(vd + i) = -(nn OP 0); \ 2265 } \ 2266 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2267 } 2268 2269 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2270 DO_CMP0(gvec_clt0_b, int8_t, <) 2271 DO_CMP0(gvec_cle0_b, int8_t, <=) 2272 DO_CMP0(gvec_cgt0_b, int8_t, >) 2273 DO_CMP0(gvec_cge0_b, int8_t, >=) 2274 2275 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2276 DO_CMP0(gvec_clt0_h, int16_t, <) 2277 DO_CMP0(gvec_cle0_h, int16_t, <=) 2278 DO_CMP0(gvec_cgt0_h, int16_t, >) 2279 DO_CMP0(gvec_cge0_h, int16_t, >=) 2280 2281 #undef DO_CMP0 2282 2283 #define DO_ABD(NAME, TYPE) \ 2284 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2285 { \ 2286 intptr_t i, opr_sz = simd_oprsz(desc); \ 2287 TYPE *d = vd, *n = vn, *m = vm; \ 2288 \ 2289 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2290 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2291 } \ 2292 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2293 } 2294 2295 DO_ABD(gvec_sabd_b, int8_t) 2296 DO_ABD(gvec_sabd_h, int16_t) 2297 DO_ABD(gvec_sabd_s, int32_t) 2298 DO_ABD(gvec_sabd_d, int64_t) 2299 2300 DO_ABD(gvec_uabd_b, uint8_t) 2301 DO_ABD(gvec_uabd_h, uint16_t) 2302 DO_ABD(gvec_uabd_s, uint32_t) 2303 DO_ABD(gvec_uabd_d, uint64_t) 2304 2305 #undef DO_ABD 2306 2307 #define DO_ABA(NAME, TYPE) \ 2308 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2309 { \ 2310 intptr_t i, opr_sz = simd_oprsz(desc); \ 2311 TYPE *d = vd, *n = vn, *m = vm; \ 2312 \ 2313 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2314 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2315 } \ 2316 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2317 } 2318 2319 DO_ABA(gvec_saba_b, int8_t) 2320 DO_ABA(gvec_saba_h, int16_t) 2321 DO_ABA(gvec_saba_s, int32_t) 2322 DO_ABA(gvec_saba_d, int64_t) 2323 2324 DO_ABA(gvec_uaba_b, uint8_t) 2325 DO_ABA(gvec_uaba_h, uint16_t) 2326 DO_ABA(gvec_uaba_s, uint32_t) 2327 DO_ABA(gvec_uaba_d, uint64_t) 2328 2329 #undef DO_ABA 2330 2331 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2332 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2333 { \ 2334 ARMVectorReg scratch; \ 2335 intptr_t oprsz = simd_oprsz(desc); \ 2336 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2337 TYPE *d = vd, *n = vn, *m = vm; \ 2338 if (unlikely(d == m)) { \ 2339 m = memcpy(&scratch, m, oprsz); \ 2340 } \ 2341 for (intptr_t i = 0; i < half; ++i) { \ 2342 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2343 } \ 2344 for (intptr_t i = 0; i < half; ++i) { \ 2345 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2346 } \ 2347 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2348 } 2349 2350 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2351 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2352 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2353 2354 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2355 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2356 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2357 2358 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2359 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2360 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2361 2362 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2363 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2364 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2365 2366 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2367 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2368 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2369 2370 #undef DO_3OP_PAIR 2371 2372 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2373 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2374 { \ 2375 ARMVectorReg scratch; \ 2376 intptr_t oprsz = simd_oprsz(desc); \ 2377 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2378 TYPE *d = vd, *n = vn, *m = vm; \ 2379 if (unlikely(d == m)) { \ 2380 m = memcpy(&scratch, m, oprsz); \ 2381 } \ 2382 for (intptr_t i = 0; i < half; ++i) { \ 2383 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2384 } \ 2385 for (intptr_t i = 0; i < half; ++i) { \ 2386 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2387 } \ 2388 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2389 } 2390 2391 #define ADD(A, B) (A + B) 2392 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2393 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2394 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2395 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2396 #undef ADD 2397 2398 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2399 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2400 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2401 2402 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2403 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2404 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2405 2406 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2407 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2408 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2409 2410 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2411 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2412 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2413 2414 #undef DO_3OP_PAIR 2415 2416 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2417 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2418 { \ 2419 intptr_t i, oprsz = simd_oprsz(desc); \ 2420 int shift = simd_data(desc); \ 2421 TYPE *d = vd, *n = vn; \ 2422 float_status *fpst = stat; \ 2423 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2424 d[i] = FUNC(n[i], shift, fpst); \ 2425 } \ 2426 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2427 } 2428 2429 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2430 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2431 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2432 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2433 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2434 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2435 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2436 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2437 2438 #undef DO_VCVT_FIXED 2439 2440 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2441 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2442 { \ 2443 float_status *fpst = stat; \ 2444 intptr_t i, oprsz = simd_oprsz(desc); \ 2445 uint32_t rmode = simd_data(desc); \ 2446 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2447 TYPE *d = vd, *n = vn; \ 2448 set_float_rounding_mode(rmode, fpst); \ 2449 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2450 d[i] = FUNC(n[i], 0, fpst); \ 2451 } \ 2452 set_float_rounding_mode(prev_rmode, fpst); \ 2453 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2454 } 2455 2456 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2457 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2458 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2459 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2460 2461 #undef DO_VCVT_RMODE 2462 2463 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2464 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2465 { \ 2466 float_status *fpst = stat; \ 2467 intptr_t i, oprsz = simd_oprsz(desc); \ 2468 uint32_t rmode = simd_data(desc); \ 2469 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2470 TYPE *d = vd, *n = vn; \ 2471 set_float_rounding_mode(rmode, fpst); \ 2472 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2473 d[i] = FUNC(n[i], fpst); \ 2474 } \ 2475 set_float_rounding_mode(prev_rmode, fpst); \ 2476 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2477 } 2478 2479 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2480 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2481 2482 #undef DO_VRINT_RMODE 2483 2484 #ifdef TARGET_AARCH64 2485 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2486 { 2487 const uint8_t *indices = vm; 2488 CPUARMState *env = venv; 2489 size_t oprsz = simd_oprsz(desc); 2490 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2491 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2492 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2493 union { 2494 uint8_t b[16]; 2495 uint64_t d[2]; 2496 } result; 2497 2498 /* 2499 * We must construct the final result in a temp, lest the output 2500 * overlaps the input table. For TBL, begin with zero; for TBX, 2501 * begin with the original register contents. Note that we always 2502 * copy 16 bytes here to avoid an extra branch; clearing the high 2503 * bits of the register for oprsz == 8 is handled below. 2504 */ 2505 if (is_tbx) { 2506 memcpy(&result, vd, 16); 2507 } else { 2508 memset(&result, 0, 16); 2509 } 2510 2511 for (size_t i = 0; i < oprsz; ++i) { 2512 uint32_t index = indices[H1(i)]; 2513 2514 if (index < table_len) { 2515 /* 2516 * Convert index (a byte offset into the virtual table 2517 * which is a series of 128-bit vectors concatenated) 2518 * into the correct register element, bearing in mind 2519 * that the table can wrap around from V31 to V0. 2520 */ 2521 const uint8_t *table = (const uint8_t *) 2522 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2523 result.b[H1(i)] = table[H1(index % 16)]; 2524 } 2525 } 2526 2527 memcpy(vd, &result, 16); 2528 clear_tail(vd, oprsz, simd_maxsz(desc)); 2529 } 2530 #endif 2531 2532 /* 2533 * NxN -> N highpart multiply 2534 * 2535 * TODO: expose this as a generic vector operation. 2536 */ 2537 2538 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2539 { 2540 intptr_t i, opr_sz = simd_oprsz(desc); 2541 int8_t *d = vd, *n = vn, *m = vm; 2542 2543 for (i = 0; i < opr_sz; ++i) { 2544 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2545 } 2546 clear_tail(d, opr_sz, simd_maxsz(desc)); 2547 } 2548 2549 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2550 { 2551 intptr_t i, opr_sz = simd_oprsz(desc); 2552 int16_t *d = vd, *n = vn, *m = vm; 2553 2554 for (i = 0; i < opr_sz / 2; ++i) { 2555 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2556 } 2557 clear_tail(d, opr_sz, simd_maxsz(desc)); 2558 } 2559 2560 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2561 { 2562 intptr_t i, opr_sz = simd_oprsz(desc); 2563 int32_t *d = vd, *n = vn, *m = vm; 2564 2565 for (i = 0; i < opr_sz / 4; ++i) { 2566 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2567 } 2568 clear_tail(d, opr_sz, simd_maxsz(desc)); 2569 } 2570 2571 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2572 { 2573 intptr_t i, opr_sz = simd_oprsz(desc); 2574 uint64_t *d = vd, *n = vn, *m = vm; 2575 uint64_t discard; 2576 2577 for (i = 0; i < opr_sz / 8; ++i) { 2578 muls64(&discard, &d[i], n[i], m[i]); 2579 } 2580 clear_tail(d, opr_sz, simd_maxsz(desc)); 2581 } 2582 2583 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2584 { 2585 intptr_t i, opr_sz = simd_oprsz(desc); 2586 uint8_t *d = vd, *n = vn, *m = vm; 2587 2588 for (i = 0; i < opr_sz; ++i) { 2589 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2590 } 2591 clear_tail(d, opr_sz, simd_maxsz(desc)); 2592 } 2593 2594 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2595 { 2596 intptr_t i, opr_sz = simd_oprsz(desc); 2597 uint16_t *d = vd, *n = vn, *m = vm; 2598 2599 for (i = 0; i < opr_sz / 2; ++i) { 2600 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2601 } 2602 clear_tail(d, opr_sz, simd_maxsz(desc)); 2603 } 2604 2605 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2606 { 2607 intptr_t i, opr_sz = simd_oprsz(desc); 2608 uint32_t *d = vd, *n = vn, *m = vm; 2609 2610 for (i = 0; i < opr_sz / 4; ++i) { 2611 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2612 } 2613 clear_tail(d, opr_sz, simd_maxsz(desc)); 2614 } 2615 2616 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2617 { 2618 intptr_t i, opr_sz = simd_oprsz(desc); 2619 uint64_t *d = vd, *n = vn, *m = vm; 2620 uint64_t discard; 2621 2622 for (i = 0; i < opr_sz / 8; ++i) { 2623 mulu64(&discard, &d[i], n[i], m[i]); 2624 } 2625 clear_tail(d, opr_sz, simd_maxsz(desc)); 2626 } 2627 2628 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2629 { 2630 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2631 int shr = simd_data(desc); 2632 uint64_t *d = vd, *n = vn, *m = vm; 2633 2634 for (i = 0; i < opr_sz; ++i) { 2635 d[i] = ror64(n[i] ^ m[i], shr); 2636 } 2637 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2638 } 2639 2640 /* 2641 * Integer matrix-multiply accumulate 2642 */ 2643 2644 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2645 { 2646 int8_t *n = vn, *m = vm; 2647 2648 for (intptr_t k = 0; k < 8; ++k) { 2649 sum += n[H1(k)] * m[H1(k)]; 2650 } 2651 return sum; 2652 } 2653 2654 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2655 { 2656 uint8_t *n = vn, *m = vm; 2657 2658 for (intptr_t k = 0; k < 8; ++k) { 2659 sum += n[H1(k)] * m[H1(k)]; 2660 } 2661 return sum; 2662 } 2663 2664 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2665 { 2666 uint8_t *n = vn; 2667 int8_t *m = vm; 2668 2669 for (intptr_t k = 0; k < 8; ++k) { 2670 sum += n[H1(k)] * m[H1(k)]; 2671 } 2672 return sum; 2673 } 2674 2675 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2676 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2677 { 2678 intptr_t seg, opr_sz = simd_oprsz(desc); 2679 2680 for (seg = 0; seg < opr_sz; seg += 16) { 2681 uint32_t *d = vd + seg; 2682 uint32_t *a = va + seg; 2683 uint32_t sum0, sum1, sum2, sum3; 2684 2685 /* 2686 * Process the entire segment at once, writing back the 2687 * results only after we've consumed all of the inputs. 2688 * 2689 * Key to indices by column: 2690 * i j i j 2691 */ 2692 sum0 = a[H4(0 + 0)]; 2693 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2694 sum1 = a[H4(0 + 1)]; 2695 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2696 sum2 = a[H4(2 + 0)]; 2697 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2698 sum3 = a[H4(2 + 1)]; 2699 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2700 2701 d[H4(0)] = sum0; 2702 d[H4(1)] = sum1; 2703 d[H4(2)] = sum2; 2704 d[H4(3)] = sum3; 2705 } 2706 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2707 } 2708 2709 #define DO_MMLA_B(NAME, INNER) \ 2710 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2711 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2712 2713 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2714 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2715 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2716 2717 /* 2718 * BFloat16 Dot Product 2719 */ 2720 2721 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2) 2722 { 2723 /* FPCR is ignored for BFDOT and BFMMLA. */ 2724 float_status bf_status = { 2725 .tininess_before_rounding = float_tininess_before_rounding, 2726 .float_rounding_mode = float_round_to_odd_inf, 2727 .flush_to_zero = true, 2728 .flush_inputs_to_zero = true, 2729 .default_nan_mode = true, 2730 }; 2731 float32 t1, t2; 2732 2733 /* 2734 * Extract each BFloat16 from the element pair, and shift 2735 * them such that they become float32. 2736 */ 2737 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status); 2738 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status); 2739 t1 = float32_add(t1, t2, &bf_status); 2740 t1 = float32_add(sum, t1, &bf_status); 2741 2742 return t1; 2743 } 2744 2745 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2746 { 2747 intptr_t i, opr_sz = simd_oprsz(desc); 2748 float32 *d = vd, *a = va; 2749 uint32_t *n = vn, *m = vm; 2750 2751 for (i = 0; i < opr_sz / 4; ++i) { 2752 d[i] = bfdotadd(a[i], n[i], m[i]); 2753 } 2754 clear_tail(d, opr_sz, simd_maxsz(desc)); 2755 } 2756 2757 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2758 void *va, uint32_t desc) 2759 { 2760 intptr_t i, j, opr_sz = simd_oprsz(desc); 2761 intptr_t index = simd_data(desc); 2762 intptr_t elements = opr_sz / 4; 2763 intptr_t eltspersegment = MIN(16 / 4, elements); 2764 float32 *d = vd, *a = va; 2765 uint32_t *n = vn, *m = vm; 2766 2767 for (i = 0; i < elements; i += eltspersegment) { 2768 uint32_t m_idx = m[i + H4(index)]; 2769 2770 for (j = i; j < i + eltspersegment; j++) { 2771 d[j] = bfdotadd(a[j], n[j], m_idx); 2772 } 2773 } 2774 clear_tail(d, opr_sz, simd_maxsz(desc)); 2775 } 2776 2777 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc) 2778 { 2779 intptr_t s, opr_sz = simd_oprsz(desc); 2780 float32 *d = vd, *a = va; 2781 uint32_t *n = vn, *m = vm; 2782 2783 for (s = 0; s < opr_sz / 4; s += 4) { 2784 float32 sum00, sum01, sum10, sum11; 2785 2786 /* 2787 * Process the entire segment at once, writing back the 2788 * results only after we've consumed all of the inputs. 2789 * 2790 * Key to indices by column: 2791 * i j i k j k 2792 */ 2793 sum00 = a[s + H4(0 + 0)]; 2794 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]); 2795 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]); 2796 2797 sum01 = a[s + H4(0 + 1)]; 2798 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]); 2799 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]); 2800 2801 sum10 = a[s + H4(2 + 0)]; 2802 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]); 2803 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]); 2804 2805 sum11 = a[s + H4(2 + 1)]; 2806 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]); 2807 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]); 2808 2809 d[s + H4(0 + 0)] = sum00; 2810 d[s + H4(0 + 1)] = sum01; 2811 d[s + H4(2 + 0)] = sum10; 2812 d[s + H4(2 + 1)] = sum11; 2813 } 2814 clear_tail(d, opr_sz, simd_maxsz(desc)); 2815 } 2816 2817 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 2818 void *stat, uint32_t desc) 2819 { 2820 intptr_t i, opr_sz = simd_oprsz(desc); 2821 intptr_t sel = simd_data(desc); 2822 float32 *d = vd, *a = va; 2823 bfloat16 *n = vn, *m = vm; 2824 2825 for (i = 0; i < opr_sz / 4; ++i) { 2826 float32 nn = n[H2(i * 2 + sel)] << 16; 2827 float32 mm = m[H2(i * 2 + sel)] << 16; 2828 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 2829 } 2830 clear_tail(d, opr_sz, simd_maxsz(desc)); 2831 } 2832 2833 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 2834 void *va, void *stat, uint32_t desc) 2835 { 2836 intptr_t i, j, opr_sz = simd_oprsz(desc); 2837 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 2838 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 2839 intptr_t elements = opr_sz / 4; 2840 intptr_t eltspersegment = MIN(16 / 4, elements); 2841 float32 *d = vd, *a = va; 2842 bfloat16 *n = vn, *m = vm; 2843 2844 for (i = 0; i < elements; i += eltspersegment) { 2845 float32 m_idx = m[H2(2 * i + index)] << 16; 2846 2847 for (j = i; j < i + eltspersegment; j++) { 2848 float32 n_j = n[H2(2 * j + sel)] << 16; 2849 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 2850 } 2851 } 2852 clear_tail(d, opr_sz, simd_maxsz(desc)); 2853 } 2854 2855 #define DO_CLAMP(NAME, TYPE) \ 2856 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 2857 { \ 2858 intptr_t i, opr_sz = simd_oprsz(desc); \ 2859 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2860 TYPE aa = *(TYPE *)(a + i); \ 2861 TYPE nn = *(TYPE *)(n + i); \ 2862 TYPE mm = *(TYPE *)(m + i); \ 2863 TYPE dd = MIN(MAX(aa, nn), mm); \ 2864 *(TYPE *)(d + i) = dd; \ 2865 } \ 2866 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2867 } 2868 2869 DO_CLAMP(gvec_sclamp_b, int8_t) 2870 DO_CLAMP(gvec_sclamp_h, int16_t) 2871 DO_CLAMP(gvec_sclamp_s, int32_t) 2872 DO_CLAMP(gvec_sclamp_d, int64_t) 2873 2874 DO_CLAMP(gvec_uclamp_b, uint8_t) 2875 DO_CLAMP(gvec_uclamp_h, uint16_t) 2876 DO_CLAMP(gvec_uclamp_s, uint32_t) 2877 DO_CLAMP(gvec_uclamp_d, uint64_t) 2878