1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 
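* Each predicate bit governs one byte of the vector, so for half-word
* elements only the even-numbered bits are significant; entries whose
* index has any odd bit set (i & 0xaa) are never used and are skipped.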
133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 intptr_t elements = opr_sz / 2; 321 intptr_t eltspersegment = MIN(16 / 2, elements); 322 323 for (i = 0; i < elements; i += 16 / 2) { 324 int16_t mm = m[i]; 325 for (j = 0; j < eltspersegment; ++j) { 326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 327 } 328 } 329 clear_tail(d, opr_sz, simd_maxsz(desc)); 330 } 331 332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 333 void *vq, uint32_t desc) 334 { 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 intptr_t elements = opr_sz / 2; 339 intptr_t eltspersegment = MIN(16 / 2, elements); 340 341 for (i = 0; i < elements; i += 16 / 2) { 342 int16_t mm = m[i]; 343 for (j = 0; j < eltspersegment; ++j) { 344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 345 } 346 } 347 clear_tail(d, opr_sz, simd_maxsz(desc)); 348 } 349 350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm, 351 void *vq, uint32_t desc) 352 { 353 intptr_t i, j, opr_sz = simd_oprsz(desc); 354 int idx = simd_data(desc); 355 
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 356 intptr_t elements = opr_sz / 2; 357 intptr_t eltspersegment = MIN(16 / 2, elements); 358 359 for (i = 0; i < elements; i += 16 / 2) { 360 int16_t mm = m[i]; 361 for (j = 0; j < eltspersegment; ++j) { 362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq); 363 } 364 } 365 clear_tail(d, opr_sz, simd_maxsz(desc)); 366 } 367 368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm, 369 void *vq, uint32_t desc) 370 { 371 intptr_t i, j, opr_sz = simd_oprsz(desc); 372 int idx = simd_data(desc); 373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 374 intptr_t elements = opr_sz / 2; 375 intptr_t eltspersegment = MIN(16 / 2, elements); 376 377 for (i = 0; i < elements; i += 16 / 2) { 378 int16_t mm = m[i]; 379 for (j = 0; j < eltspersegment; ++j) { 380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq); 381 } 382 } 383 clear_tail(d, opr_sz, simd_maxsz(desc)); 384 } 385 386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 387 void *va, uint32_t desc) 388 { 389 intptr_t i, opr_sz = simd_oprsz(desc); 390 int16_t *d = vd, *n = vn, *m = vm, *a = va; 391 uint32_t discard; 392 393 for (i = 0; i < opr_sz / 2; ++i) { 394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 395 } 396 } 397 398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 399 void *va, uint32_t desc) 400 { 401 intptr_t i, opr_sz = simd_oprsz(desc); 402 int16_t *d = vd, *n = vn, *m = vm, *a = va; 403 uint32_t discard; 404 405 for (i = 0; i < opr_sz / 2; ++i) { 406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 407 } 408 } 409 410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 411 { 412 intptr_t i, opr_sz = simd_oprsz(desc); 413 int16_t *d = vd, *n = vn, *m = vm; 414 uint32_t discard; 415 416 for (i = 0; i < opr_sz / 2; ++i) { 417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 418 } 419 } 420 421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 422 { 423 intptr_t i, opr_sz = simd_oprsz(desc); 424 int16_t *d = vd, *n = vn, *m = vm; 425 uint32_t discard; 426 427 for (i = 0; i < opr_sz / 2; ++i) { 428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 429 } 430 } 431 432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 433 { 434 intptr_t i, j, opr_sz = simd_oprsz(desc); 435 int idx = simd_data(desc); 436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 437 uint32_t discard; 438 439 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 440 int16_t mm = m[i]; 441 for (j = 0; j < 16 / 2; ++j) { 442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 443 } 444 } 445 } 446 447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 448 { 449 intptr_t i, j, opr_sz = simd_oprsz(desc); 450 int idx = simd_data(desc); 451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 452 uint32_t discard; 453 454 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 455 int16_t mm = m[i]; 456 for (j = 0; j < 16 / 2; ++j) { 457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 458 } 459 } 460 } 461 462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 464 bool neg, bool round, uint32_t *sat) 465 { 466 /* Simplify similarly to do_sqrdmlah_b above. 
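* = ((a3 << 32) + ((e1 * e2) << 1) + (round << 31)) >> 32
* = ((a3 << 31) + (e1 * e2) + (round << 30)) >> 31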
*/ 467 int64_t ret = (int64_t)src1 * src2; 468 if (neg) { 469 ret = -ret; 470 } 471 ret += ((int64_t)src3 << 31) + (round << 30); 472 ret >>= 31; 473 474 if (ret != (int32_t)ret) { 475 *sat = 1; 476 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 477 } 478 return ret; 479 } 480 481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 482 int32_t src2, int32_t src3) 483 { 484 uint32_t *sat = &env->vfp.qc[0]; 485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 486 } 487 488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 489 void *vq, uint32_t desc) 490 { 491 uintptr_t opr_sz = simd_oprsz(desc); 492 int32_t *d = vd; 493 int32_t *n = vn; 494 int32_t *m = vm; 495 uintptr_t i; 496 497 for (i = 0; i < opr_sz / 4; ++i) { 498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 499 } 500 clear_tail(d, opr_sz, simd_maxsz(desc)); 501 } 502 503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 504 int32_t src2, int32_t src3) 505 { 506 uint32_t *sat = &env->vfp.qc[0]; 507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 508 } 509 510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 511 void *vq, uint32_t desc) 512 { 513 uintptr_t opr_sz = simd_oprsz(desc); 514 int32_t *d = vd; 515 int32_t *n = vn; 516 int32_t *m = vm; 517 uintptr_t i; 518 519 for (i = 0; i < opr_sz / 4; ++i) { 520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 521 } 522 clear_tail(d, opr_sz, simd_maxsz(desc)); 523 } 524 525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 526 void *vq, uint32_t desc) 527 { 528 intptr_t i, opr_sz = simd_oprsz(desc); 529 int32_t *d = vd, *n = vn, *m = vm; 530 531 for (i = 0; i < opr_sz / 4; ++i) { 532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 533 } 534 clear_tail(d, opr_sz, simd_maxsz(desc)); 535 } 536 537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 538 void *vq, uint32_t desc) 539 { 540 intptr_t i, opr_sz = simd_oprsz(desc); 541 int32_t *d = vd, *n = vn, *m = vm; 542 543 for (i = 0; i < opr_sz / 4; ++i) { 544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 545 } 546 clear_tail(d, opr_sz, simd_maxsz(desc)); 547 } 548 549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 550 void *vq, uint32_t desc) 551 { 552 intptr_t i, j, opr_sz = simd_oprsz(desc); 553 int idx = simd_data(desc); 554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 555 intptr_t elements = opr_sz / 4; 556 intptr_t eltspersegment = MIN(16 / 4, elements); 557 558 for (i = 0; i < elements; i += 16 / 4) { 559 int32_t mm = m[i]; 560 for (j = 0; j < eltspersegment; ++j) { 561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 562 } 563 } 564 clear_tail(d, opr_sz, simd_maxsz(desc)); 565 } 566 567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 568 void *vq, uint32_t desc) 569 { 570 intptr_t i, j, opr_sz = simd_oprsz(desc); 571 int idx = simd_data(desc); 572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 573 intptr_t elements = opr_sz / 4; 574 intptr_t eltspersegment = MIN(16 / 4, elements); 575 576 for (i = 0; i < elements; i += 16 / 4) { 577 int32_t mm = m[i]; 578 for (j = 0; j < eltspersegment; ++j) { 579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 580 } 581 } 582 clear_tail(d, opr_sz, simd_maxsz(desc)); 583 } 584 585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm, 586 void *vq, uint32_t desc) 587 { 588 intptr_t i, j, opr_sz = simd_oprsz(desc); 589 int idx = simd_data(desc); 590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 591 
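/* As in the other indexed helpers, the element selected by 'idx' is
 * read once per 128-bit segment of m and used for every element of n
 * within that segment. */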
intptr_t elements = opr_sz / 4; 592 intptr_t eltspersegment = MIN(16 / 4, elements); 593 594 for (i = 0; i < elements; i += 16 / 4) { 595 int32_t mm = m[i]; 596 for (j = 0; j < eltspersegment; ++j) { 597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq); 598 } 599 } 600 clear_tail(d, opr_sz, simd_maxsz(desc)); 601 } 602 603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm, 604 void *vq, uint32_t desc) 605 { 606 intptr_t i, j, opr_sz = simd_oprsz(desc); 607 int idx = simd_data(desc); 608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 609 intptr_t elements = opr_sz / 4; 610 intptr_t eltspersegment = MIN(16 / 4, elements); 611 612 for (i = 0; i < elements; i += 16 / 4) { 613 int32_t mm = m[i]; 614 for (j = 0; j < eltspersegment; ++j) { 615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq); 616 } 617 } 618 clear_tail(d, opr_sz, simd_maxsz(desc)); 619 } 620 621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 622 void *va, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int32_t *d = vd, *n = vn, *m = vm, *a = va; 626 uint32_t discard; 627 628 for (i = 0; i < opr_sz / 4; ++i) { 629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 630 } 631 } 632 633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 634 void *va, uint32_t desc) 635 { 636 intptr_t i, opr_sz = simd_oprsz(desc); 637 int32_t *d = vd, *n = vn, *m = vm, *a = va; 638 uint32_t discard; 639 640 for (i = 0; i < opr_sz / 4; ++i) { 641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 642 } 643 } 644 645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 646 { 647 intptr_t i, opr_sz = simd_oprsz(desc); 648 int32_t *d = vd, *n = vn, *m = vm; 649 uint32_t discard; 650 651 for (i = 0; i < opr_sz / 4; ++i) { 652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 653 } 654 } 655 656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 657 { 658 intptr_t i, opr_sz = simd_oprsz(desc); 659 int32_t *d = vd, *n = vn, *m = vm; 660 uint32_t discard; 661 662 for (i = 0; i < opr_sz / 4; ++i) { 663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 664 } 665 } 666 667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 668 { 669 intptr_t i, j, opr_sz = simd_oprsz(desc); 670 int idx = simd_data(desc); 671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 672 uint32_t discard; 673 674 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 675 int32_t mm = m[i]; 676 for (j = 0; j < 16 / 4; ++j) { 677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 678 } 679 } 680 } 681 682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 683 { 684 intptr_t i, j, opr_sz = simd_oprsz(desc); 685 int idx = simd_data(desc); 686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 687 uint32_t discard; 688 689 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 690 int32_t mm = m[i]; 691 for (j = 0; j < 16 / 4; ++j) { 692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 693 } 694 } 695 } 696 697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 698 static int64_t do_sat128_d(Int128 r) 699 { 700 int64_t ls = int128_getlo(r); 701 int64_t hs = int128_gethi(r); 702 703 if (unlikely(hs != (ls >> 63))) { 704 return hs < 0 ? 
INT64_MIN : INT64_MAX; 705 } 706 return ls; 707 } 708 709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 710 { 711 uint64_t l, h; 712 Int128 r, t; 713 714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 715 muls64(&l, &h, m, n); 716 r = int128_make128(l, h); 717 if (neg) { 718 r = int128_neg(r); 719 } 720 if (a) { 721 t = int128_exts64(a); 722 t = int128_lshift(t, 63); 723 r = int128_add(r, t); 724 } 725 if (round) { 726 t = int128_exts64(1ll << 62); 727 r = int128_add(r, t); 728 } 729 r = int128_rshift(r, 63); 730 731 return do_sat128_d(r); 732 } 733 734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 735 void *va, uint32_t desc) 736 { 737 intptr_t i, opr_sz = simd_oprsz(desc); 738 int64_t *d = vd, *n = vn, *m = vm, *a = va; 739 740 for (i = 0; i < opr_sz / 8; ++i) { 741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 742 } 743 } 744 745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 746 void *va, uint32_t desc) 747 { 748 intptr_t i, opr_sz = simd_oprsz(desc); 749 int64_t *d = vd, *n = vn, *m = vm, *a = va; 750 751 for (i = 0; i < opr_sz / 8; ++i) { 752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 753 } 754 } 755 756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 757 { 758 intptr_t i, opr_sz = simd_oprsz(desc); 759 int64_t *d = vd, *n = vn, *m = vm; 760 761 for (i = 0; i < opr_sz / 8; ++i) { 762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 763 } 764 } 765 766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 767 { 768 intptr_t i, opr_sz = simd_oprsz(desc); 769 int64_t *d = vd, *n = vn, *m = vm; 770 771 for (i = 0; i < opr_sz / 8; ++i) { 772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 773 } 774 } 775 776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 777 { 778 intptr_t i, j, opr_sz = simd_oprsz(desc); 779 int idx = simd_data(desc); 780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 781 782 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 783 int64_t mm = m[i]; 784 for (j = 0; j < 16 / 8; ++j) { 785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 786 } 787 } 788 } 789 790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 791 { 792 intptr_t i, j, opr_sz = simd_oprsz(desc); 793 int idx = simd_data(desc); 794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 795 796 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 797 int64_t mm = m[i]; 798 for (j = 0; j < 16 / 8; ++j) { 799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 800 } 801 } 802 } 803 804 /* Integer 8 and 16-bit dot-product. 805 * 806 * Note that for the loops herein, host endianness does not matter 807 * with respect to the ordering of data within the quad-width lanes. 808 * All elements are treated equally, no matter where they are. 
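* Each destination element d[i] accumulates a[i] plus the four products
* n[4*i + k] * m[4*i + k] for k = 0..3, as written out in DO_DOT below.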
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 void *vfpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 float_status *fpst = vfpst; 883 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 884 uint32_t neg_imag = neg_real ^ 1; 885 uintptr_t i; 886 887 /* Shift boolean to the sign bit so we can xor to negate. 
*/ 888 neg_real <<= 15; 889 neg_imag <<= 15; 890 891 for (i = 0; i < opr_sz / 2; i += 2) { 892 float16 e0 = n[H2(i)]; 893 float16 e1 = m[H2(i + 1)] ^ neg_imag; 894 float16 e2 = n[H2(i + 1)]; 895 float16 e3 = m[H2(i)] ^ neg_real; 896 897 d[H2(i)] = float16_add(e0, e1, fpst); 898 d[H2(i + 1)] = float16_add(e2, e3, fpst); 899 } 900 clear_tail(d, opr_sz, simd_maxsz(desc)); 901 } 902 903 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 904 void *vfpst, uint32_t desc) 905 { 906 uintptr_t opr_sz = simd_oprsz(desc); 907 float32 *d = vd; 908 float32 *n = vn; 909 float32 *m = vm; 910 float_status *fpst = vfpst; 911 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 912 uint32_t neg_imag = neg_real ^ 1; 913 uintptr_t i; 914 915 /* Shift boolean to the sign bit so we can xor to negate. */ 916 neg_real <<= 31; 917 neg_imag <<= 31; 918 919 for (i = 0; i < opr_sz / 4; i += 2) { 920 float32 e0 = n[H4(i)]; 921 float32 e1 = m[H4(i + 1)] ^ neg_imag; 922 float32 e2 = n[H4(i + 1)]; 923 float32 e3 = m[H4(i)] ^ neg_real; 924 925 d[H4(i)] = float32_add(e0, e1, fpst); 926 d[H4(i + 1)] = float32_add(e2, e3, fpst); 927 } 928 clear_tail(d, opr_sz, simd_maxsz(desc)); 929 } 930 931 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 932 void *vfpst, uint32_t desc) 933 { 934 uintptr_t opr_sz = simd_oprsz(desc); 935 float64 *d = vd; 936 float64 *n = vn; 937 float64 *m = vm; 938 float_status *fpst = vfpst; 939 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 940 uint64_t neg_imag = neg_real ^ 1; 941 uintptr_t i; 942 943 /* Shift boolean to the sign bit so we can xor to negate. */ 944 neg_real <<= 63; 945 neg_imag <<= 63; 946 947 for (i = 0; i < opr_sz / 8; i += 2) { 948 float64 e0 = n[i]; 949 float64 e1 = m[i + 1] ^ neg_imag; 950 float64 e2 = n[i + 1]; 951 float64 e3 = m[i] ^ neg_real; 952 953 d[i] = float64_add(e0, e1, fpst); 954 d[i + 1] = float64_add(e2, e3, fpst); 955 } 956 clear_tail(d, opr_sz, simd_maxsz(desc)); 957 } 958 959 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 960 void *vfpst, uint32_t desc) 961 { 962 uintptr_t opr_sz = simd_oprsz(desc); 963 float16 *d = vd, *n = vn, *m = vm, *a = va; 964 float_status *fpst = vfpst; 965 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 966 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 967 uint32_t neg_real = flip ^ neg_imag; 968 uintptr_t i; 969 970 /* Shift boolean to the sign bit so we can xor to negate. */ 971 neg_real <<= 15; 972 neg_imag <<= 15; 973 974 for (i = 0; i < opr_sz / 2; i += 2) { 975 float16 e2 = n[H2(i + flip)]; 976 float16 e1 = m[H2(i + flip)] ^ neg_real; 977 float16 e4 = e2; 978 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 979 980 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 981 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 982 } 983 clear_tail(d, opr_sz, simd_maxsz(desc)); 984 } 985 986 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 987 void *vfpst, uint32_t desc) 988 { 989 uintptr_t opr_sz = simd_oprsz(desc); 990 float16 *d = vd, *n = vn, *m = vm, *a = va; 991 float_status *fpst = vfpst; 992 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 993 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 994 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 995 uint32_t neg_real = flip ^ neg_imag; 996 intptr_t elements = opr_sz / sizeof(float16); 997 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 998 intptr_t i, j; 999 1000 /* Shift boolean to the sign bit so we can xor to negate. 
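* The indexed complex pair (mr, mi) is read from each 128-bit segment
* of m and reused for every complex element of n within that segment.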
*/ 1001 neg_real <<= 15; 1002 neg_imag <<= 15; 1003 1004 for (i = 0; i < elements; i += eltspersegment) { 1005 float16 mr = m[H2(i + 2 * index + 0)]; 1006 float16 mi = m[H2(i + 2 * index + 1)]; 1007 float16 e1 = neg_real ^ (flip ? mi : mr); 1008 float16 e3 = neg_imag ^ (flip ? mr : mi); 1009 1010 for (j = i; j < i + eltspersegment; j += 2) { 1011 float16 e2 = n[H2(j + flip)]; 1012 float16 e4 = e2; 1013 1014 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 1015 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 1016 } 1017 } 1018 clear_tail(d, opr_sz, simd_maxsz(desc)); 1019 } 1020 1021 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 1022 void *vfpst, uint32_t desc) 1023 { 1024 uintptr_t opr_sz = simd_oprsz(desc); 1025 float32 *d = vd, *n = vn, *m = vm, *a = va; 1026 float_status *fpst = vfpst; 1027 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1028 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1029 uint32_t neg_real = flip ^ neg_imag; 1030 uintptr_t i; 1031 1032 /* Shift boolean to the sign bit so we can xor to negate. */ 1033 neg_real <<= 31; 1034 neg_imag <<= 31; 1035 1036 for (i = 0; i < opr_sz / 4; i += 2) { 1037 float32 e2 = n[H4(i + flip)]; 1038 float32 e1 = m[H4(i + flip)] ^ neg_real; 1039 float32 e4 = e2; 1040 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 1041 1042 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 1043 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 1044 } 1045 clear_tail(d, opr_sz, simd_maxsz(desc)); 1046 } 1047 1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 1049 void *vfpst, uint32_t desc) 1050 { 1051 uintptr_t opr_sz = simd_oprsz(desc); 1052 float32 *d = vd, *n = vn, *m = vm, *a = va; 1053 float_status *fpst = vfpst; 1054 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1055 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1056 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1057 uint32_t neg_real = flip ^ neg_imag; 1058 intptr_t elements = opr_sz / sizeof(float32); 1059 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 1060 intptr_t i, j; 1061 1062 /* Shift boolean to the sign bit so we can xor to negate. */ 1063 neg_real <<= 31; 1064 neg_imag <<= 31; 1065 1066 for (i = 0; i < elements; i += eltspersegment) { 1067 float32 mr = m[H4(i + 2 * index + 0)]; 1068 float32 mi = m[H4(i + 2 * index + 1)]; 1069 float32 e1 = neg_real ^ (flip ? mi : mr); 1070 float32 e3 = neg_imag ^ (flip ? mr : mi); 1071 1072 for (j = i; j < i + eltspersegment; j += 2) { 1073 float32 e2 = n[H4(j + flip)]; 1074 float32 e4 = e2; 1075 1076 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 1077 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 1078 } 1079 } 1080 clear_tail(d, opr_sz, simd_maxsz(desc)); 1081 } 1082 1083 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 1084 void *vfpst, uint32_t desc) 1085 { 1086 uintptr_t opr_sz = simd_oprsz(desc); 1087 float64 *d = vd, *n = vn, *m = vm, *a = va; 1088 float_status *fpst = vfpst; 1089 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1090 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1091 uint64_t neg_real = flip ^ neg_imag; 1092 uintptr_t i; 1093 1094 /* Shift boolean to the sign bit so we can xor to negate. 
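* For FCMLA, 'flip' selects whether the multiplicand is the real or the
* imaginary lane of n, while neg_real/neg_imag apply the sign pattern
* required by the rotation (0, 90, 180 or 270 degrees).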
*/ 1095 neg_real <<= 63; 1096 neg_imag <<= 63; 1097 1098 for (i = 0; i < opr_sz / 8; i += 2) { 1099 float64 e2 = n[i + flip]; 1100 float64 e1 = m[i + flip] ^ neg_real; 1101 float64 e4 = e2; 1102 float64 e3 = m[i + 1 - flip] ^ neg_imag; 1103 1104 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 1105 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 1106 } 1107 clear_tail(d, opr_sz, simd_maxsz(desc)); 1108 } 1109 1110 /* 1111 * Floating point comparisons producing an integer result (all 1s or all 0s). 1112 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1113 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1114 */ 1115 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1116 { 1117 return -float16_eq_quiet(op1, op2, stat); 1118 } 1119 1120 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1121 { 1122 return -float32_eq_quiet(op1, op2, stat); 1123 } 1124 1125 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1126 { 1127 return -float64_eq_quiet(op1, op2, stat); 1128 } 1129 1130 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1131 { 1132 return -float16_le(op2, op1, stat); 1133 } 1134 1135 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1136 { 1137 return -float32_le(op2, op1, stat); 1138 } 1139 1140 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1141 { 1142 return -float64_le(op2, op1, stat); 1143 } 1144 1145 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1146 { 1147 return -float16_lt(op2, op1, stat); 1148 } 1149 1150 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1151 { 1152 return -float32_lt(op2, op1, stat); 1153 } 1154 1155 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1156 { 1157 return -float64_lt(op2, op1, stat); 1158 } 1159 1160 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1161 { 1162 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1163 } 1164 1165 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1166 { 1167 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1168 } 1169 1170 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1171 { 1172 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1173 } 1174 1175 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1176 { 1177 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1178 } 1179 1180 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1181 { 1182 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1183 } 1184 1185 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1186 { 1187 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1188 } 1189 1190 static int16_t vfp_tosszh(float16 x, void *fpstp) 1191 { 1192 float_status *fpst = fpstp; 1193 if (float16_is_any_nan(x)) { 1194 float_raise(float_flag_invalid, fpst); 1195 return 0; 1196 } 1197 return float16_to_int16_round_to_zero(x, fpst); 1198 } 1199 1200 static uint16_t vfp_touszh(float16 x, void *fpstp) 1201 { 1202 float_status *fpst = fpstp; 1203 if (float16_is_any_nan(x)) { 1204 float_raise(float_flag_invalid, fpst); 1205 return 0; 1206 } 1207 return float16_to_uint16_round_to_zero(x, fpst); 1208 } 1209 1210 #define DO_2OP(NAME, FUNC, TYPE) \ 1211 void HELPER(NAME)(void *vd, void *vn, void *stat, 
uint32_t desc) \ 1212 { \ 1213 intptr_t i, oprsz = simd_oprsz(desc); \ 1214 TYPE *d = vd, *n = vn; \ 1215 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1216 d[i] = FUNC(n[i], stat); \ 1217 } \ 1218 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1219 } 1220 1221 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1222 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1223 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1224 1225 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1226 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1227 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1228 1229 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1230 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1231 1232 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1233 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1234 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1235 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1236 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1237 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1238 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1239 DO_2OP(gvec_touszh, vfp_touszh, float16) 1240 1241 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1242 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1243 { \ 1244 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1245 } 1246 1247 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1248 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1249 { \ 1250 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1251 } 1252 1253 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1254 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1255 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1256 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1257 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) 1258 1259 DO_2OP_CMP0(cgt, cgt, FWD) 1260 DO_2OP_CMP0(cge, cge, FWD) 1261 DO_2OP_CMP0(ceq, ceq, FWD) 1262 DO_2OP_CMP0(clt, cgt, REV) 1263 DO_2OP_CMP0(cle, cge, REV) 1264 1265 #undef DO_2OP 1266 #undef DO_2OP_CMP0 1267 1268 /* Floating-point trigonometric starting value. 1269 * See the ARM ARM pseudocode function FPTrigSMul. 1270 */ 1271 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1272 { 1273 float16 result = float16_mul(op1, op1, stat); 1274 if (!float16_is_any_nan(result)) { 1275 result = float16_set_sign(result, op2 & 1); 1276 } 1277 return result; 1278 } 1279 1280 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1281 { 1282 float32 result = float32_mul(op1, op1, stat); 1283 if (!float32_is_any_nan(result)) { 1284 result = float32_set_sign(result, op2 & 1); 1285 } 1286 return result; 1287 } 1288 1289 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1290 { 1291 float64 result = float64_mul(op1, op1, stat); 1292 if (!float64_is_any_nan(result)) { 1293 result = float64_set_sign(result, op2 & 1); 1294 } 1295 return result; 1296 } 1297 1298 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1299 { 1300 return float16_abs(float16_sub(op1, op2, stat)); 1301 } 1302 1303 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1304 { 1305 return float32_abs(float32_sub(op1, op2, stat)); 1306 } 1307 1308 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1309 { 1310 return float64_abs(float64_sub(op1, op2, stat)); 1311 } 1312 1313 /* 1314 * Reciprocal step. These are the AArch32 version which uses a 1315 * non-fused multiply-and-subtract. 
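* Apart from the special case of (infinity, zero), which returns 2.0,
* the result is 2.0 - (op1 * op2) computed with a separate multiply
* and subtract rather than a fused multiply-add.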
1316 */ 1317 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1318 { 1319 op1 = float16_squash_input_denormal(op1, stat); 1320 op2 = float16_squash_input_denormal(op2, stat); 1321 1322 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1323 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1324 return float16_two; 1325 } 1326 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1327 } 1328 1329 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1330 { 1331 op1 = float32_squash_input_denormal(op1, stat); 1332 op2 = float32_squash_input_denormal(op2, stat); 1333 1334 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1335 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1336 return float32_two; 1337 } 1338 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1339 } 1340 1341 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1342 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1343 { 1344 op1 = float16_squash_input_denormal(op1, stat); 1345 op2 = float16_squash_input_denormal(op2, stat); 1346 1347 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1348 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1349 return float16_one_point_five; 1350 } 1351 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1352 return float16_div(op1, float16_two, stat); 1353 } 1354 1355 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1356 { 1357 op1 = float32_squash_input_denormal(op1, stat); 1358 op2 = float32_squash_input_denormal(op2, stat); 1359 1360 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1361 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1362 return float32_one_point_five; 1363 } 1364 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1365 return float32_div(op1, float32_two, stat); 1366 } 1367 1368 #define DO_3OP(NAME, FUNC, TYPE) \ 1369 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1370 { \ 1371 intptr_t i, oprsz = simd_oprsz(desc); \ 1372 TYPE *d = vd, *n = vn, *m = vm; \ 1373 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1374 d[i] = FUNC(n[i], m[i], stat); \ 1375 } \ 1376 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1377 } 1378 1379 DO_3OP(gvec_fadd_h, float16_add, float16) 1380 DO_3OP(gvec_fadd_s, float32_add, float32) 1381 DO_3OP(gvec_fadd_d, float64_add, float64) 1382 1383 DO_3OP(gvec_fsub_h, float16_sub, float16) 1384 DO_3OP(gvec_fsub_s, float32_sub, float32) 1385 DO_3OP(gvec_fsub_d, float64_sub, float64) 1386 1387 DO_3OP(gvec_fmul_h, float16_mul, float16) 1388 DO_3OP(gvec_fmul_s, float32_mul, float32) 1389 DO_3OP(gvec_fmul_d, float64_mul, float64) 1390 1391 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1392 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1393 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1394 1395 DO_3OP(gvec_fabd_h, float16_abd, float16) 1396 DO_3OP(gvec_fabd_s, float32_abd, float32) 1397 DO_3OP(gvec_fabd_d, float64_abd, float64) 1398 1399 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1400 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1401 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1402 1403 DO_3OP(gvec_fcge_h, float16_cge, float16) 1404 DO_3OP(gvec_fcge_s, float32_cge, float32) 1405 DO_3OP(gvec_fcge_d, float64_cge, float64) 1406 1407 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1408 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1409 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1410 1411 DO_3OP(gvec_facge_h, 
float16_acge, float16) 1412 DO_3OP(gvec_facge_s, float32_acge, float32) 1413 DO_3OP(gvec_facge_d, float64_acge, float64) 1414 1415 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1416 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1417 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1418 1419 DO_3OP(gvec_fmax_h, float16_max, float16) 1420 DO_3OP(gvec_fmax_s, float32_max, float32) 1421 DO_3OP(gvec_fmax_d, float64_max, float64) 1422 1423 DO_3OP(gvec_fmin_h, float16_min, float16) 1424 DO_3OP(gvec_fmin_s, float32_min, float32) 1425 DO_3OP(gvec_fmin_d, float64_min, float64) 1426 1427 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1428 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1429 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1430 1431 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1432 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1433 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1434 1435 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1436 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1437 1438 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1439 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1440 1441 #ifdef TARGET_AARCH64 1442 DO_3OP(gvec_fdiv_h, float16_div, float16) 1443 DO_3OP(gvec_fdiv_s, float32_div, float32) 1444 DO_3OP(gvec_fdiv_d, float64_div, float64) 1445 1446 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1447 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1448 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1449 1450 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1451 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1452 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1453 1454 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1455 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1456 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1457 1458 #endif 1459 #undef DO_3OP 1460 1461 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1462 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1463 float_status *stat) 1464 { 1465 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1466 } 1467 1468 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1469 float_status *stat) 1470 { 1471 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1472 } 1473 1474 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1475 float_status *stat) 1476 { 1477 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1478 } 1479 1480 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1481 float_status *stat) 1482 { 1483 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1484 } 1485 1486 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1487 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1488 float_status *stat) 1489 { 1490 return float16_muladd(op1, op2, dest, 0, stat); 1491 } 1492 1493 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1494 float_status *stat) 1495 { 1496 return float32_muladd(op1, op2, dest, 0, stat); 1497 } 1498 1499 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1500 float_status *stat) 1501 { 1502 return float64_muladd(op1, op2, dest, 0, stat); 1503 } 1504 1505 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1506 float_status *stat) 1507 { 1508 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1509 } 1510 1511 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1512 
float_status *stat) 1513 { 1514 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1515 } 1516 1517 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1518 float_status *stat) 1519 { 1520 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1521 } 1522 1523 #define DO_MULADD(NAME, FUNC, TYPE) \ 1524 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1525 { \ 1526 intptr_t i, oprsz = simd_oprsz(desc); \ 1527 TYPE *d = vd, *n = vn, *m = vm; \ 1528 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1529 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1530 } \ 1531 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1532 } 1533 1534 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1535 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1536 1537 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1538 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1539 1540 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1541 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1542 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1543 1544 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1545 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1546 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1547 1548 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1549 * For AdvSIMD, there is of course only one such vector segment. 1550 */ 1551 1552 #define DO_MUL_IDX(NAME, TYPE, H) \ 1553 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1554 { \ 1555 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1556 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1557 intptr_t idx = simd_data(desc); \ 1558 TYPE *d = vd, *n = vn, *m = vm; \ 1559 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1560 TYPE mm = m[H(i + idx)]; \ 1561 for (j = 0; j < segment; j++) { \ 1562 d[i + j] = n[i + j] * mm; \ 1563 } \ 1564 } \ 1565 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1566 } 1567 1568 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1569 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1570 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1571 1572 #undef DO_MUL_IDX 1573 1574 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1575 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1576 { \ 1577 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1578 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1579 intptr_t idx = simd_data(desc); \ 1580 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1581 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1582 TYPE mm = m[H(i + idx)]; \ 1583 for (j = 0; j < segment; j++) { \ 1584 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1585 } \ 1586 } \ 1587 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1588 } 1589 1590 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1591 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1592 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1593 1594 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1595 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1596 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1597 1598 #undef DO_MLA_IDX 1599 1600 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1601 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 1602 { \ 1603 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1604 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1605 intptr_t idx = simd_data(desc); \ 1606 TYPE *d = vd, *n = vn, *m = vm; \ 1607 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1608 TYPE mm = m[H(i + idx)]; \ 1609 for (j = 0; j < segment; j++) { \ 1610 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), 
stat); \ 1611 } \ 1612 } \ 1613 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1614 } 1615 1616 #define nop(N, M, S) (M) 1617 1618 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1619 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1620 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1621 1622 #ifdef TARGET_AARCH64 1623 1624 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1625 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1626 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1627 1628 #endif 1629 1630 #undef nop 1631 1632 /* 1633 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1634 * the fused ops below they assume accumulate both from and into Vd. 1635 */ 1636 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1637 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1638 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1639 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1640 1641 #undef DO_FMUL_IDX 1642 1643 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1644 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1645 void *stat, uint32_t desc) \ 1646 { \ 1647 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1648 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1649 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1650 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1651 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1652 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1653 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1654 TYPE mm = m[H(i + idx)]; \ 1655 for (j = 0; j < segment; j++) { \ 1656 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1657 mm, a[i + j], 0, stat); \ 1658 } \ 1659 } \ 1660 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1661 } 1662 1663 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1664 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1665 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1666 1667 #undef DO_FMLA_IDX 1668 1669 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1670 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1671 { \ 1672 intptr_t i, oprsz = simd_oprsz(desc); \ 1673 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1674 bool q = false; \ 1675 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1676 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1677 if (dd < MIN) { \ 1678 dd = MIN; \ 1679 q = true; \ 1680 } else if (dd > MAX) { \ 1681 dd = MAX; \ 1682 q = true; \ 1683 } \ 1684 d[i] = dd; \ 1685 } \ 1686 if (q) { \ 1687 uint32_t *qc = vq; \ 1688 qc[0] = 1; \ 1689 } \ 1690 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1691 } 1692 1693 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1694 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1695 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1696 1697 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1698 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1699 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1700 1701 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1702 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1703 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1704 1705 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1706 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1707 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, 
INT32_MAX) 1708 1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1712 1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1716 1717 #undef DO_SAT 1718 1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1720 void *vm, uint32_t desc) 1721 { 1722 intptr_t i, oprsz = simd_oprsz(desc); 1723 uint64_t *d = vd, *n = vn, *m = vm; 1724 bool q = false; 1725 1726 for (i = 0; i < oprsz / 8; i++) { 1727 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1728 if (dd < nn) { 1729 dd = UINT64_MAX; 1730 q = true; 1731 } 1732 d[i] = dd; 1733 } 1734 if (q) { 1735 uint32_t *qc = vq; 1736 qc[0] = 1; 1737 } 1738 clear_tail(d, oprsz, simd_maxsz(desc)); 1739 } 1740 1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1742 void *vm, uint32_t desc) 1743 { 1744 intptr_t i, oprsz = simd_oprsz(desc); 1745 uint64_t *d = vd, *n = vn, *m = vm; 1746 bool q = false; 1747 1748 for (i = 0; i < oprsz / 8; i++) { 1749 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1750 if (nn < mm) { 1751 dd = 0; 1752 q = true; 1753 } 1754 d[i] = dd; 1755 } 1756 if (q) { 1757 uint32_t *qc = vq; 1758 qc[0] = 1; 1759 } 1760 clear_tail(d, oprsz, simd_maxsz(desc)); 1761 } 1762 1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1764 void *vm, uint32_t desc) 1765 { 1766 intptr_t i, oprsz = simd_oprsz(desc); 1767 int64_t *d = vd, *n = vn, *m = vm; 1768 bool q = false; 1769 1770 for (i = 0; i < oprsz / 8; i++) { 1771 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1772 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1773 dd = (nn >> 63) ^ ~INT64_MIN; 1774 q = true; 1775 } 1776 d[i] = dd; 1777 } 1778 if (q) { 1779 uint32_t *qc = vq; 1780 qc[0] = 1; 1781 } 1782 clear_tail(d, oprsz, simd_maxsz(desc)); 1783 } 1784 1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1786 void *vm, uint32_t desc) 1787 { 1788 intptr_t i, oprsz = simd_oprsz(desc); 1789 int64_t *d = vd, *n = vn, *m = vm; 1790 bool q = false; 1791 1792 for (i = 0; i < oprsz / 8; i++) { 1793 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1794 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1795 dd = (nn >> 63) ^ ~INT64_MIN; 1796 q = true; 1797 } 1798 d[i] = dd; 1799 } 1800 if (q) { 1801 uint32_t *qc = vq; 1802 qc[0] = 1; 1803 } 1804 clear_tail(d, oprsz, simd_maxsz(desc)); 1805 } 1806 1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1808 void *vm, uint32_t desc) 1809 { 1810 intptr_t i, oprsz = simd_oprsz(desc); 1811 uint64_t *d = vd, *n = vn, *m = vm; 1812 bool q = false; 1813 1814 for (i = 0; i < oprsz / 8; i++) { 1815 uint64_t nn = n[i]; 1816 int64_t mm = m[i]; 1817 uint64_t dd = nn + mm; 1818 1819 if (mm < 0) { 1820 if (nn < (uint64_t)-mm) { 1821 dd = 0; 1822 q = true; 1823 } 1824 } else { 1825 if (dd < nn) { 1826 dd = UINT64_MAX; 1827 q = true; 1828 } 1829 } 1830 d[i] = dd; 1831 } 1832 if (q) { 1833 uint32_t *qc = vq; 1834 qc[0] = 1; 1835 } 1836 clear_tail(d, oprsz, simd_maxsz(desc)); 1837 } 1838 1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1840 void *vm, uint32_t desc) 1841 { 1842 intptr_t i, oprsz = simd_oprsz(desc); 1843 uint64_t *d = vd, *n = vn, *m = vm; 1844 bool q = false; 1845 1846 for (i = 0; i < oprsz / 8; i++) { 1847 int64_t nn = n[i]; 1848 uint64_t mm = m[i]; 1849 int64_t dd = nn + mm; 1850 1851 if (mm > 
(uint64_t)(INT64_MAX - nn)) { 1852 dd = INT64_MAX; 1853 q = true; 1854 } 1855 d[i] = dd; 1856 } 1857 if (q) { 1858 uint32_t *qc = vq; 1859 qc[0] = 1; 1860 } 1861 clear_tail(d, oprsz, simd_maxsz(desc)); 1862 } 1863 1864 #define DO_SRA(NAME, TYPE) \ 1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1866 { \ 1867 intptr_t i, oprsz = simd_oprsz(desc); \ 1868 int shift = simd_data(desc); \ 1869 TYPE *d = vd, *n = vn; \ 1870 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1871 d[i] += n[i] >> shift; \ 1872 } \ 1873 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1874 } 1875 1876 DO_SRA(gvec_ssra_b, int8_t) 1877 DO_SRA(gvec_ssra_h, int16_t) 1878 DO_SRA(gvec_ssra_s, int32_t) 1879 DO_SRA(gvec_ssra_d, int64_t) 1880 1881 DO_SRA(gvec_usra_b, uint8_t) 1882 DO_SRA(gvec_usra_h, uint16_t) 1883 DO_SRA(gvec_usra_s, uint32_t) 1884 DO_SRA(gvec_usra_d, uint64_t) 1885 1886 #undef DO_SRA 1887 1888 #define DO_RSHR(NAME, TYPE) \ 1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1890 { \ 1891 intptr_t i, oprsz = simd_oprsz(desc); \ 1892 int shift = simd_data(desc); \ 1893 TYPE *d = vd, *n = vn; \ 1894 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1895 TYPE tmp = n[i] >> (shift - 1); \ 1896 d[i] = (tmp >> 1) + (tmp & 1); \ 1897 } \ 1898 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1899 } 1900 1901 DO_RSHR(gvec_srshr_b, int8_t) 1902 DO_RSHR(gvec_srshr_h, int16_t) 1903 DO_RSHR(gvec_srshr_s, int32_t) 1904 DO_RSHR(gvec_srshr_d, int64_t) 1905 1906 DO_RSHR(gvec_urshr_b, uint8_t) 1907 DO_RSHR(gvec_urshr_h, uint16_t) 1908 DO_RSHR(gvec_urshr_s, uint32_t) 1909 DO_RSHR(gvec_urshr_d, uint64_t) 1910 1911 #undef DO_RSHR 1912 1913 #define DO_RSRA(NAME, TYPE) \ 1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1915 { \ 1916 intptr_t i, oprsz = simd_oprsz(desc); \ 1917 int shift = simd_data(desc); \ 1918 TYPE *d = vd, *n = vn; \ 1919 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1920 TYPE tmp = n[i] >> (shift - 1); \ 1921 d[i] += (tmp >> 1) + (tmp & 1); \ 1922 } \ 1923 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1924 } 1925 1926 DO_RSRA(gvec_srsra_b, int8_t) 1927 DO_RSRA(gvec_srsra_h, int16_t) 1928 DO_RSRA(gvec_srsra_s, int32_t) 1929 DO_RSRA(gvec_srsra_d, int64_t) 1930 1931 DO_RSRA(gvec_ursra_b, uint8_t) 1932 DO_RSRA(gvec_ursra_h, uint16_t) 1933 DO_RSRA(gvec_ursra_s, uint32_t) 1934 DO_RSRA(gvec_ursra_d, uint64_t) 1935 1936 #undef DO_RSRA 1937 1938 #define DO_SRI(NAME, TYPE) \ 1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1940 { \ 1941 intptr_t i, oprsz = simd_oprsz(desc); \ 1942 int shift = simd_data(desc); \ 1943 TYPE *d = vd, *n = vn; \ 1944 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1945 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1946 } \ 1947 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1948 } 1949 1950 DO_SRI(gvec_sri_b, uint8_t) 1951 DO_SRI(gvec_sri_h, uint16_t) 1952 DO_SRI(gvec_sri_s, uint32_t) 1953 DO_SRI(gvec_sri_d, uint64_t) 1954 1955 #undef DO_SRI 1956 1957 #define DO_SLI(NAME, TYPE) \ 1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1959 { \ 1960 intptr_t i, oprsz = simd_oprsz(desc); \ 1961 int shift = simd_data(desc); \ 1962 TYPE *d = vd, *n = vn; \ 1963 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1964 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1965 } \ 1966 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1967 } 1968 1969 DO_SLI(gvec_sli_b, uint8_t) 1970 DO_SLI(gvec_sli_h, uint16_t) 1971 DO_SLI(gvec_sli_s, uint32_t) 1972 DO_SLI(gvec_sli_d, uint64_t) 1973 1974 #undef DO_SLI 1975 1976 /* 1977 * Convert float16 to float32, raising 
no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
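     *
     * For example, flipping bit 15 of each packed half negates that
     * lane, equivalent to applying float16_chs() to all four lanes at
     * once:
     *   0x3c00 (+1.0) ^ 0x8000 == 0xbc00 (-1.0)
     * The sign flip is exact and raises no exceptions, so it is safe
     * to apply before the by-bits conversion to float32.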
*/ 2052 if (is_s) { 2053 n_4 ^= 0x8000800080008000ull; 2054 } 2055 2056 for (i = 0; i < oprsz / 4; i++) { 2057 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2058 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2059 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2060 } 2061 clear_tail(d, oprsz, simd_maxsz(desc)); 2062 } 2063 2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2065 void *venv, uint32_t desc) 2066 { 2067 CPUARMState *env = venv; 2068 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2069 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2070 } 2071 2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2073 void *venv, uint32_t desc) 2074 { 2075 CPUARMState *env = venv; 2076 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 2077 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2078 } 2079 2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2081 void *venv, uint32_t desc) 2082 { 2083 intptr_t i, oprsz = simd_oprsz(desc); 2084 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2085 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2086 CPUARMState *env = venv; 2087 float_status *status = &env->vfp.fp_status; 2088 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2089 2090 for (i = 0; i < oprsz; i += sizeof(float32)) { 2091 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2092 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2093 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2094 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2095 float32 aa = *(float32 *)(va + H1_4(i)); 2096 2097 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2098 } 2099 } 2100 2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2102 uint32_t desc, bool fz16) 2103 { 2104 intptr_t i, oprsz = simd_oprsz(desc); 2105 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2106 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2107 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2108 int is_q = oprsz == 16; 2109 uint64_t n_4; 2110 float32 m_1; 2111 2112 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2113 n_4 = load4_f16(vn, is_q, is_2); 2114 2115 /* Negate all inputs for FMLSL at once. 
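     *
     * Only the n operand needs the sign flip here: the indexed element
     * of m is converted to float32 once below and reused for every
     * lane, since these AdvSIMD forms operate on a single 128-bit
     * segment.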
*/ 2116 if (is_s) { 2117 n_4 ^= 0x8000800080008000ull; 2118 } 2119 2120 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2121 2122 for (i = 0; i < oprsz / 4; i++) { 2123 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2124 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2125 } 2126 clear_tail(d, oprsz, simd_maxsz(desc)); 2127 } 2128 2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2130 void *venv, uint32_t desc) 2131 { 2132 CPUARMState *env = venv; 2133 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2134 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2135 } 2136 2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2138 void *venv, uint32_t desc) 2139 { 2140 CPUARMState *env = venv; 2141 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 2142 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2143 } 2144 2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2146 void *venv, uint32_t desc) 2147 { 2148 intptr_t i, j, oprsz = simd_oprsz(desc); 2149 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2150 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2151 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2152 CPUARMState *env = venv; 2153 float_status *status = &env->vfp.fp_status; 2154 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2155 2156 for (i = 0; i < oprsz; i += 16) { 2157 float16 mm_16 = *(float16 *)(vm + i + idx); 2158 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2159 2160 for (j = 0; j < 16; j += sizeof(float32)) { 2161 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2162 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2163 float32 aa = *(float32 *)(va + H1_4(i + j)); 2164 2165 *(float32 *)(vd + H1_4(i + j)) = 2166 float32_muladd(nn, mm, aa, 0, status); 2167 } 2168 } 2169 } 2170 2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2172 { 2173 intptr_t i, opr_sz = simd_oprsz(desc); 2174 int8_t *d = vd, *n = vn, *m = vm; 2175 2176 for (i = 0; i < opr_sz; ++i) { 2177 int8_t mm = m[i]; 2178 int8_t nn = n[i]; 2179 int8_t res = 0; 2180 if (mm >= 0) { 2181 if (mm < 8) { 2182 res = nn << mm; 2183 } 2184 } else { 2185 res = nn >> (mm > -8 ? -mm : 7); 2186 } 2187 d[i] = res; 2188 } 2189 clear_tail(d, opr_sz, simd_maxsz(desc)); 2190 } 2191 2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2193 { 2194 intptr_t i, opr_sz = simd_oprsz(desc); 2195 int16_t *d = vd, *n = vn, *m = vm; 2196 2197 for (i = 0; i < opr_sz / 2; ++i) { 2198 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2199 int16_t nn = n[i]; 2200 int16_t res = 0; 2201 if (mm >= 0) { 2202 if (mm < 16) { 2203 res = nn << mm; 2204 } 2205 } else { 2206 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
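 *
 * As an illustration of carry-less multiplication in general: the
 * shifted partial products are combined with XOR rather than ADD, so
 * 0b0011 multiplied by 0b0011 gives 0b0011 ^ 0b0110 = 0b0101, not 9.
 * clmul_64() applies the same rule across 64-bit operands, producing
 * a 128-bit result that is split below into adjacent destination lanes.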
2283 */ 2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2285 { 2286 intptr_t i, opr_sz = simd_oprsz(desc); 2287 intptr_t hi = simd_data(desc); 2288 uint64_t *d = vd, *n = vn, *m = vm; 2289 2290 for (i = 0; i < opr_sz / 8; i += 2) { 2291 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2292 d[i] = int128_getlo(r); 2293 d[i + 1] = int128_gethi(r); 2294 } 2295 clear_tail(d, opr_sz, simd_maxsz(desc)); 2296 } 2297 2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2299 { 2300 int hi = simd_data(desc); 2301 uint64_t *d = vd, *n = vn, *m = vm; 2302 uint64_t nn = n[hi], mm = m[hi]; 2303 2304 d[0] = clmul_8x4_packed(nn, mm); 2305 nn >>= 32; 2306 mm >>= 32; 2307 d[1] = clmul_8x4_packed(nn, mm); 2308 2309 clear_tail(d, 16, simd_maxsz(desc)); 2310 } 2311 2312 #ifdef TARGET_AARCH64 2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2314 { 2315 int shift = simd_data(desc) * 8; 2316 intptr_t i, opr_sz = simd_oprsz(desc); 2317 uint64_t *d = vd, *n = vn, *m = vm; 2318 2319 for (i = 0; i < opr_sz / 8; ++i) { 2320 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2321 } 2322 } 2323 2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2325 { 2326 intptr_t sel = H4(simd_data(desc)); 2327 intptr_t i, opr_sz = simd_oprsz(desc); 2328 uint32_t *n = vn, *m = vm; 2329 uint64_t *d = vd; 2330 2331 for (i = 0; i < opr_sz / 8; ++i) { 2332 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2333 } 2334 } 2335 #endif 2336 2337 #define DO_CMP0(NAME, TYPE, OP) \ 2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2339 { \ 2340 intptr_t i, opr_sz = simd_oprsz(desc); \ 2341 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2342 TYPE nn = *(TYPE *)(vn + i); \ 2343 *(TYPE *)(vd + i) = -(nn OP 0); \ 2344 } \ 2345 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2346 } 2347 2348 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2349 DO_CMP0(gvec_clt0_b, int8_t, <) 2350 DO_CMP0(gvec_cle0_b, int8_t, <=) 2351 DO_CMP0(gvec_cgt0_b, int8_t, >) 2352 DO_CMP0(gvec_cge0_b, int8_t, >=) 2353 2354 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2355 DO_CMP0(gvec_clt0_h, int16_t, <) 2356 DO_CMP0(gvec_cle0_h, int16_t, <=) 2357 DO_CMP0(gvec_cgt0_h, int16_t, >) 2358 DO_CMP0(gvec_cge0_h, int16_t, >=) 2359 2360 #undef DO_CMP0 2361 2362 #define DO_ABD(NAME, TYPE) \ 2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2364 { \ 2365 intptr_t i, opr_sz = simd_oprsz(desc); \ 2366 TYPE *d = vd, *n = vn, *m = vm; \ 2367 \ 2368 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2369 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2370 } \ 2371 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2372 } 2373 2374 DO_ABD(gvec_sabd_b, int8_t) 2375 DO_ABD(gvec_sabd_h, int16_t) 2376 DO_ABD(gvec_sabd_s, int32_t) 2377 DO_ABD(gvec_sabd_d, int64_t) 2378 2379 DO_ABD(gvec_uabd_b, uint8_t) 2380 DO_ABD(gvec_uabd_h, uint16_t) 2381 DO_ABD(gvec_uabd_s, uint32_t) 2382 DO_ABD(gvec_uabd_d, uint64_t) 2383 2384 #undef DO_ABD 2385 2386 #define DO_ABA(NAME, TYPE) \ 2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2388 { \ 2389 intptr_t i, opr_sz = simd_oprsz(desc); \ 2390 TYPE *d = vd, *n = vn, *m = vm; \ 2391 \ 2392 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2393 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2394 } \ 2395 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2396 } 2397 2398 DO_ABA(gvec_saba_b, int8_t) 2399 DO_ABA(gvec_saba_h, int16_t) 2400 DO_ABA(gvec_saba_s, int32_t) 2401 DO_ABA(gvec_saba_d, int64_t) 2402 2403 DO_ABA(gvec_uaba_b, uint8_t) 2404 DO_ABA(gvec_uaba_h, uint16_t) 2405 DO_ABA(gvec_uaba_s, uint32_t) 2406 DO_ABA(gvec_uaba_d, uint64_t) 2407 2408 #undef DO_ABA 2409 2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2412 { \ 2413 ARMVectorReg scratch; \ 2414 intptr_t oprsz = simd_oprsz(desc); \ 2415 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2416 TYPE *d = vd, *n = vn, *m = vm; \ 2417 if (unlikely(d == m)) { \ 2418 m = memcpy(&scratch, m, oprsz); \ 2419 } \ 2420 for (intptr_t i = 0; i < half; ++i) { \ 2421 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2422 } \ 2423 for (intptr_t i = 0; i < half; ++i) { \ 2424 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2425 } \ 2426 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2427 } 2428 2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2432 2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2436 2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2440 2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2444 2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2448 2449 #undef DO_3OP_PAIR 2450 2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2453 { \ 2454 ARMVectorReg scratch; \ 2455 intptr_t oprsz = simd_oprsz(desc); \ 2456 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2457 TYPE *d = vd, *n = vn, *m = vm; \ 2458 if (unlikely(d == m)) { \ 2459 m = memcpy(&scratch, m, oprsz); \ 2460 } \ 2461 for (intptr_t i = 0; i < half; ++i) { \ 2462 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2463 } \ 2464 for (intptr_t i = 0; i < half; ++i) { \ 2465 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2466 } \ 2467 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2468 } 2469 2470 #define ADD(A, B) (A + B) 2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2475 #undef ADD 2476 2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2480 2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2484 2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2488 2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2492 2493 #undef DO_3OP_PAIR 2494 2495 #define 
DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2496 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2497 { \ 2498 intptr_t i, oprsz = simd_oprsz(desc); \ 2499 int shift = simd_data(desc); \ 2500 TYPE *d = vd, *n = vn; \ 2501 float_status *fpst = stat; \ 2502 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2503 d[i] = FUNC(n[i], shift, fpst); \ 2504 } \ 2505 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2506 } 2507 2508 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2509 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2510 DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2511 DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t) 2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2514 DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2515 DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2516 2517 #undef DO_VCVT_FIXED 2518 2519 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2520 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2521 { \ 2522 float_status *fpst = stat; \ 2523 intptr_t i, oprsz = simd_oprsz(desc); \ 2524 uint32_t rmode = simd_data(desc); \ 2525 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2526 TYPE *d = vd, *n = vn; \ 2527 set_float_rounding_mode(rmode, fpst); \ 2528 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2529 d[i] = FUNC(n[i], 0, fpst); \ 2530 } \ 2531 set_float_rounding_mode(prev_rmode, fpst); \ 2532 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2533 } 2534 2535 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2536 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2537 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2538 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2539 2540 #undef DO_VCVT_RMODE 2541 2542 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2543 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2544 { \ 2545 float_status *fpst = stat; \ 2546 intptr_t i, oprsz = simd_oprsz(desc); \ 2547 uint32_t rmode = simd_data(desc); \ 2548 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2549 TYPE *d = vd, *n = vn; \ 2550 set_float_rounding_mode(rmode, fpst); \ 2551 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2552 d[i] = FUNC(n[i], fpst); \ 2553 } \ 2554 set_float_rounding_mode(prev_rmode, fpst); \ 2555 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2556 } 2557 2558 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2559 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2560 2561 #undef DO_VRINT_RMODE 2562 2563 #ifdef TARGET_AARCH64 2564 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2565 { 2566 const uint8_t *indices = vm; 2567 CPUARMState *env = venv; 2568 size_t oprsz = simd_oprsz(desc); 2569 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2570 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2571 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2572 union { 2573 uint8_t b[16]; 2574 uint64_t d[2]; 2575 } result; 2576 2577 /* 2578 * We must construct the final result in a temp, lest the output 2579 * overlaps the input table. For TBL, begin with zero; for TBX, 2580 * begin with the original register contents. Note that we always 2581 * copy 16 bytes here to avoid an extra branch; clearing the high 2582 * bits of the register for oprsz == 8 is handled below. 
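     *
     * For example, with table_len == 32 (a two-register table starting
     * at rn), index 0x13 selects byte 3 of register (rn + 1) % 32,
     * while any index >= 32 leaves the result byte as initialised
     * above: zero for TBL, the original Vd byte for TBX.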
2583 */ 2584 if (is_tbx) { 2585 memcpy(&result, vd, 16); 2586 } else { 2587 memset(&result, 0, 16); 2588 } 2589 2590 for (size_t i = 0; i < oprsz; ++i) { 2591 uint32_t index = indices[H1(i)]; 2592 2593 if (index < table_len) { 2594 /* 2595 * Convert index (a byte offset into the virtual table 2596 * which is a series of 128-bit vectors concatenated) 2597 * into the correct register element, bearing in mind 2598 * that the table can wrap around from V31 to V0. 2599 */ 2600 const uint8_t *table = (const uint8_t *) 2601 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2602 result.b[H1(i)] = table[H1(index % 16)]; 2603 } 2604 } 2605 2606 memcpy(vd, &result, 16); 2607 clear_tail(vd, oprsz, simd_maxsz(desc)); 2608 } 2609 #endif 2610 2611 /* 2612 * NxN -> N highpart multiply 2613 * 2614 * TODO: expose this as a generic vector operation. 2615 */ 2616 2617 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2618 { 2619 intptr_t i, opr_sz = simd_oprsz(desc); 2620 int8_t *d = vd, *n = vn, *m = vm; 2621 2622 for (i = 0; i < opr_sz; ++i) { 2623 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2624 } 2625 clear_tail(d, opr_sz, simd_maxsz(desc)); 2626 } 2627 2628 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2629 { 2630 intptr_t i, opr_sz = simd_oprsz(desc); 2631 int16_t *d = vd, *n = vn, *m = vm; 2632 2633 for (i = 0; i < opr_sz / 2; ++i) { 2634 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2635 } 2636 clear_tail(d, opr_sz, simd_maxsz(desc)); 2637 } 2638 2639 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2640 { 2641 intptr_t i, opr_sz = simd_oprsz(desc); 2642 int32_t *d = vd, *n = vn, *m = vm; 2643 2644 for (i = 0; i < opr_sz / 4; ++i) { 2645 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2646 } 2647 clear_tail(d, opr_sz, simd_maxsz(desc)); 2648 } 2649 2650 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2651 { 2652 intptr_t i, opr_sz = simd_oprsz(desc); 2653 uint64_t *d = vd, *n = vn, *m = vm; 2654 uint64_t discard; 2655 2656 for (i = 0; i < opr_sz / 8; ++i) { 2657 muls64(&discard, &d[i], n[i], m[i]); 2658 } 2659 clear_tail(d, opr_sz, simd_maxsz(desc)); 2660 } 2661 2662 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2663 { 2664 intptr_t i, opr_sz = simd_oprsz(desc); 2665 uint8_t *d = vd, *n = vn, *m = vm; 2666 2667 for (i = 0; i < opr_sz; ++i) { 2668 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2669 } 2670 clear_tail(d, opr_sz, simd_maxsz(desc)); 2671 } 2672 2673 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2674 { 2675 intptr_t i, opr_sz = simd_oprsz(desc); 2676 uint16_t *d = vd, *n = vn, *m = vm; 2677 2678 for (i = 0; i < opr_sz / 2; ++i) { 2679 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2680 } 2681 clear_tail(d, opr_sz, simd_maxsz(desc)); 2682 } 2683 2684 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2685 { 2686 intptr_t i, opr_sz = simd_oprsz(desc); 2687 uint32_t *d = vd, *n = vn, *m = vm; 2688 2689 for (i = 0; i < opr_sz / 4; ++i) { 2690 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2691 } 2692 clear_tail(d, opr_sz, simd_maxsz(desc)); 2693 } 2694 2695 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2696 { 2697 intptr_t i, opr_sz = simd_oprsz(desc); 2698 uint64_t *d = vd, *n = vn, *m = vm; 2699 uint64_t discard; 2700 2701 for (i = 0; i < opr_sz / 8; ++i) { 2702 mulu64(&discard, &d[i], n[i], m[i]); 2703 } 2704 clear_tail(d, opr_sz, simd_maxsz(desc)); 2705 } 2706 2707 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2708 { 2709 intptr_t i, opr_sz = 
simd_oprsz(desc) / 8; 2710 int shr = simd_data(desc); 2711 uint64_t *d = vd, *n = vn, *m = vm; 2712 2713 for (i = 0; i < opr_sz; ++i) { 2714 d[i] = ror64(n[i] ^ m[i], shr); 2715 } 2716 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2717 } 2718 2719 /* 2720 * Integer matrix-multiply accumulate 2721 */ 2722 2723 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2724 { 2725 int8_t *n = vn, *m = vm; 2726 2727 for (intptr_t k = 0; k < 8; ++k) { 2728 sum += n[H1(k)] * m[H1(k)]; 2729 } 2730 return sum; 2731 } 2732 2733 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2734 { 2735 uint8_t *n = vn, *m = vm; 2736 2737 for (intptr_t k = 0; k < 8; ++k) { 2738 sum += n[H1(k)] * m[H1(k)]; 2739 } 2740 return sum; 2741 } 2742 2743 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2744 { 2745 uint8_t *n = vn; 2746 int8_t *m = vm; 2747 2748 for (intptr_t k = 0; k < 8; ++k) { 2749 sum += n[H1(k)] * m[H1(k)]; 2750 } 2751 return sum; 2752 } 2753 2754 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2755 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2756 { 2757 intptr_t seg, opr_sz = simd_oprsz(desc); 2758 2759 for (seg = 0; seg < opr_sz; seg += 16) { 2760 uint32_t *d = vd + seg; 2761 uint32_t *a = va + seg; 2762 uint32_t sum0, sum1, sum2, sum3; 2763 2764 /* 2765 * Process the entire segment at once, writing back the 2766 * results only after we've consumed all of the inputs. 2767 * 2768 * Key to indices by column: 2769 * i j i j 2770 */ 2771 sum0 = a[H4(0 + 0)]; 2772 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2773 sum1 = a[H4(0 + 1)]; 2774 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2775 sum2 = a[H4(2 + 0)]; 2776 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2777 sum3 = a[H4(2 + 1)]; 2778 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2779 2780 d[H4(0)] = sum0; 2781 d[H4(1)] = sum1; 2782 d[H4(2)] = sum2; 2783 d[H4(3)] = sum3; 2784 } 2785 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2786 } 2787 2788 #define DO_MMLA_B(NAME, INNER) \ 2789 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2790 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2791 2792 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2793 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2794 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2795 2796 /* 2797 * BFloat16 Dot Product 2798 */ 2799 2800 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2801 { 2802 /* 2803 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2804 * For EBF = 0, we ignore the FPCR bits which determine rounding 2805 * mode and denormal-flushing, and we do unfused multiplies and 2806 * additions with intermediate rounding of all products and sums. 2807 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2808 * and we perform a fused two-way sum-of-products without intermediate 2809 * rounding of the products. 2810 * In either case, we don't set fp exception flags. 2811 * 2812 * EBF is AArch64 only, so even if it's set in the FPCR it has 2813 * no effect on AArch32 instructions. 
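     *
     * Both statuses are passed down to bfdotadd_ebf(): the round-to-odd
     * copy is used only for the first product, after which the second
     * product is folded in with a single fused multiply-add rounded
     * directly to float32 precision, as described there.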
2814 */ 2815 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2816 *statusp = (float_status){ 2817 .tininess_before_rounding = float_tininess_before_rounding, 2818 .float_rounding_mode = float_round_to_odd_inf, 2819 .flush_to_zero = true, 2820 .flush_inputs_to_zero = true, 2821 .default_nan_mode = true, 2822 }; 2823 2824 if (ebf) { 2825 float_status *fpst = &env->vfp.fp_status; 2826 set_flush_to_zero(get_flush_to_zero(fpst), statusp); 2827 set_flush_inputs_to_zero(get_flush_inputs_to_zero(fpst), statusp); 2828 set_float_rounding_mode(get_float_rounding_mode(fpst), statusp); 2829 2830 /* EBF=1 needs to do a step with round-to-odd semantics */ 2831 *oddstatusp = *statusp; 2832 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2833 } 2834 2835 return ebf; 2836 } 2837 2838 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2839 { 2840 float32 t1, t2; 2841 2842 /* 2843 * Extract each BFloat16 from the element pair, and shift 2844 * them such that they become float32. 2845 */ 2846 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2847 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2848 t1 = float32_add(t1, t2, fpst); 2849 t1 = float32_add(sum, t1, fpst); 2850 2851 return t1; 2852 } 2853 2854 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2855 float_status *fpst, float_status *fpst_odd) 2856 { 2857 /* 2858 * Compare f16_dotadd() in sme_helper.c, but here we have 2859 * bfloat16 inputs. In particular that means that we do not 2860 * want the FPCR.FZ16 flush semantics, so we use the normal 2861 * float_status for the input handling here. 2862 */ 2863 float64 e1r = float32_to_float64(e1 << 16, fpst); 2864 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2865 float64 e2r = float32_to_float64(e2 << 16, fpst); 2866 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2867 float64 t64; 2868 float32 t32; 2869 2870 /* 2871 * The ARM pseudocode function FPDot performs both multiplies 2872 * and the add with a single rounding operation. Emulate this 2873 * by performing the first multiply in round-to-odd, then doing 2874 * the second multiply as fused multiply-add, and rounding to 2875 * float32 all in one step. 2876 */ 2877 t64 = float64_mul(e1r, e2r, fpst_odd); 2878 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2879 2880 /* This conversion is exact, because we've already rounded. */ 2881 t32 = float64_to_float32(t64, fpst); 2882 2883 /* The final accumulation step is not fused. 
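     * Of the roundings performed here, only this float32_add() uses the
     * ordinary accumulation semantics of fpst; the first product used
     * round-to-odd and the second was absorbed into the fused
     * float64r32_muladd() above.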
*/ 2884 return float32_add(sum, t32, fpst); 2885 } 2886 2887 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2888 CPUARMState *env, uint32_t desc) 2889 { 2890 intptr_t i, opr_sz = simd_oprsz(desc); 2891 float32 *d = vd, *a = va; 2892 uint32_t *n = vn, *m = vm; 2893 float_status fpst, fpst_odd; 2894 2895 if (is_ebf(env, &fpst, &fpst_odd)) { 2896 for (i = 0; i < opr_sz / 4; ++i) { 2897 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2898 } 2899 } else { 2900 for (i = 0; i < opr_sz / 4; ++i) { 2901 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2902 } 2903 } 2904 clear_tail(d, opr_sz, simd_maxsz(desc)); 2905 } 2906 2907 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2908 void *va, CPUARMState *env, uint32_t desc) 2909 { 2910 intptr_t i, j, opr_sz = simd_oprsz(desc); 2911 intptr_t index = simd_data(desc); 2912 intptr_t elements = opr_sz / 4; 2913 intptr_t eltspersegment = MIN(16 / 4, elements); 2914 float32 *d = vd, *a = va; 2915 uint32_t *n = vn, *m = vm; 2916 float_status fpst, fpst_odd; 2917 2918 if (is_ebf(env, &fpst, &fpst_odd)) { 2919 for (i = 0; i < elements; i += eltspersegment) { 2920 uint32_t m_idx = m[i + H4(index)]; 2921 2922 for (j = i; j < i + eltspersegment; j++) { 2923 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2924 } 2925 } 2926 } else { 2927 for (i = 0; i < elements; i += eltspersegment) { 2928 uint32_t m_idx = m[i + H4(index)]; 2929 2930 for (j = i; j < i + eltspersegment; j++) { 2931 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2932 } 2933 } 2934 } 2935 clear_tail(d, opr_sz, simd_maxsz(desc)); 2936 } 2937 2938 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2939 CPUARMState *env, uint32_t desc) 2940 { 2941 intptr_t s, opr_sz = simd_oprsz(desc); 2942 float32 *d = vd, *a = va; 2943 uint32_t *n = vn, *m = vm; 2944 float_status fpst, fpst_odd; 2945 2946 if (is_ebf(env, &fpst, &fpst_odd)) { 2947 for (s = 0; s < opr_sz / 4; s += 4) { 2948 float32 sum00, sum01, sum10, sum11; 2949 2950 /* 2951 * Process the entire segment at once, writing back the 2952 * results only after we've consumed all of the inputs. 2953 * 2954 * Key to indices by column: 2955 * i j i k j k 2956 */ 2957 sum00 = a[s + H4(0 + 0)]; 2958 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2959 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2960 2961 sum01 = a[s + H4(0 + 1)]; 2962 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2963 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2964 2965 sum10 = a[s + H4(2 + 0)]; 2966 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2967 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2968 2969 sum11 = a[s + H4(2 + 1)]; 2970 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2971 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2972 2973 d[s + H4(0 + 0)] = sum00; 2974 d[s + H4(0 + 1)] = sum01; 2975 d[s + H4(2 + 0)] = sum10; 2976 d[s + H4(2 + 1)] = sum11; 2977 } 2978 } else { 2979 for (s = 0; s < opr_sz / 4; s += 4) { 2980 float32 sum00, sum01, sum10, sum11; 2981 2982 /* 2983 * Process the entire segment at once, writing back the 2984 * results only after we've consumed all of the inputs. 
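             *
             * In the key below, the a[]/d[] offsets are 2*i + j, the
             * n[] offsets are 2*i + k and the m[] offsets are 2*j + k,
             * for the 2x2 output tile (i, j) and the dot-product step
             * k in {0, 1}, each element being a packed bfloat16 pair.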
2985 * 2986 * Key to indices by column: 2987 * i j i k j k 2988 */ 2989 sum00 = a[s + H4(0 + 0)]; 2990 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 2991 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 2992 2993 sum01 = a[s + H4(0 + 1)]; 2994 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 2995 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 2996 2997 sum10 = a[s + H4(2 + 0)]; 2998 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 2999 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3000 3001 sum11 = a[s + H4(2 + 1)]; 3002 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3003 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3004 3005 d[s + H4(0 + 0)] = sum00; 3006 d[s + H4(0 + 1)] = sum01; 3007 d[s + H4(2 + 0)] = sum10; 3008 d[s + H4(2 + 1)] = sum11; 3009 } 3010 } 3011 clear_tail(d, opr_sz, simd_maxsz(desc)); 3012 } 3013 3014 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3015 void *stat, uint32_t desc) 3016 { 3017 intptr_t i, opr_sz = simd_oprsz(desc); 3018 intptr_t sel = simd_data(desc); 3019 float32 *d = vd, *a = va; 3020 bfloat16 *n = vn, *m = vm; 3021 3022 for (i = 0; i < opr_sz / 4; ++i) { 3023 float32 nn = n[H2(i * 2 + sel)] << 16; 3024 float32 mm = m[H2(i * 2 + sel)] << 16; 3025 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3026 } 3027 clear_tail(d, opr_sz, simd_maxsz(desc)); 3028 } 3029 3030 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3031 void *va, void *stat, uint32_t desc) 3032 { 3033 intptr_t i, j, opr_sz = simd_oprsz(desc); 3034 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3035 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3036 intptr_t elements = opr_sz / 4; 3037 intptr_t eltspersegment = MIN(16 / 4, elements); 3038 float32 *d = vd, *a = va; 3039 bfloat16 *n = vn, *m = vm; 3040 3041 for (i = 0; i < elements; i += eltspersegment) { 3042 float32 m_idx = m[H2(2 * i + index)] << 16; 3043 3044 for (j = i; j < i + eltspersegment; j++) { 3045 float32 n_j = n[H2(2 * j + sel)] << 16; 3046 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3047 } 3048 } 3049 clear_tail(d, opr_sz, simd_maxsz(desc)); 3050 } 3051 3052 #define DO_CLAMP(NAME, TYPE) \ 3053 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3054 { \ 3055 intptr_t i, opr_sz = simd_oprsz(desc); \ 3056 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3057 TYPE aa = *(TYPE *)(a + i); \ 3058 TYPE nn = *(TYPE *)(n + i); \ 3059 TYPE mm = *(TYPE *)(m + i); \ 3060 TYPE dd = MIN(MAX(aa, nn), mm); \ 3061 *(TYPE *)(d + i) = dd; \ 3062 } \ 3063 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3064 } 3065 3066 DO_CLAMP(gvec_sclamp_b, int8_t) 3067 DO_CLAMP(gvec_sclamp_h, int16_t) 3068 DO_CLAMP(gvec_sclamp_s, int32_t) 3069 DO_CLAMP(gvec_sclamp_d, int64_t) 3070 3071 DO_CLAMP(gvec_uclamp_b, uint8_t) 3072 DO_CLAMP(gvec_uclamp_h, uint16_t) 3073 DO_CLAMP(gvec_uclamp_s, uint32_t) 3074 DO_CLAMP(gvec_uclamp_d, uint64_t) 3075
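
/*
 * A minimal usage sketch of the descriptor convention used throughout
 * this file (illustrative only; in practice these helpers are invoked
 * from TCG-generated code, which packs desc via the gvec expanders):
 *
 *     uint32_t desc = simd_desc(16, 16, shift);
 *     helper_gvec_ssra_d(vd, vn, desc);
 *
 * after which simd_oprsz(desc) == 16, simd_maxsz(desc) == 16 and
 * simd_data(desc) == shift, so the DO_SRA loop processes two uint64_t
 * lanes and clear_tail() has nothing to zero.
 */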