1 /* 2 * Generic vectorized operation runtime 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "qemu/host-utils.h" 22 #include "cpu.h" 23 #include "exec/helper-proto.h" 24 #include "tcg-gvec-desc.h" 25 26 27 /* Virtually all hosts support 16-byte vectors. Those that don't can emulate 28 * them via GCC's generic vector extension. This turns out to be simpler and 29 * more reliable than getting the compiler to autovectorize. 30 * 31 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data 32 * are multiples of 16. 33 * 34 * When the compiler does not support all of the operations we require, the 35 * loops are written so that we can always fall back on the base types. 
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

/* Build a 16-byte vector initializer that replicates X into every lane. */
#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X) { X, X, X, X, X, X, X, X }
#define DUP4(X) { X, X, X, X }
#define DUP2(X) { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

/* Scalar fallback: "splatting" a value into a one-element vector is
   the identity, so the same loops below step one element at a time. */
#define DUP16(X) X
#define DUP8(X) X
#define DUP4(X) X
#define DUP2(X) X
#endif /* CONFIG_VECTOR16 */

/*
 * Zero the tail of the destination: the bytes between the operation
 * size and the maximum vector size encoded in DESC.  Called by every
 * helper after it has written the first OPRSZ bytes.
 */
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}

/* d = a + b, element-wise, for 8/16/32/64-bit lanes. */
void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a + dup(b): add the same scalar B to every element.  Only the
   low N bits of B are used for the N-bit lane variants (truncated by
   the DUPn initializer's implicit conversion). */
void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

/* d = a - b, element-wise. */
void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a - dup(b). */
void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

/* d = a * b, element-wise (low half of the product per lane). */
void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a * dup(b). */
void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

/* d = a * dup(b), 32-bit and 64-bit lanes. */
void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

/* d = -a, element-wise (two's complement negation). */
void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a: plain copy of the active bytes, tail cleared. */
void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

/* Broadcast constant C into every element.  When C is zero, setting
   oprsz to 0 lets clear_high zero the entire register in one loop. */
void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

/* Replicate the low 16/8 bits of C across 32 bits, then defer to dup32. */
void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

/* Bitwise operations are lane-size agnostic, so they all run at
   vec64 granularity regardless of the guest element size. */
void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a & ~b. */
void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = a | ~b. */
void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

/* d = ~(a & b). */
void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

/* d = ~(a | b). */
void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

/* d = ~(a ^ b), i.e. bitwise equivalence. */
void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
    }
    clear_high(d, oprsz, desc);
}

/* Bitwise ops against a replicated 64-bit scalar. */
void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
    }
    clear_high(d, oprsz, desc);
}

/* Logical shift left by the immediate encoded in simd_data(desc).
   The translator is responsible for keeping the count in range. */
void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

/* Logical shift right by immediate (unsigned lane types, zero fill). */
void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

/* Arithmetic shift right by immediate: the signed svecN types make
   ">>" replicate the sign bit. */
void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand. */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X) X
#else
# define DO_CMP0(X) -(X)
#endif

/* Emit one comparison helper: each destination lane becomes all-ones
   when the comparison holds, all-zeros otherwise. */
#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

/* Instantiate eq/ne/lt/le (signed) and ltu/leu (unsigned) per lane size. */
#define DO_CMP2(SZ)                                                        \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)                                      \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)                                      \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)                                      \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)                                     \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)                                      \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)

DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2

/* Signed saturating add: 8/16-bit lanes widen to int and clamp;
   32/64-bit lanes detect overflow from the sign bits. */
void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai + bi;
        /* Addition overflows iff the operands share a sign that the
           result does not. */
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow. */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai + bi;
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow. */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

/* Signed saturating subtract.  NOTE(review): this 8-bit variant steps
   and stores via uint8_t where its siblings use int8_t; the stored
   bytes are identical either way. */
void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(int8_t *)(a + i) - *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) - *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai - bi;
        /* Subtraction overflows iff the operands have opposite signs
           and the result's sign differs from the minuend's. */
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow. */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai - bi;
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow. */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

/* Unsigned saturating add: clamp to the type maximum on carry-out. */
void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai + bi;
        /* Wrap-around (di < ai) means the true sum exceeded UINT32_MAX. */
        if (di < ai) {
            di = UINT32_MAX;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai + bi;
        if (di < ai) {
            di = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

/* Unsigned saturating subtract: clamp to zero on underflow. */
void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

/* Element-wise signed minimum. */
void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa < bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa < bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa < bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa < bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

/* Element-wise signed maximum. */
void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int8_t aa = *(int8_t *)(a + i);
        int8_t bb = *(int8_t *)(b + i);
        int8_t dd = aa > bb ? aa : bb;
        *(int8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int16_t aa = *(int16_t *)(a + i);
        int16_t bb = *(int16_t *)(b + i);
        int16_t dd = aa > bb ? aa : bb;
        *(int16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t aa = *(int32_t *)(a + i);
        int32_t bb = *(int32_t *)(b + i);
        int32_t dd = aa > bb ? aa : bb;
        *(int32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t aa = *(int64_t *)(a + i);
        int64_t bb = *(int64_t *)(b + i);
        int64_t dd = aa > bb ? aa : bb;
        *(int64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

/* Element-wise unsigned minimum. */
void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa < bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa < bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa < bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa < bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

/* Element-wise unsigned maximum. */
void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        uint8_t aa = *(uint8_t *)(a + i);
        uint8_t bb = *(uint8_t *)(b + i);
        uint8_t dd = aa > bb ? aa : bb;
        *(uint8_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        uint16_t aa = *(uint16_t *)(a + i);
        uint16_t bb = *(uint16_t *)(b + i);
        uint16_t dd = aa > bb ? aa : bb;
        *(uint16_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t aa = *(uint32_t *)(a + i);
        uint32_t bb = *(uint32_t *)(b + i);
        uint32_t dd = aa > bb ? aa : bb;
        *(uint32_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t aa = *(uint64_t *)(a + i);
        uint64_t bb = *(uint64_t *)(b + i);
        uint64_t dd = aa > bb ? aa : bb;
        *(uint64_t *)(d + i) = dd;
    }
    clear_high(d, oprsz, desc);
}