/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */
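
/* Illustrative sketch, not part of the build: with CONFIG_VECTOR16,
 *     vec16 v = (vec16)DUP8(0x1234);
 * builds a 16-byte vector initializer holding eight 16-bit lanes of
 * 0x1234, and "v + v" compiles to a single vector add where the host
 * supports one.  Without CONFIG_VECTOR16, vec16 is plain uint16_t and
 * DUP8(X) is just X, so the same source compiles to scalar code.  The
 * loops below step by sizeof(vecN) either way, which is how one
 * implementation serves both configurations.
 */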
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
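
/* Example, illustrative only: a helper invoked with
 * desc = simd_desc(32, 64, 0) (see tcg-gvec-desc.h) operates on the
 * first 32 bytes and then relies on clear_high() to zero bytes 32..63,
 * so the unused tail of an oversized guest register never holds stale
 * data.
 */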
void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb;
    }
    clear_high(d, oprsz, desc);
}
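
/* Note on the vector-by-scalar ("s") helpers above and below: the
 * scalar arrives as a uint64_t, and the (vecN)DUPn(b) initializer
 * splats it across the vector, with the usual implicit conversion
 * truncating it to the element width in each lane.  Illustrative:
 *     vec8 v = (vec8)DUP16(0x0101);    - sixteen lanes, each 0x01
 */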
void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 vecb = (vec8)DUP16(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec16 vecb = (vec16)DUP8(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec32 vecb = (vec32)DUP4(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}
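
/* The dup16/dup8 helpers replicate by multiplication: 0x00010001 * c
 * copies a 16-bit value into both halves of a 32-bit word, and
 * 0x01010101 * c copies a byte into all four.  Worked example:
 * c = 0xab gives 0x01010101 * 0xab = 0xabababab, which gvec_dup32
 * then stores across the whole operation size.
 */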
void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec64 vecb = (vec64)DUP2(b);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}
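
/* The shift helpers take no runtime shift operand: the count is an
 * immediate packed into the descriptor's data field and recovered with
 * simd_data(desc).  Illustrative encoding for "shift 16 bytes by 3":
 *     uint32_t desc = simd_desc(16, 16, 3);
 * The sar helpers go through the signed svecN types so that ">>" is an
 * arithmetic (sign-propagating) shift on each lane.
 */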
/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand.  */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X)  X
#else
# define DO_CMP0(X)  -(X)
#endif

#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

#define DO_CMP2(SZ) \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)

DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2
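
/* Sketch of what the macros expand to, e.g. for DO_CMP1(gvec_eq8, vec8, ==)
 * with CONFIG_VECTOR16 the loop body is
 *     *(vec8 *)(d + i) = *(vec8 *)(a + i) == *(vec8 *)(b + i);
 * where the generic-vector "==" already yields all-ones (-1) in each
 * true lane.  In the scalar fallback the comparison yields 0 or 1, so
 * DO_CMP0 negates it to produce the same 0 / -1 lane values.
 */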
void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai + bi;
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai + bi;
        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(int8_t *)(a + i) - *(int8_t *)(b + i);
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) - *(int16_t *)(b + i);
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int32_t ai = *(int32_t *)(a + i);
        int32_t bi = *(int32_t *)(b + i);
        int32_t di = ai - bi;
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT32_MAX : INT32_MIN);
        }
        *(int32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t bi = *(int64_t *)(b + i);
        int64_t di = ai - bi;
        if (((di ^ ai) & (ai ^ bi)) < 0) {
            /* Signed overflow.  */
            di = (di < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}
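
/* The 32- and 64-bit saturating helpers above detect overflow with the
 * usual xor trick: for addition, ((di ^ ai) &~ (ai ^ bi)) < 0 is true
 * exactly when the operands have the same sign but the sum's sign
 * differs; for subtraction, ((di ^ ai) & (ai ^ bi)) < 0 additionally
 * requires the operands' signs to differ.  Worked example:
 * INT32_MAX + 1 gives di = INT32_MIN; di ^ ai has the sign bit set and
 * ai ^ bi does not, so di (negative) saturates to INT32_MAX.
 */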
void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i);
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i);
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai + bi;
        if (di < ai) {
            di = UINT32_MAX;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai + bi;
        if (di < ai) {
            di = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i);
        if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        uint32_t ai = *(uint32_t *)(a + i);
        uint32_t bi = *(uint32_t *)(b + i);
        uint32_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint32_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        uint64_t bi = *(uint64_t *)(b + i);
        uint64_t di = ai - bi;
        if (ai < bi) {
            di = 0;
        }
        *(uint64_t *)(d + i) = di;
    }
    clear_high(d, oprsz, desc);
}
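
/* For the unsigned 32- and 64-bit forms no wider intermediate is
 * needed: an addition overflowed iff the truncated sum is smaller than
 * an operand (di < ai), and a subtraction underflowed iff ai < bi.
 * Worked example: 0xffffffff + 2 wraps to 1, and 1 < 0xffffffff, so
 * usadd32 stores UINT32_MAX instead.
 */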