/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)  { X, X, X, X, X, X, X, X }
#define DUP4(X)  { X, X, X, X }
#define DUP2(X)  { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X) X
#define DUP8(X)  X
#define DUP4(X)  X
#define DUP2(X)  X
#endif /* CONFIG_VECTOR16 */

static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}
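/*
 * Editor's sketch, not part of the original helpers: the DUP macros exist so
 * that a scalar operand can be splatted across a whole vector once, outside
 * the loop, while degenerating to the plain scalar in the fallback build.
 * The helper name below is hypothetical and only illustrates the intended
 * usage pattern under the same assumptions as the real helpers.
 */
static inline void gvec_example_inc8(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 one = (vec8)DUP16(1);   /* sixteen lanes of 1, or a single uint8_t */
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + one;
    }
    clear_high(d, oprsz, desc);
}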
void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}
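/*
 * Editor's note on the two helpers above: multiplying by a constant with a 1
 * in each element position replicates the low element across the 32-bit
 * value, so gvec_dup8 with c = 0xab stores 0xab * 0x01010101 = 0xabababab,
 * and gvec_dup16 with c = 0x1234 stores 0x1234 * 0x00010001 = 0x12341234.
 * Both therefore reduce to a single gvec_dup32 store loop.
 */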
void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}
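/*
 * Editor's illustration for the shift helpers above: the immediate shift
 * count travels in the data field of the descriptor.  Assuming the usual
 * simd_desc() encoder from tcg-gvec-desc.h, a caller operating on 16 bytes
 * of a 32-byte register with a shift count of 3 would pass
 * simd_desc(16, 32, 3); simd_oprsz() then yields 16, simd_maxsz() yields 32,
 * simd_data() yields 3, and clear_high() zeroes the remaining bytes 16..31.
 */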
/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand.  */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X)  X
#else
# define DO_CMP0(X)  -(X)
#endif

#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

#define DO_CMP2(SZ)                         \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)       \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)       \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)       \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)      \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)       \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)

DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2
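/*
 * Editor's illustration: each DO_CMP2(SZ) invocation above expands to six
 * helpers.  For example, DO_CMP2(8) generates a gvec_eq8 roughly equivalent
 * to
 *
 *     void HELPER(gvec_eq8)(void *d, void *a, void *b, uint32_t desc)
 *     {
 *         intptr_t oprsz = simd_oprsz(desc);
 *         intptr_t i;
 *         for (i = 0; i < oprsz; i += sizeof(vec8)) {
 *             *(vec8 *)(d + i) =
 *                 DO_CMP0(*(vec8 *)(a + i) == *(vec8 *)(b + i));
 *         }
 *         clear_high(d, oprsz, desc);
 *     }
 *
 * so each lane of the destination ends up all-ones when the comparison is
 * true and zero when it is false, in both the vector and the scalar builds.
 */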