1 /* 2 * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 /* 19 * This example tests the HVX scatter/gather instructions 20 * 21 * See section 5.13 of the V68 HVX Programmer's Reference 22 * 23 * There are 3 main classes operations 24 * _16 16-bit elements and 16-bit offsets 25 * _32 32-bit elements and 32-bit offsets 26 * _16_32 16-bit elements and 32-bit offsets 27 * 28 * There are also masked and accumulate versions 29 */ 30 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include <inttypes.h> 35 36 typedef long HVX_Vector __attribute__((__vector_size__(128))) 37 __attribute__((aligned(128))); 38 typedef long HVX_VectorPair __attribute__((__vector_size__(256))) 39 __attribute__((aligned(128))); 40 typedef long HVX_VectorPred __attribute__((__vector_size__(128))) 41 __attribute__((aligned(128))); 42 43 #define VSCATTER_16(BASE, RGN, OFF, VALS) \ 44 __builtin_HEXAGON_V6_vscattermh_128B((int)BASE, RGN, OFF, VALS) 45 #define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \ 46 __builtin_HEXAGON_V6_vscattermhq_128B(MASK, (int)BASE, RGN, OFF, VALS) 47 #define VSCATTER_32(BASE, RGN, OFF, VALS) \ 48 __builtin_HEXAGON_V6_vscattermw_128B((int)BASE, RGN, OFF, VALS) 49 #define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \ 50 __builtin_HEXAGON_V6_vscattermwq_128B(MASK, (int)BASE, RGN, OFF, VALS) 51 #define VSCATTER_16_32(BASE, RGN, OFF, VALS) \ 52 __builtin_HEXAGON_V6_vscattermhw_128B((int)BASE, RGN, OFF, VALS) 53 #define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \ 54 __builtin_HEXAGON_V6_vscattermhwq_128B(MASK, (int)BASE, RGN, OFF, VALS) 55 #define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \ 56 __builtin_HEXAGON_V6_vscattermh_add_128B((int)BASE, RGN, OFF, VALS) 57 #define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \ 58 __builtin_HEXAGON_V6_vscattermw_add_128B((int)BASE, RGN, OFF, VALS) 59 #define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \ 60 __builtin_HEXAGON_V6_vscattermhw_add_128B((int)BASE, RGN, OFF, VALS) 61 62 #define VGATHER_16(DSTADDR, BASE, RGN, OFF) \ 63 __builtin_HEXAGON_V6_vgathermh_128B(DSTADDR, (int)BASE, RGN, OFF) 64 #define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \ 65 __builtin_HEXAGON_V6_vgathermhq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF) 66 #define VGATHER_32(DSTADDR, BASE, RGN, OFF) \ 67 __builtin_HEXAGON_V6_vgathermw_128B(DSTADDR, (int)BASE, RGN, OFF) 68 #define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \ 69 __builtin_HEXAGON_V6_vgathermwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF) 70 #define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \ 71 __builtin_HEXAGON_V6_vgathermhw_128B(DSTADDR, (int)BASE, RGN, OFF) 72 #define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \ 73 __builtin_HEXAGON_V6_vgathermhwq_128B(DSTADDR, MASK, (int)BASE, RGN, OFF) 74 75 #define VSHUFF_H(V) \ 76 __builtin_HEXAGON_V6_vshuffh_128B(V) 77 #define VSPLAT_H(X) \ 78 __builtin_HEXAGON_V6_lvsplath_128B(X) 79 #define VAND_VAL(PRED, VAL) \ 80 __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL) 81 #define VDEAL_H(V) \ 82 __builtin_HEXAGON_V6_vdealh_128B(V) 83 84 int err; 85 86 /* define the number of rows/cols in a square matrix */ 87 #define MATRIX_SIZE 64 88 89 /* define the size of the scatter buffer */ 90 #define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE) 91 92 /* fake vtcm - put buffers together and force alignment */ 93 static struct { 94 unsigned short vscatter16[SCATTER_BUFFER_SIZE]; 95 unsigned short vgather16[MATRIX_SIZE]; 96 unsigned int vscatter32[SCATTER_BUFFER_SIZE]; 97 unsigned int vgather32[MATRIX_SIZE]; 98 unsigned short vscatter16_32[SCATTER_BUFFER_SIZE]; 99 unsigned short vgather16_32[MATRIX_SIZE]; 100 } vtcm __attribute__((aligned(0x10000))); 101 102 /* declare the arrays of reference values */ 103 unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE]; 104 unsigned short vgather16_ref[MATRIX_SIZE]; 105 unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE]; 106 unsigned int vgather32_ref[MATRIX_SIZE]; 107 unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE]; 108 unsigned short vgather16_32_ref[MATRIX_SIZE]; 109 110 /* declare the arrays of offsets */ 111 unsigned short half_offsets[MATRIX_SIZE]; 112 unsigned int word_offsets[MATRIX_SIZE]; 113 114 /* declare the arrays of values */ 115 unsigned short half_values[MATRIX_SIZE]; 116 unsigned short half_values_acc[MATRIX_SIZE]; 117 unsigned short half_values_masked[MATRIX_SIZE]; 118 unsigned int word_values[MATRIX_SIZE]; 119 unsigned int word_values_acc[MATRIX_SIZE]; 120 unsigned int word_values_masked[MATRIX_SIZE]; 121 122 /* declare the arrays of predicates */ 123 unsigned short half_predicates[MATRIX_SIZE]; 124 unsigned int word_predicates[MATRIX_SIZE]; 125 126 /* make this big enough for all the intrinsics */ 127 const size_t region_len = sizeof(vtcm); 128 129 /* optionally add sync instructions */ 130 #define SYNC_VECTOR 1 131 132 static void sync_scatter(void *addr) 133 { 134 #if SYNC_VECTOR 135 /* 136 * Do the scatter release followed by a dummy load to complete the 137 * synchronization. Normally the dummy load would be deferred as 138 * long as possible to minimize stalls. 139 */ 140 asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr)); 141 /* use volatile to force the load */ 142 volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy; 143 #endif 144 } 145 146 static void sync_gather(void *addr) 147 { 148 #if SYNC_VECTOR 149 /* use volatile to force the load */ 150 volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy; 151 #endif 152 } 153 154 /* optionally print the results */ 155 #define PRINT_DATA 0 156 157 #define FILL_CHAR '.' 158 159 /* fill vtcm scratch with ee */ 160 void prefill_vtcm_scratch(void) 161 { 162 memset(&vtcm, FILL_CHAR, sizeof(vtcm)); 163 } 164 165 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */ 166 void create_offsets_values_preds_16(void) 167 { 168 unsigned short half_element = 0; 169 unsigned short half_element_masked = 0; 170 char letter = 'A'; 171 char letter_masked = '@'; 172 173 for (int i = 0; i < MATRIX_SIZE; i++) { 174 half_offsets[i] = i * (2 * MATRIX_SIZE + 2); 175 176 half_element = 0; 177 half_element_masked = 0; 178 for (int j = 0; j < 2; j++) { 179 half_element |= letter << j * 8; 180 half_element_masked |= letter_masked << j * 8; 181 } 182 183 half_values[i] = half_element; 184 half_values_acc[i] = ((i % 10) << 8) + (i % 10); 185 half_values_masked[i] = half_element_masked; 186 187 letter++; 188 /* reset to 'A' */ 189 if (letter == 'M') { 190 letter = 'A'; 191 } 192 193 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0; 194 } 195 } 196 197 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */ 198 void create_offsets_values_preds_32(void) 199 { 200 unsigned int word_element = 0; 201 unsigned int word_element_masked = 0; 202 char letter = 'A'; 203 char letter_masked = '&'; 204 205 for (int i = 0; i < MATRIX_SIZE; i++) { 206 word_offsets[i] = i * (4 * MATRIX_SIZE + 4); 207 208 word_element = 0; 209 word_element_masked = 0; 210 for (int j = 0; j < 4; j++) { 211 word_element |= letter << j * 8; 212 word_element_masked |= letter_masked << j * 8; 213 } 214 215 word_values[i] = word_element; 216 word_values_acc[i] = ((i % 10) << 8) + (i % 10); 217 word_values_masked[i] = word_element_masked; 218 219 letter++; 220 /* reset to 'A' */ 221 if (letter == 'M') { 222 letter = 'A'; 223 } 224 225 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0; 226 } 227 } 228 229 /* 230 * create byte offsets to be a diagonal of the matrix with 16 bit elements 231 * and 32 bit offsets 232 */ 233 void create_offsets_values_preds_16_32(void) 234 { 235 unsigned short half_element = 0; 236 unsigned short half_element_masked = 0; 237 char letter = 'D'; 238 char letter_masked = '$'; 239 240 for (int i = 0; i < MATRIX_SIZE; i++) { 241 word_offsets[i] = i * (2 * MATRIX_SIZE + 2); 242 243 half_element = 0; 244 half_element_masked = 0; 245 for (int j = 0; j < 2; j++) { 246 half_element |= letter << j * 8; 247 half_element_masked |= letter_masked << j * 8; 248 } 249 250 half_values[i] = half_element; 251 half_values_acc[i] = ((i % 10) << 8) + (i % 10); 252 half_values_masked[i] = half_element_masked; 253 254 letter++; 255 /* reset to 'A' */ 256 if (letter == 'P') { 257 letter = 'D'; 258 } 259 260 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0; 261 } 262 } 263 264 /* scatter the 16 bit elements using intrinsics */ 265 void vector_scatter_16(void) 266 { 267 /* copy the offsets and values to vectors */ 268 HVX_Vector offsets = *(HVX_Vector *)half_offsets; 269 HVX_Vector values = *(HVX_Vector *)half_values; 270 271 VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values); 272 273 sync_scatter(vtcm.vscatter16); 274 } 275 276 /* scatter-accumulate the 16 bit elements using intrinsics */ 277 void vector_scatter_16_acc(void) 278 { 279 /* copy the offsets and values to vectors */ 280 HVX_Vector offsets = *(HVX_Vector *)half_offsets; 281 HVX_Vector values = *(HVX_Vector *)half_values_acc; 282 283 VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values); 284 285 sync_scatter(vtcm.vscatter16); 286 } 287 288 /* scatter the 16 bit elements using intrinsics */ 289 void vector_scatter_16_masked(void) 290 { 291 /* copy the offsets and values to vectors */ 292 HVX_Vector offsets = *(HVX_Vector *)half_offsets; 293 HVX_Vector values = *(HVX_Vector *)half_values_masked; 294 HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; 295 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0); 296 297 VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values); 298 299 sync_scatter(vtcm.vscatter16); 300 } 301 302 /* scatter the 32 bit elements using intrinsics */ 303 void vector_scatter_32(void) 304 { 305 /* copy the offsets and values to vectors */ 306 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; 307 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 308 HVX_Vector valueslo = *(HVX_Vector *)word_values; 309 HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2]; 310 311 VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo); 312 VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi); 313 314 sync_scatter(vtcm.vscatter32); 315 } 316 317 /* scatter-acc the 32 bit elements using intrinsics */ 318 void vector_scatter_32_acc(void) 319 { 320 /* copy the offsets and values to vectors */ 321 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; 322 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 323 HVX_Vector valueslo = *(HVX_Vector *)word_values_acc; 324 HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2]; 325 326 VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo); 327 VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi); 328 329 sync_scatter(vtcm.vscatter32); 330 } 331 332 /* scatter the 32 bit elements using intrinsics */ 333 void vector_scatter_32_masked(void) 334 { 335 /* copy the offsets and values to vectors */ 336 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; 337 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 338 HVX_Vector valueslo = *(HVX_Vector *)word_values_masked; 339 HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2]; 340 HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates; 341 HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; 342 HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0); 343 HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0); 344 345 VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo, 346 valueslo); 347 VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi, 348 valueshi); 349 350 sync_scatter(vtcm.vscatter16); 351 } 352 353 /* scatter the 16 bit elements with 32 bit offsets using intrinsics */ 354 void vector_scatter_16_32(void) 355 { 356 HVX_VectorPair offsets; 357 HVX_Vector values; 358 359 /* get the word offsets in a vector pair */ 360 offsets = *(HVX_VectorPair *)word_offsets; 361 362 /* these values need to be shuffled for the scatter */ 363 values = *(HVX_Vector *)half_values; 364 values = VSHUFF_H(values); 365 366 VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values); 367 368 sync_scatter(vtcm.vscatter16_32); 369 } 370 371 /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */ 372 void vector_scatter_16_32_acc(void) 373 { 374 HVX_VectorPair offsets; 375 HVX_Vector values; 376 377 /* get the word offsets in a vector pair */ 378 offsets = *(HVX_VectorPair *)word_offsets; 379 380 /* these values need to be shuffled for the scatter */ 381 values = *(HVX_Vector *)half_values_acc; 382 values = VSHUFF_H(values); 383 384 VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values); 385 386 sync_scatter(vtcm.vscatter16_32); 387 } 388 389 /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */ 390 void vector_scatter_16_32_masked(void) 391 { 392 HVX_VectorPair offsets; 393 HVX_Vector values; 394 HVX_Vector pred_reg; 395 396 /* get the word offsets in a vector pair */ 397 offsets = *(HVX_VectorPair *)word_offsets; 398 399 /* these values need to be shuffled for the scatter */ 400 values = *(HVX_Vector *)half_values_masked; 401 values = VSHUFF_H(values); 402 403 pred_reg = *(HVX_Vector *)half_predicates; 404 pred_reg = VSHUFF_H(pred_reg); 405 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0); 406 407 VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets, 408 values); 409 410 sync_scatter(vtcm.vscatter16_32); 411 } 412 413 /* gather the elements from the scatter16 buffer */ 414 void vector_gather_16(void) 415 { 416 HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16; 417 HVX_Vector offsets = *(HVX_Vector *)half_offsets; 418 419 VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets); 420 421 sync_gather(vgather); 422 } 423 424 static unsigned short gather_16_masked_init(void) 425 { 426 char letter = '?'; 427 return letter | (letter << 8); 428 } 429 430 void vector_gather_16_masked(void) 431 { 432 HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16; 433 HVX_Vector offsets = *(HVX_Vector *)half_offsets; 434 HVX_Vector pred_reg = *(HVX_Vector *)half_predicates; 435 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0); 436 437 *vgather = VSPLAT_H(gather_16_masked_init()); 438 VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets); 439 440 sync_gather(vgather); 441 } 442 443 /* gather the elements from the scatter32 buffer */ 444 void vector_gather_32(void) 445 { 446 HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32; 447 HVX_Vector *vgatherhi = 448 (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2)); 449 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; 450 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 451 452 VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo); 453 VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi); 454 455 sync_gather(vgatherhi); 456 } 457 458 static unsigned int gather_32_masked_init(void) 459 { 460 char letter = '?'; 461 return letter | (letter << 8) | (letter << 16) | (letter << 24); 462 } 463 464 void vector_gather_32_masked(void) 465 { 466 HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32; 467 HVX_Vector *vgatherhi = 468 (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2)); 469 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets; 470 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 471 HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates; 472 HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0); 473 HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; 474 HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0); 475 476 *vgatherlo = VSPLAT_H(gather_32_masked_init()); 477 *vgatherhi = VSPLAT_H(gather_32_masked_init()); 478 VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len, 479 offsetslo); 480 VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len, 481 offsetshi); 482 483 sync_gather(vgatherlo); 484 sync_gather(vgatherhi); 485 } 486 487 /* gather the elements from the scatter16_32 buffer */ 488 void vector_gather_16_32(void) 489 { 490 HVX_Vector *vgather; 491 HVX_VectorPair offsets; 492 HVX_Vector values; 493 494 /* get the vtcm address to gather from */ 495 vgather = (HVX_Vector *)&vtcm.vgather16_32; 496 497 /* get the word offsets in a vector pair */ 498 offsets = *(HVX_VectorPair *)word_offsets; 499 500 VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets); 501 502 /* deal the elements to get the order back */ 503 values = *(HVX_Vector *)vgather; 504 values = VDEAL_H(values); 505 506 /* write it back to vtcm address */ 507 *(HVX_Vector *)vgather = values; 508 } 509 510 void vector_gather_16_32_masked(void) 511 { 512 HVX_Vector *vgather; 513 HVX_VectorPair offsets; 514 HVX_Vector pred_reg; 515 HVX_VectorPred preds; 516 HVX_Vector values; 517 518 /* get the vtcm address to gather from */ 519 vgather = (HVX_Vector *)&vtcm.vgather16_32; 520 521 /* get the word offsets in a vector pair */ 522 offsets = *(HVX_VectorPair *)word_offsets; 523 pred_reg = *(HVX_Vector *)half_predicates; 524 pred_reg = VSHUFF_H(pred_reg); 525 preds = VAND_VAL(pred_reg, ~0); 526 527 *vgather = VSPLAT_H(gather_16_masked_init()); 528 VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len, 529 offsets); 530 531 /* deal the elements to get the order back */ 532 values = *(HVX_Vector *)vgather; 533 values = VDEAL_H(values); 534 535 /* write it back to vtcm address */ 536 *(HVX_Vector *)vgather = values; 537 } 538 539 static void check_buffer(const char *name, void *c, void *r, size_t size) 540 { 541 char *check = (char *)c; 542 char *ref = (char *)r; 543 for (int i = 0; i < size; i++) { 544 if (check[i] != ref[i]) { 545 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i, 546 check[i], check[i], ref[i], ref[i]); 547 err++; 548 } 549 } 550 } 551 552 /* 553 * These scalar functions are the C equivalents of the vector functions that 554 * use HVX 555 */ 556 557 /* scatter the 16 bit elements using C */ 558 void scalar_scatter_16(unsigned short *vscatter16) 559 { 560 for (int i = 0; i < MATRIX_SIZE; ++i) { 561 vscatter16[half_offsets[i] / 2] = half_values[i]; 562 } 563 } 564 565 void check_scatter_16() 566 { 567 memset(vscatter16_ref, FILL_CHAR, 568 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 569 scalar_scatter_16(vscatter16_ref); 570 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 571 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 572 } 573 574 /* scatter the 16 bit elements using C */ 575 void scalar_scatter_16_acc(unsigned short *vscatter16) 576 { 577 for (int i = 0; i < MATRIX_SIZE; ++i) { 578 vscatter16[half_offsets[i] / 2] += half_values_acc[i]; 579 } 580 } 581 582 void check_scatter_16_acc() 583 { 584 memset(vscatter16_ref, FILL_CHAR, 585 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 586 scalar_scatter_16(vscatter16_ref); 587 scalar_scatter_16_acc(vscatter16_ref); 588 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 589 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 590 } 591 592 /* scatter the 16 bit elements using C */ 593 void scalar_scatter_16_masked(unsigned short *vscatter16) 594 { 595 for (int i = 0; i < MATRIX_SIZE; i++) { 596 if (half_predicates[i]) { 597 vscatter16[half_offsets[i] / 2] = half_values_masked[i]; 598 } 599 } 600 601 } 602 603 void check_scatter_16_masked() 604 { 605 memset(vscatter16_ref, FILL_CHAR, 606 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 607 scalar_scatter_16(vscatter16_ref); 608 scalar_scatter_16_acc(vscatter16_ref); 609 scalar_scatter_16_masked(vscatter16_ref); 610 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 611 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 612 } 613 614 /* scatter the 32 bit elements using C */ 615 void scalar_scatter_32(unsigned int *vscatter32) 616 { 617 for (int i = 0; i < MATRIX_SIZE; ++i) { 618 vscatter32[word_offsets[i] / 4] = word_values[i]; 619 } 620 } 621 622 void check_scatter_32() 623 { 624 memset(vscatter32_ref, FILL_CHAR, 625 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 626 scalar_scatter_32(vscatter32_ref); 627 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 628 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 629 } 630 631 /* scatter the 32 bit elements using C */ 632 void scalar_scatter_32_acc(unsigned int *vscatter32) 633 { 634 for (int i = 0; i < MATRIX_SIZE; ++i) { 635 vscatter32[word_offsets[i] / 4] += word_values_acc[i]; 636 } 637 } 638 639 void check_scatter_32_acc() 640 { 641 memset(vscatter32_ref, FILL_CHAR, 642 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 643 scalar_scatter_32(vscatter32_ref); 644 scalar_scatter_32_acc(vscatter32_ref); 645 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 646 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 647 } 648 649 /* scatter the 32 bit elements using C */ 650 void scalar_scatter_32_masked(unsigned int *vscatter32) 651 { 652 for (int i = 0; i < MATRIX_SIZE; i++) { 653 if (word_predicates[i]) { 654 vscatter32[word_offsets[i] / 4] = word_values_masked[i]; 655 } 656 } 657 } 658 659 void check_scatter_32_masked() 660 { 661 memset(vscatter32_ref, FILL_CHAR, 662 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 663 scalar_scatter_32(vscatter32_ref); 664 scalar_scatter_32_acc(vscatter32_ref); 665 scalar_scatter_32_masked(vscatter32_ref); 666 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 667 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 668 } 669 670 /* scatter the 32 bit elements using C */ 671 void scalar_scatter_16_32(unsigned short *vscatter16_32) 672 { 673 for (int i = 0; i < MATRIX_SIZE; ++i) { 674 vscatter16_32[word_offsets[i] / 2] = half_values[i]; 675 } 676 } 677 678 void check_scatter_16_32() 679 { 680 memset(vscatter16_32_ref, FILL_CHAR, 681 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 682 scalar_scatter_16_32(vscatter16_32_ref); 683 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 684 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 685 } 686 687 /* scatter the 32 bit elements using C */ 688 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32) 689 { 690 for (int i = 0; i < MATRIX_SIZE; ++i) { 691 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i]; 692 } 693 } 694 695 void check_scatter_16_32_acc() 696 { 697 memset(vscatter16_32_ref, FILL_CHAR, 698 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 699 scalar_scatter_16_32(vscatter16_32_ref); 700 scalar_scatter_16_32_acc(vscatter16_32_ref); 701 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 702 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 703 } 704 705 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32) 706 { 707 for (int i = 0; i < MATRIX_SIZE; i++) { 708 if (half_predicates[i]) { 709 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i]; 710 } 711 } 712 } 713 714 void check_scatter_16_32_masked() 715 { 716 memset(vscatter16_32_ref, FILL_CHAR, 717 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 718 scalar_scatter_16_32(vscatter16_32_ref); 719 scalar_scatter_16_32_acc(vscatter16_32_ref); 720 scalar_scatter_16_32_masked(vscatter16_32_ref); 721 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 722 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 723 } 724 725 /* gather the elements from the scatter buffer using C */ 726 void scalar_gather_16(unsigned short *vgather16) 727 { 728 for (int i = 0; i < MATRIX_SIZE; ++i) { 729 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; 730 } 731 } 732 733 void check_gather_16() 734 { 735 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); 736 scalar_gather_16(vgather16_ref); 737 check_buffer(__func__, vtcm.vgather16, vgather16_ref, 738 MATRIX_SIZE * sizeof(unsigned short)); 739 } 740 741 void scalar_gather_16_masked(unsigned short *vgather16) 742 { 743 for (int i = 0; i < MATRIX_SIZE; ++i) { 744 if (half_predicates[i]) { 745 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; 746 } 747 } 748 } 749 750 void check_gather_16_masked() 751 { 752 memset(vgather16_ref, gather_16_masked_init(), 753 MATRIX_SIZE * sizeof(unsigned short)); 754 scalar_gather_16_masked(vgather16_ref); 755 check_buffer(__func__, vtcm.vgather16, vgather16_ref, 756 MATRIX_SIZE * sizeof(unsigned short)); 757 } 758 759 /* gather the elements from the scatter buffer using C */ 760 void scalar_gather_32(unsigned int *vgather32) 761 { 762 for (int i = 0; i < MATRIX_SIZE; ++i) { 763 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; 764 } 765 } 766 767 void check_gather_32(void) 768 { 769 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int)); 770 scalar_gather_32(vgather32_ref); 771 check_buffer(__func__, vtcm.vgather32, vgather32_ref, 772 MATRIX_SIZE * sizeof(unsigned int)); 773 } 774 775 void scalar_gather_32_masked(unsigned int *vgather32) 776 { 777 for (int i = 0; i < MATRIX_SIZE; ++i) { 778 if (word_predicates[i]) { 779 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; 780 } 781 } 782 } 783 784 785 void check_gather_32_masked(void) 786 { 787 memset(vgather32_ref, gather_32_masked_init(), 788 MATRIX_SIZE * sizeof(unsigned int)); 789 scalar_gather_32_masked(vgather32_ref); 790 check_buffer(__func__, vtcm.vgather32, 791 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int)); 792 } 793 794 /* gather the elements from the scatter buffer using C */ 795 void scalar_gather_16_32(unsigned short *vgather16_32) 796 { 797 for (int i = 0; i < MATRIX_SIZE; ++i) { 798 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; 799 } 800 } 801 802 void check_gather_16_32(void) 803 { 804 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); 805 scalar_gather_16_32(vgather16_32_ref); 806 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, 807 MATRIX_SIZE * sizeof(unsigned short)); 808 } 809 810 void scalar_gather_16_32_masked(unsigned short *vgather16_32) 811 { 812 for (int i = 0; i < MATRIX_SIZE; ++i) { 813 if (half_predicates[i]) { 814 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; 815 } 816 } 817 818 } 819 820 void check_gather_16_32_masked(void) 821 { 822 memset(vgather16_32_ref, gather_16_masked_init(), 823 MATRIX_SIZE * sizeof(unsigned short)); 824 scalar_gather_16_32_masked(vgather16_32_ref); 825 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, 826 MATRIX_SIZE * sizeof(unsigned short)); 827 } 828 829 /* print scatter16 buffer */ 830 void print_scatter16_buffer(void) 831 { 832 if (PRINT_DATA) { 833 printf("\n\nPrinting the 16 bit scatter buffer"); 834 835 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 836 if ((i % MATRIX_SIZE) == 0) { 837 printf("\n"); 838 } 839 for (int j = 0; j < 2; j++) { 840 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff)); 841 } 842 printf(" "); 843 } 844 printf("\n"); 845 } 846 } 847 848 /* print the gather 16 buffer */ 849 void print_gather_result_16(void) 850 { 851 if (PRINT_DATA) { 852 printf("\n\nPrinting the 16 bit gather result\n"); 853 854 for (int i = 0; i < MATRIX_SIZE; i++) { 855 for (int j = 0; j < 2; j++) { 856 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff)); 857 } 858 printf(" "); 859 } 860 printf("\n"); 861 } 862 } 863 864 /* print the scatter32 buffer */ 865 void print_scatter32_buffer(void) 866 { 867 if (PRINT_DATA) { 868 printf("\n\nPrinting the 32 bit scatter buffer"); 869 870 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 871 if ((i % MATRIX_SIZE) == 0) { 872 printf("\n"); 873 } 874 for (int j = 0; j < 4; j++) { 875 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff)); 876 } 877 printf(" "); 878 } 879 printf("\n"); 880 } 881 } 882 883 /* print the gather 32 buffer */ 884 void print_gather_result_32(void) 885 { 886 if (PRINT_DATA) { 887 printf("\n\nPrinting the 32 bit gather result\n"); 888 889 for (int i = 0; i < MATRIX_SIZE; i++) { 890 for (int j = 0; j < 4; j++) { 891 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff)); 892 } 893 printf(" "); 894 } 895 printf("\n"); 896 } 897 } 898 899 /* print the scatter16_32 buffer */ 900 void print_scatter16_32_buffer(void) 901 { 902 if (PRINT_DATA) { 903 printf("\n\nPrinting the 16_32 bit scatter buffer"); 904 905 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 906 if ((i % MATRIX_SIZE) == 0) { 907 printf("\n"); 908 } 909 for (int j = 0; j < 2; j++) { 910 printf("%c", 911 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff)); 912 } 913 printf(" "); 914 } 915 printf("\n"); 916 } 917 } 918 919 /* print the gather 16_32 buffer */ 920 void print_gather_result_16_32(void) 921 { 922 if (PRINT_DATA) { 923 printf("\n\nPrinting the 16_32 bit gather result\n"); 924 925 for (int i = 0; i < MATRIX_SIZE; i++) { 926 for (int j = 0; j < 2; j++) { 927 printf("%c", 928 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff)); 929 } 930 printf(" "); 931 } 932 printf("\n"); 933 } 934 } 935 936 int main() 937 { 938 prefill_vtcm_scratch(); 939 940 /* 16 bit elements with 16 bit offsets */ 941 create_offsets_values_preds_16(); 942 943 vector_scatter_16(); 944 print_scatter16_buffer(); 945 check_scatter_16(); 946 947 vector_gather_16(); 948 print_gather_result_16(); 949 check_gather_16(); 950 951 vector_gather_16_masked(); 952 print_gather_result_16(); 953 check_gather_16_masked(); 954 955 vector_scatter_16_acc(); 956 print_scatter16_buffer(); 957 check_scatter_16_acc(); 958 959 vector_scatter_16_masked(); 960 print_scatter16_buffer(); 961 check_scatter_16_masked(); 962 963 /* 32 bit elements with 32 bit offsets */ 964 create_offsets_values_preds_32(); 965 966 vector_scatter_32(); 967 print_scatter32_buffer(); 968 check_scatter_32(); 969 970 vector_gather_32(); 971 print_gather_result_32(); 972 check_gather_32(); 973 974 vector_gather_32_masked(); 975 print_gather_result_32(); 976 check_gather_32_masked(); 977 978 vector_scatter_32_acc(); 979 print_scatter32_buffer(); 980 check_scatter_32_acc(); 981 982 vector_scatter_32_masked(); 983 print_scatter32_buffer(); 984 check_scatter_32_masked(); 985 986 /* 16 bit elements with 32 bit offsets */ 987 create_offsets_values_preds_16_32(); 988 989 vector_scatter_16_32(); 990 print_scatter16_32_buffer(); 991 check_scatter_16_32(); 992 993 vector_gather_16_32(); 994 print_gather_result_16_32(); 995 check_gather_16_32(); 996 997 vector_gather_16_32_masked(); 998 print_gather_result_16_32(); 999 check_gather_16_32_masked(); 1000 1001 vector_scatter_16_32_acc(); 1002 print_scatter16_32_buffer(); 1003 check_scatter_16_32_acc(); 1004 1005 vector_scatter_16_32_masked(); 1006 print_scatter16_32_buffer(); 1007 check_scatter_16_32_masked(); 1008 1009 puts(err ? "FAIL" : "PASS"); 1010 return err; 1011 } 1012