1 /* 2 * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 /* 19 * This example tests the HVX scatter/gather instructions 20 * 21 * See section 5.13 of the V68 HVX Programmer's Reference 22 * 23 * There are 3 main classes operations 24 * _16 16-bit elements and 16-bit offsets 25 * _32 32-bit elements and 32-bit offsets 26 * _16_32 16-bit elements and 32-bit offsets 27 * 28 * There are also masked and accumulate versions 29 */ 30 31 #include <stdio.h> 32 #include <string.h> 33 #include <stdlib.h> 34 #include <inttypes.h> 35 36 typedef long HVX_Vector __attribute__((__vector_size__(128))) 37 __attribute__((aligned(128))); 38 typedef long HVX_VectorPair __attribute__((__vector_size__(256))) 39 __attribute__((aligned(128))); 40 typedef long HVX_VectorPred __attribute__((__vector_size__(128))) 41 __attribute__((aligned(128))); 42 43 int err; 44 45 /* define the number of rows/cols in a square matrix */ 46 #define MATRIX_SIZE 64 47 48 /* define the size of the scatter buffer */ 49 #define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE) 50 51 /* fake vtcm - put buffers together and force alignment */ 52 static struct { 53 unsigned short vscatter16[SCATTER_BUFFER_SIZE]; 54 unsigned short vgather16[MATRIX_SIZE]; 55 unsigned int vscatter32[SCATTER_BUFFER_SIZE]; 56 unsigned int vgather32[MATRIX_SIZE]; 57 unsigned short vscatter16_32[SCATTER_BUFFER_SIZE]; 58 unsigned short vgather16_32[MATRIX_SIZE]; 59 } vtcm __attribute__((aligned(0x10000))); 60 61 /* declare the arrays of reference values */ 62 unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE]; 63 unsigned short vgather16_ref[MATRIX_SIZE]; 64 unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE]; 65 unsigned int vgather32_ref[MATRIX_SIZE]; 66 unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE]; 67 unsigned short vgather16_32_ref[MATRIX_SIZE]; 68 69 /* declare the arrays of offsets */ 70 unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128))); 71 unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128))); 72 73 /* declare the arrays of values */ 74 unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128))); 75 unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128))); 76 unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128))); 77 unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128))); 78 unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128))); 79 unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128))); 80 81 /* declare the arrays of predicates */ 82 unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128))); 83 unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128))); 84 85 /* make this big enough for all the operations */ 86 const size_t region_len = sizeof(vtcm); 87 88 /* optionally add sync instructions */ 89 #define SYNC_VECTOR 1 90 91 static void sync_scatter(void *addr) 92 { 93 #if SYNC_VECTOR 94 /* 95 * Do the scatter release followed by a dummy load to complete the 96 * synchronization. Normally the dummy load would be deferred as 97 * long as possible to minimize stalls. 98 */ 99 asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr)); 100 /* use volatile to force the load */ 101 volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy; 102 #endif 103 } 104 105 static void sync_gather(void *addr) 106 { 107 #if SYNC_VECTOR 108 /* use volatile to force the load */ 109 volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy; 110 #endif 111 } 112 113 /* optionally print the results */ 114 #define PRINT_DATA 0 115 116 #define FILL_CHAR '.' 117 118 /* fill vtcm scratch with ee */ 119 void prefill_vtcm_scratch(void) 120 { 121 memset(&vtcm, FILL_CHAR, sizeof(vtcm)); 122 } 123 124 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */ 125 void create_offsets_values_preds_16(void) 126 { 127 unsigned short half_element = 0; 128 unsigned short half_element_masked = 0; 129 char letter = 'A'; 130 char letter_masked = '@'; 131 132 for (int i = 0; i < MATRIX_SIZE; i++) { 133 half_offsets[i] = i * (2 * MATRIX_SIZE + 2); 134 135 half_element = 0; 136 half_element_masked = 0; 137 for (int j = 0; j < 2; j++) { 138 half_element |= letter << j * 8; 139 half_element_masked |= letter_masked << j * 8; 140 } 141 142 half_values[i] = half_element; 143 half_values_acc[i] = ((i % 10) << 8) + (i % 10); 144 half_values_masked[i] = half_element_masked; 145 146 letter++; 147 /* reset to 'A' */ 148 if (letter == 'M') { 149 letter = 'A'; 150 } 151 152 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0; 153 } 154 } 155 156 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */ 157 void create_offsets_values_preds_32(void) 158 { 159 unsigned int word_element = 0; 160 unsigned int word_element_masked = 0; 161 char letter = 'A'; 162 char letter_masked = '&'; 163 164 for (int i = 0; i < MATRIX_SIZE; i++) { 165 word_offsets[i] = i * (4 * MATRIX_SIZE + 4); 166 167 word_element = 0; 168 word_element_masked = 0; 169 for (int j = 0; j < 4; j++) { 170 word_element |= letter << j * 8; 171 word_element_masked |= letter_masked << j * 8; 172 } 173 174 word_values[i] = word_element; 175 word_values_acc[i] = ((i % 10) << 8) + (i % 10); 176 word_values_masked[i] = word_element_masked; 177 178 letter++; 179 /* reset to 'A' */ 180 if (letter == 'M') { 181 letter = 'A'; 182 } 183 184 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0; 185 } 186 } 187 188 /* 189 * create byte offsets to be a diagonal of the matrix with 16 bit elements 190 * and 32 bit offsets 191 */ 192 void create_offsets_values_preds_16_32(void) 193 { 194 unsigned short half_element = 0; 195 unsigned short half_element_masked = 0; 196 char letter = 'D'; 197 char letter_masked = '$'; 198 199 for (int i = 0; i < MATRIX_SIZE; i++) { 200 word_offsets[i] = i * (2 * MATRIX_SIZE + 2); 201 202 half_element = 0; 203 half_element_masked = 0; 204 for (int j = 0; j < 2; j++) { 205 half_element |= letter << j * 8; 206 half_element_masked |= letter_masked << j * 8; 207 } 208 209 half_values[i] = half_element; 210 half_values_acc[i] = ((i % 10) << 8) + (i % 10); 211 half_values_masked[i] = half_element_masked; 212 213 letter++; 214 /* reset to 'A' */ 215 if (letter == 'P') { 216 letter = 'D'; 217 } 218 219 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0; 220 } 221 } 222 223 /* scatter the 16 bit elements using HVX */ 224 void vector_scatter_16(void) 225 { 226 asm ("m0 = %1\n\t" 227 "v0 = vmem(%2 + #0)\n\t" 228 "v1 = vmem(%3 + #0)\n\t" 229 "vscatter(%0, m0, v0.h).h = v1\n\t" 230 : : "r"(vtcm.vscatter16), "r"(region_len), 231 "r"(half_offsets), "r"(half_values) 232 : "m0", "v0", "v1", "memory"); 233 234 sync_scatter(vtcm.vscatter16); 235 } 236 237 /* scatter-accumulate the 16 bit elements using HVX */ 238 void vector_scatter_16_acc(void) 239 { 240 asm ("m0 = %1\n\t" 241 "v0 = vmem(%2 + #0)\n\t" 242 "v1 = vmem(%3 + #0)\n\t" 243 "vscatter(%0, m0, v0.h).h += v1\n\t" 244 : : "r"(vtcm.vscatter16), "r"(region_len), 245 "r"(half_offsets), "r"(half_values_acc) 246 : "m0", "v0", "v1", "memory"); 247 248 sync_scatter(vtcm.vscatter16); 249 } 250 251 /* masked scatter the 16 bit elements using HVX */ 252 void vector_scatter_16_masked(void) 253 { 254 asm ("r1 = #-1\n\t" 255 "v0 = vmem(%0 + #0)\n\t" 256 "q0 = vand(v0, r1)\n\t" 257 "m0 = %2\n\t" 258 "v0 = vmem(%3 + #0)\n\t" 259 "v1 = vmem(%4 + #0)\n\t" 260 "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t" 261 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len), 262 "r"(half_offsets), "r"(half_values_masked) 263 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); 264 265 sync_scatter(vtcm.vscatter16); 266 } 267 268 /* scatter the 32 bit elements using HVX */ 269 void vector_scatter_32(void) 270 { 271 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; 272 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 273 HVX_Vector *valueslo = (HVX_Vector *)word_values; 274 HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2]; 275 276 asm ("m0 = %1\n\t" 277 "v0 = vmem(%2 + #0)\n\t" 278 "v1 = vmem(%3 + #0)\n\t" 279 "vscatter(%0, m0, v0.w).w = v1\n\t" 280 : : "r"(vtcm.vscatter32), "r"(region_len), 281 "r"(offsetslo), "r"(valueslo) 282 : "m0", "v0", "v1", "memory"); 283 asm ("m0 = %1\n\t" 284 "v0 = vmem(%2 + #0)\n\t" 285 "v1 = vmem(%3 + #0)\n\t" 286 "vscatter(%0, m0, v0.w).w = v1\n\t" 287 : : "r"(vtcm.vscatter32), "r"(region_len), 288 "r"(offsetshi), "r"(valueshi) 289 : "m0", "v0", "v1", "memory"); 290 291 sync_scatter(vtcm.vscatter32); 292 } 293 294 /* scatter-accumulate the 32 bit elements using HVX */ 295 void vector_scatter_32_acc(void) 296 { 297 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; 298 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 299 HVX_Vector *valueslo = (HVX_Vector *)word_values_acc; 300 HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2]; 301 302 asm ("m0 = %1\n\t" 303 "v0 = vmem(%2 + #0)\n\t" 304 "v1 = vmem(%3 + #0)\n\t" 305 "vscatter(%0, m0, v0.w).w += v1\n\t" 306 : : "r"(vtcm.vscatter32), "r"(region_len), 307 "r"(offsetslo), "r"(valueslo) 308 : "m0", "v0", "v1", "memory"); 309 asm ("m0 = %1\n\t" 310 "v0 = vmem(%2 + #0)\n\t" 311 "v1 = vmem(%3 + #0)\n\t" 312 "vscatter(%0, m0, v0.w).w += v1\n\t" 313 : : "r"(vtcm.vscatter32), "r"(region_len), 314 "r"(offsetshi), "r"(valueshi) 315 : "m0", "v0", "v1", "memory"); 316 317 sync_scatter(vtcm.vscatter32); 318 } 319 320 /* masked scatter the 32 bit elements using HVX */ 321 void vector_scatter_32_masked(void) 322 { 323 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; 324 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 325 HVX_Vector *valueslo = (HVX_Vector *)word_values_masked; 326 HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2]; 327 HVX_Vector *predslo = (HVX_Vector *)word_predicates; 328 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; 329 330 asm ("r1 = #-1\n\t" 331 "v0 = vmem(%0 + #0)\n\t" 332 "q0 = vand(v0, r1)\n\t" 333 "m0 = %2\n\t" 334 "v0 = vmem(%3 + #0)\n\t" 335 "v1 = vmem(%4 + #0)\n\t" 336 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" 337 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len), 338 "r"(offsetslo), "r"(valueslo) 339 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); 340 asm ("r1 = #-1\n\t" 341 "v0 = vmem(%0 + #0)\n\t" 342 "q0 = vand(v0, r1)\n\t" 343 "m0 = %2\n\t" 344 "v0 = vmem(%3 + #0)\n\t" 345 "v1 = vmem(%4 + #0)\n\t" 346 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" 347 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len), 348 "r"(offsetshi), "r"(valueshi) 349 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); 350 351 sync_scatter(vtcm.vscatter32); 352 } 353 354 /* scatter the 16 bit elements with 32 bit offsets using HVX */ 355 void vector_scatter_16_32(void) 356 { 357 asm ("m0 = %1\n\t" 358 "v0 = vmem(%2 + #0)\n\t" 359 "v1 = vmem(%2 + #1)\n\t" 360 "v2 = vmem(%3 + #0)\n\t" 361 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */ 362 "vscatter(%0, m0, v1:0.w).h = v2\n\t" 363 : : "r"(vtcm.vscatter16_32), "r"(region_len), 364 "r"(word_offsets), "r"(half_values) 365 : "m0", "v0", "v1", "v2", "memory"); 366 367 sync_scatter(vtcm.vscatter16_32); 368 } 369 370 /* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */ 371 void vector_scatter_16_32_acc(void) 372 { 373 asm ("m0 = %1\n\t" 374 "v0 = vmem(%2 + #0)\n\t" 375 "v1 = vmem(%2 + #1)\n\t" 376 "v2 = vmem(%3 + #0)\n\t" \ 377 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */ 378 "vscatter(%0, m0, v1:0.w).h += v2\n\t" 379 : : "r"(vtcm.vscatter16_32), "r"(region_len), 380 "r"(word_offsets), "r"(half_values_acc) 381 : "m0", "v0", "v1", "v2", "memory"); 382 383 sync_scatter(vtcm.vscatter16_32); 384 } 385 386 /* masked scatter the 16 bit elements with 32 bit offsets using HVX */ 387 void vector_scatter_16_32_masked(void) 388 { 389 asm ("r1 = #-1\n\t" 390 "v0 = vmem(%0 + #0)\n\t" 391 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */ 392 "q0 = vand(v0, r1)\n\t" 393 "m0 = %2\n\t" 394 "v0 = vmem(%3 + #0)\n\t" 395 "v1 = vmem(%3 + #1)\n\t" 396 "v2 = vmem(%4 + #0)\n\t" \ 397 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */ 398 "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t" 399 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len), 400 "r"(word_offsets), "r"(half_values_masked) 401 : "r1", "q0", "m0", "v0", "v1", "v2", "memory"); 402 403 sync_scatter(vtcm.vscatter16_32); 404 } 405 406 /* gather the elements from the scatter16 buffer using HVX */ 407 void vector_gather_16(void) 408 { 409 asm ("m0 = %1\n\t" 410 "v0 = vmem(%2 + #0)\n\t" 411 "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t" 412 " vmem(%3 + #0) = vtmp.new }\n\t" 413 : : "r"(vtcm.vscatter16), "r"(region_len), 414 "r"(half_offsets), "r"(vtcm.vgather16) 415 : "m0", "v0", "memory"); 416 417 sync_gather(vtcm.vgather16); 418 } 419 420 static unsigned short gather_16_masked_init(void) 421 { 422 char letter = '?'; 423 return letter | (letter << 8); 424 } 425 426 /* masked gather the elements from the scatter16 buffer using HVX */ 427 void vector_gather_16_masked(void) 428 { 429 unsigned short init = gather_16_masked_init(); 430 431 asm ("v0.h = vsplat(%5)\n\t" 432 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ 433 "r1 = #-1\n\t" 434 "v0 = vmem(%0 + #0)\n\t" 435 "q0 = vand(v0, r1)\n\t" 436 "m0 = %2\n\t" 437 "v0 = vmem(%3 + #0)\n\t" 438 "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t" 439 " vmem(%4 + #0) = vtmp.new }\n\t" 440 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len), 441 "r"(half_offsets), "r"(vtcm.vgather16), "r"(init) 442 : "r1", "q0", "m0", "v0", "memory"); 443 444 sync_gather(vtcm.vgather16); 445 } 446 447 /* gather the elements from the scatter32 buffer using HVX */ 448 void vector_gather_32(void) 449 { 450 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32; 451 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2]; 452 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; 453 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 454 455 asm ("m0 = %1\n\t" 456 "v0 = vmem(%2 + #0)\n\t" 457 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t" 458 " vmem(%3 + #0) = vtmp.new }\n\t" 459 : : "r"(vtcm.vscatter32), "r"(region_len), 460 "r"(offsetslo), "r"(vgatherlo) 461 : "m0", "v0", "memory"); 462 asm ("m0 = %1\n\t" 463 "v0 = vmem(%2 + #0)\n\t" 464 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t" 465 " vmem(%3 + #0) = vtmp.new }\n\t" 466 : : "r"(vtcm.vscatter32), "r"(region_len), 467 "r"(offsetshi), "r"(vgatherhi) 468 : "m0", "v0", "memory"); 469 470 sync_gather(vgatherlo); 471 sync_gather(vgatherhi); 472 } 473 474 static unsigned int gather_32_masked_init(void) 475 { 476 char letter = '?'; 477 return letter | (letter << 8) | (letter << 16) | (letter << 24); 478 } 479 480 /* masked gather the elements from the scatter32 buffer using HVX */ 481 void vector_gather_32_masked(void) 482 { 483 unsigned int init = gather_32_masked_init(); 484 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32; 485 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2]; 486 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; 487 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; 488 HVX_Vector *predslo = (HVX_Vector *)word_predicates; 489 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; 490 491 asm ("v0.h = vsplat(%5)\n\t" 492 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ 493 "r1 = #-1\n\t" 494 "v0 = vmem(%0 + #0)\n\t" 495 "q0 = vand(v0, r1)\n\t" 496 "m0 = %2\n\t" 497 "v0 = vmem(%3 + #0)\n\t" 498 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t" 499 " vmem(%4 + #0) = vtmp.new }\n\t" 500 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len), 501 "r"(offsetslo), "r"(vgatherlo), "r"(init) 502 : "r1", "q0", "m0", "v0", "memory"); 503 asm ("v0.h = vsplat(%5)\n\t" 504 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ 505 "r1 = #-1\n\t" 506 "v0 = vmem(%0 + #0)\n\t" 507 "q0 = vand(v0, r1)\n\t" 508 "m0 = %2\n\t" 509 "v0 = vmem(%3 + #0)\n\t" 510 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t" 511 " vmem(%4 + #0) = vtmp.new }\n\t" 512 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len), 513 "r"(offsetshi), "r"(vgatherhi), "r"(init) 514 : "r1", "q0", "m0", "v0", "memory"); 515 516 sync_gather(vgatherlo); 517 sync_gather(vgatherhi); 518 } 519 520 /* gather the elements from the scatter16_32 buffer using HVX */ 521 void vector_gather_16_32(void) 522 { 523 asm ("m0 = %1\n\t" 524 "v0 = vmem(%2 + #0)\n\t" 525 "v1 = vmem(%2 + #1)\n\t" 526 "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t" 527 " vmem(%3 + #0) = vtmp.new }\n\t" 528 "v0 = vmem(%3 + #0)\n\t" 529 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */ 530 "vmem(%3 + #0) = v0\n\t" 531 : : "r"(vtcm.vscatter16_32), "r"(region_len), 532 "r"(word_offsets), "r"(vtcm.vgather16_32) 533 : "m0", "v0", "v1", "memory"); 534 535 sync_gather(vtcm.vgather16_32); 536 } 537 538 /* masked gather the elements from the scatter16_32 buffer using HVX */ 539 void vector_gather_16_32_masked(void) 540 { 541 unsigned short init = gather_16_masked_init(); 542 543 asm ("v0.h = vsplat(%5)\n\t" 544 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ 545 "r1 = #-1\n\t" 546 "v0 = vmem(%0 + #0)\n\t" 547 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */ 548 "q0 = vand(v0, r1)\n\t" 549 "m0 = %2\n\t" 550 "v0 = vmem(%3 + #0)\n\t" 551 "v1 = vmem(%3 + #1)\n\t" 552 "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t" 553 " vmem(%4 + #0) = vtmp.new }\n\t" 554 "v0 = vmem(%4 + #0)\n\t" 555 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */ 556 "vmem(%4 + #0) = v0\n\t" 557 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len), 558 "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init) 559 : "r1", "q0", "m0", "v0", "v1", "memory"); 560 561 sync_gather(vtcm.vgather16_32); 562 } 563 564 static void check_buffer(const char *name, void *c, void *r, size_t size) 565 { 566 char *check = (char *)c; 567 char *ref = (char *)r; 568 for (int i = 0; i < size; i++) { 569 if (check[i] != ref[i]) { 570 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i, 571 check[i], check[i], ref[i], ref[i]); 572 err++; 573 } 574 } 575 } 576 577 /* 578 * These scalar functions are the C equivalents of the vector functions that 579 * use HVX 580 */ 581 582 /* scatter the 16 bit elements using C */ 583 void scalar_scatter_16(unsigned short *vscatter16) 584 { 585 for (int i = 0; i < MATRIX_SIZE; ++i) { 586 vscatter16[half_offsets[i] / 2] = half_values[i]; 587 } 588 } 589 590 void check_scatter_16() 591 { 592 memset(vscatter16_ref, FILL_CHAR, 593 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 594 scalar_scatter_16(vscatter16_ref); 595 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 596 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 597 } 598 599 /* scatter the 16 bit elements using C */ 600 void scalar_scatter_16_acc(unsigned short *vscatter16) 601 { 602 for (int i = 0; i < MATRIX_SIZE; ++i) { 603 vscatter16[half_offsets[i] / 2] += half_values_acc[i]; 604 } 605 } 606 607 /* scatter-accumulate the 16 bit elements using C */ 608 void check_scatter_16_acc() 609 { 610 memset(vscatter16_ref, FILL_CHAR, 611 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 612 scalar_scatter_16(vscatter16_ref); 613 scalar_scatter_16_acc(vscatter16_ref); 614 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 615 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 616 } 617 618 /* masked scatter the 16 bit elements using C */ 619 void scalar_scatter_16_masked(unsigned short *vscatter16) 620 { 621 for (int i = 0; i < MATRIX_SIZE; i++) { 622 if (half_predicates[i]) { 623 vscatter16[half_offsets[i] / 2] = half_values_masked[i]; 624 } 625 } 626 627 } 628 629 void check_scatter_16_masked() 630 { 631 memset(vscatter16_ref, FILL_CHAR, 632 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 633 scalar_scatter_16(vscatter16_ref); 634 scalar_scatter_16_acc(vscatter16_ref); 635 scalar_scatter_16_masked(vscatter16_ref); 636 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, 637 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 638 } 639 640 /* scatter the 32 bit elements using C */ 641 void scalar_scatter_32(unsigned int *vscatter32) 642 { 643 for (int i = 0; i < MATRIX_SIZE; ++i) { 644 vscatter32[word_offsets[i] / 4] = word_values[i]; 645 } 646 } 647 648 void check_scatter_32() 649 { 650 memset(vscatter32_ref, FILL_CHAR, 651 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 652 scalar_scatter_32(vscatter32_ref); 653 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 654 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 655 } 656 657 /* scatter-accumulate the 32 bit elements using C */ 658 void scalar_scatter_32_acc(unsigned int *vscatter32) 659 { 660 for (int i = 0; i < MATRIX_SIZE; ++i) { 661 vscatter32[word_offsets[i] / 4] += word_values_acc[i]; 662 } 663 } 664 665 void check_scatter_32_acc() 666 { 667 memset(vscatter32_ref, FILL_CHAR, 668 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 669 scalar_scatter_32(vscatter32_ref); 670 scalar_scatter_32_acc(vscatter32_ref); 671 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 672 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 673 } 674 675 /* masked scatter the 32 bit elements using C */ 676 void scalar_scatter_32_masked(unsigned int *vscatter32) 677 { 678 for (int i = 0; i < MATRIX_SIZE; i++) { 679 if (word_predicates[i]) { 680 vscatter32[word_offsets[i] / 4] = word_values_masked[i]; 681 } 682 } 683 } 684 685 void check_scatter_32_masked() 686 { 687 memset(vscatter32_ref, FILL_CHAR, 688 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 689 scalar_scatter_32(vscatter32_ref); 690 scalar_scatter_32_acc(vscatter32_ref); 691 scalar_scatter_32_masked(vscatter32_ref); 692 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, 693 SCATTER_BUFFER_SIZE * sizeof(unsigned int)); 694 } 695 696 /* scatter the 16 bit elements with 32 bit offsets using C */ 697 void scalar_scatter_16_32(unsigned short *vscatter16_32) 698 { 699 for (int i = 0; i < MATRIX_SIZE; ++i) { 700 vscatter16_32[word_offsets[i] / 2] = half_values[i]; 701 } 702 } 703 704 void check_scatter_16_32() 705 { 706 memset(vscatter16_32_ref, FILL_CHAR, 707 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 708 scalar_scatter_16_32(vscatter16_32_ref); 709 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 710 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 711 } 712 713 /* scatter-accumulate the 16 bit elements with 32 bit offsets using C */ 714 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32) 715 { 716 for (int i = 0; i < MATRIX_SIZE; ++i) { 717 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i]; 718 } 719 } 720 721 void check_scatter_16_32_acc() 722 { 723 memset(vscatter16_32_ref, FILL_CHAR, 724 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 725 scalar_scatter_16_32(vscatter16_32_ref); 726 scalar_scatter_16_32_acc(vscatter16_32_ref); 727 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 728 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 729 } 730 731 /* masked scatter the 16 bit elements with 32 bit offsets using C */ 732 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32) 733 { 734 for (int i = 0; i < MATRIX_SIZE; i++) { 735 if (half_predicates[i]) { 736 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i]; 737 } 738 } 739 } 740 741 void check_scatter_16_32_masked() 742 { 743 memset(vscatter16_32_ref, FILL_CHAR, 744 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 745 scalar_scatter_16_32(vscatter16_32_ref); 746 scalar_scatter_16_32_acc(vscatter16_32_ref); 747 scalar_scatter_16_32_masked(vscatter16_32_ref); 748 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, 749 SCATTER_BUFFER_SIZE * sizeof(unsigned short)); 750 } 751 752 /* gather the elements from the scatter buffer using C */ 753 void scalar_gather_16(unsigned short *vgather16) 754 { 755 for (int i = 0; i < MATRIX_SIZE; ++i) { 756 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; 757 } 758 } 759 760 void check_gather_16() 761 { 762 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); 763 scalar_gather_16(vgather16_ref); 764 check_buffer(__func__, vtcm.vgather16, vgather16_ref, 765 MATRIX_SIZE * sizeof(unsigned short)); 766 } 767 768 /* masked gather the elements from the scatter buffer using C */ 769 void scalar_gather_16_masked(unsigned short *vgather16) 770 { 771 for (int i = 0; i < MATRIX_SIZE; ++i) { 772 if (half_predicates[i]) { 773 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; 774 } 775 } 776 } 777 778 void check_gather_16_masked() 779 { 780 memset(vgather16_ref, gather_16_masked_init(), 781 MATRIX_SIZE * sizeof(unsigned short)); 782 scalar_gather_16_masked(vgather16_ref); 783 check_buffer(__func__, vtcm.vgather16, vgather16_ref, 784 MATRIX_SIZE * sizeof(unsigned short)); 785 } 786 787 /* gather the elements from the scatter32 buffer using C */ 788 void scalar_gather_32(unsigned int *vgather32) 789 { 790 for (int i = 0; i < MATRIX_SIZE; ++i) { 791 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; 792 } 793 } 794 795 void check_gather_32(void) 796 { 797 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int)); 798 scalar_gather_32(vgather32_ref); 799 check_buffer(__func__, vtcm.vgather32, vgather32_ref, 800 MATRIX_SIZE * sizeof(unsigned int)); 801 } 802 803 /* masked gather the elements from the scatter32 buffer using C */ 804 void scalar_gather_32_masked(unsigned int *vgather32) 805 { 806 for (int i = 0; i < MATRIX_SIZE; ++i) { 807 if (word_predicates[i]) { 808 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; 809 } 810 } 811 } 812 813 void check_gather_32_masked(void) 814 { 815 memset(vgather32_ref, gather_32_masked_init(), 816 MATRIX_SIZE * sizeof(unsigned int)); 817 scalar_gather_32_masked(vgather32_ref); 818 check_buffer(__func__, vtcm.vgather32, 819 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int)); 820 } 821 822 /* gather the elements from the scatter16_32 buffer using C */ 823 void scalar_gather_16_32(unsigned short *vgather16_32) 824 { 825 for (int i = 0; i < MATRIX_SIZE; ++i) { 826 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; 827 } 828 } 829 830 void check_gather_16_32(void) 831 { 832 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); 833 scalar_gather_16_32(vgather16_32_ref); 834 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, 835 MATRIX_SIZE * sizeof(unsigned short)); 836 } 837 838 /* masked gather the elements from the scatter16_32 buffer using C */ 839 void scalar_gather_16_32_masked(unsigned short *vgather16_32) 840 { 841 for (int i = 0; i < MATRIX_SIZE; ++i) { 842 if (half_predicates[i]) { 843 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; 844 } 845 } 846 847 } 848 849 void check_gather_16_32_masked(void) 850 { 851 memset(vgather16_32_ref, gather_16_masked_init(), 852 MATRIX_SIZE * sizeof(unsigned short)); 853 scalar_gather_16_32_masked(vgather16_32_ref); 854 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, 855 MATRIX_SIZE * sizeof(unsigned short)); 856 } 857 858 /* print scatter16 buffer */ 859 void print_scatter16_buffer(void) 860 { 861 if (PRINT_DATA) { 862 printf("\n\nPrinting the 16 bit scatter buffer"); 863 864 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 865 if ((i % MATRIX_SIZE) == 0) { 866 printf("\n"); 867 } 868 for (int j = 0; j < 2; j++) { 869 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff)); 870 } 871 printf(" "); 872 } 873 printf("\n"); 874 } 875 } 876 877 /* print the gather 16 buffer */ 878 void print_gather_result_16(void) 879 { 880 if (PRINT_DATA) { 881 printf("\n\nPrinting the 16 bit gather result\n"); 882 883 for (int i = 0; i < MATRIX_SIZE; i++) { 884 for (int j = 0; j < 2; j++) { 885 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff)); 886 } 887 printf(" "); 888 } 889 printf("\n"); 890 } 891 } 892 893 /* print the scatter32 buffer */ 894 void print_scatter32_buffer(void) 895 { 896 if (PRINT_DATA) { 897 printf("\n\nPrinting the 32 bit scatter buffer"); 898 899 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 900 if ((i % MATRIX_SIZE) == 0) { 901 printf("\n"); 902 } 903 for (int j = 0; j < 4; j++) { 904 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff)); 905 } 906 printf(" "); 907 } 908 printf("\n"); 909 } 910 } 911 912 /* print the gather 32 buffer */ 913 void print_gather_result_32(void) 914 { 915 if (PRINT_DATA) { 916 printf("\n\nPrinting the 32 bit gather result\n"); 917 918 for (int i = 0; i < MATRIX_SIZE; i++) { 919 for (int j = 0; j < 4; j++) { 920 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff)); 921 } 922 printf(" "); 923 } 924 printf("\n"); 925 } 926 } 927 928 /* print the scatter16_32 buffer */ 929 void print_scatter16_32_buffer(void) 930 { 931 if (PRINT_DATA) { 932 printf("\n\nPrinting the 16_32 bit scatter buffer"); 933 934 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { 935 if ((i % MATRIX_SIZE) == 0) { 936 printf("\n"); 937 } 938 for (int j = 0; j < 2; j++) { 939 printf("%c", 940 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff)); 941 } 942 printf(" "); 943 } 944 printf("\n"); 945 } 946 } 947 948 /* print the gather 16_32 buffer */ 949 void print_gather_result_16_32(void) 950 { 951 if (PRINT_DATA) { 952 printf("\n\nPrinting the 16_32 bit gather result\n"); 953 954 for (int i = 0; i < MATRIX_SIZE; i++) { 955 for (int j = 0; j < 2; j++) { 956 printf("%c", 957 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff)); 958 } 959 printf(" "); 960 } 961 printf("\n"); 962 } 963 } 964 965 int main() 966 { 967 prefill_vtcm_scratch(); 968 969 /* 16 bit elements with 16 bit offsets */ 970 create_offsets_values_preds_16(); 971 972 vector_scatter_16(); 973 print_scatter16_buffer(); 974 check_scatter_16(); 975 976 vector_gather_16(); 977 print_gather_result_16(); 978 check_gather_16(); 979 980 vector_gather_16_masked(); 981 print_gather_result_16(); 982 check_gather_16_masked(); 983 984 vector_scatter_16_acc(); 985 print_scatter16_buffer(); 986 check_scatter_16_acc(); 987 988 vector_scatter_16_masked(); 989 print_scatter16_buffer(); 990 check_scatter_16_masked(); 991 992 /* 32 bit elements with 32 bit offsets */ 993 create_offsets_values_preds_32(); 994 995 vector_scatter_32(); 996 print_scatter32_buffer(); 997 check_scatter_32(); 998 999 vector_gather_32(); 1000 print_gather_result_32(); 1001 check_gather_32(); 1002 1003 vector_gather_32_masked(); 1004 print_gather_result_32(); 1005 check_gather_32_masked(); 1006 1007 vector_scatter_32_acc(); 1008 print_scatter32_buffer(); 1009 check_scatter_32_acc(); 1010 1011 vector_scatter_32_masked(); 1012 print_scatter32_buffer(); 1013 check_scatter_32_masked(); 1014 1015 /* 16 bit elements with 32 bit offsets */ 1016 create_offsets_values_preds_16_32(); 1017 1018 vector_scatter_16_32(); 1019 print_scatter16_32_buffer(); 1020 check_scatter_16_32(); 1021 1022 vector_gather_16_32(); 1023 print_gather_result_16_32(); 1024 check_gather_16_32(); 1025 1026 vector_gather_16_32_masked(); 1027 print_gather_result_16_32(); 1028 check_gather_16_32_masked(); 1029 1030 vector_scatter_16_32_acc(); 1031 print_scatter16_32_buffer(); 1032 check_scatter_16_32_acc(); 1033 1034 vector_scatter_16_32_masked(); 1035 print_scatter16_32_buffer(); 1036 check_scatter_16_32_masked(); 1037 1038 puts(err ? "FAIL" : "PASS"); 1039 return err; 1040 } 1041