1 /*
2 * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18 /*
19 * This example tests the HVX scatter/gather instructions
20 *
21 * See section 5.13 of the V68 HVX Programmer's Reference
22 *
 * There are 3 main classes of operations
24 * _16 16-bit elements and 16-bit offsets
25 * _32 32-bit elements and 32-bit offsets
26 * _16_32 16-bit elements and 32-bit offsets
27 *
28 * There are also masked and accumulate versions
29 */
30
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <inttypes.h>
35
/* HVX vectors are 128 bytes; a pair is two vectors; predicates are modeled
 * as a full vector here so they can be loaded with vmem. */
typedef long HVX_Vector __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));
typedef long HVX_VectorPair __attribute__((__vector_size__(256)))
    __attribute__((aligned(128)));
typedef long HVX_VectorPred __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));

/* cumulative error count; incremented by check_buffer, returned by main */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* fake vtcm - put buffers together and force alignment */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));

/* declare the arrays of reference values (filled by the scalar versions) */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets */
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of values */
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of predicates */
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));

/* make this big enough for all the operations */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions */
#define SYNC_VECTOR 1
90
/* Make a preceding vscatter visible before scalar accesses to addr. */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization. Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
104
/* Make a preceding vgather's result visible before scalar reads of addr. */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
112
/* optionally print the results */
#define PRINT_DATA 0

/* pattern byte used to pre-fill the vtcm scratch buffers */
#define FILL_CHAR '.'
117
/* fill vtcm scratch with the fill character */
void prefill_vtcm_scratch(void)
{
    /* one memset covers every scatter/gather buffer in the fake vtcm */
    memset(&vtcm, FILL_CHAR, sizeof(vtcm));
}
123
124 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
create_offsets_values_preds_16(void)125 void create_offsets_values_preds_16(void)
126 {
127 unsigned short half_element = 0;
128 unsigned short half_element_masked = 0;
129 char letter = 'A';
130 char letter_masked = '@';
131
132 for (int i = 0; i < MATRIX_SIZE; i++) {
133 half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
134
135 half_element = 0;
136 half_element_masked = 0;
137 for (int j = 0; j < 2; j++) {
138 half_element |= letter << j * 8;
139 half_element_masked |= letter_masked << j * 8;
140 }
141
142 half_values[i] = half_element;
143 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
144 half_values_masked[i] = half_element_masked;
145
146 letter++;
147 /* reset to 'A' */
148 if (letter == 'M') {
149 letter = 'A';
150 }
151
152 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
153 }
154 }
155
156 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
create_offsets_values_preds_32(void)157 void create_offsets_values_preds_32(void)
158 {
159 unsigned int word_element = 0;
160 unsigned int word_element_masked = 0;
161 char letter = 'A';
162 char letter_masked = '&';
163
164 for (int i = 0; i < MATRIX_SIZE; i++) {
165 word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
166
167 word_element = 0;
168 word_element_masked = 0;
169 for (int j = 0; j < 4; j++) {
170 word_element |= letter << j * 8;
171 word_element_masked |= letter_masked << j * 8;
172 }
173
174 word_values[i] = word_element;
175 word_values_acc[i] = ((i % 10) << 8) + (i % 10);
176 word_values_masked[i] = word_element_masked;
177
178 letter++;
179 /* reset to 'A' */
180 if (letter == 'M') {
181 letter = 'A';
182 }
183
184 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
185 }
186 }
187
188 /*
189 * create byte offsets to be a diagonal of the matrix with 16 bit elements
190 * and 32 bit offsets
191 */
create_offsets_values_preds_16_32(void)192 void create_offsets_values_preds_16_32(void)
193 {
194 unsigned short half_element = 0;
195 unsigned short half_element_masked = 0;
196 char letter = 'D';
197 char letter_masked = '$';
198
199 for (int i = 0; i < MATRIX_SIZE; i++) {
200 word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
201
202 half_element = 0;
203 half_element_masked = 0;
204 for (int j = 0; j < 2; j++) {
205 half_element |= letter << j * 8;
206 half_element_masked |= letter_masked << j * 8;
207 }
208
209 half_values[i] = half_element;
210 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
211 half_values_masked[i] = half_element_masked;
212
213 letter++;
214 /* reset to 'A' */
215 if (letter == 'P') {
216 letter = 'D';
217 }
218
219 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
220 }
221 }
222
223 /* scatter the 16 bit elements using HVX */
void vector_scatter_16(void)
{
    /* m0 = region length, v0 = byte offsets, v1 = half-word values */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h = v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}
236
237 /* scatter-accumulate the 16 bit elements using HVX */
void vector_scatter_16_acc(void)
{
    /* same as vector_scatter_16, but "+=" accumulates into the buffer */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h += v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}
250
251 /* masked scatter the 16 bit elements using HVX */
vector_scatter_16_masked(void)252 void vector_scatter_16_masked(void)
253 {
254 asm ("r1 = #-1\n\t"
255 "v0 = vmem(%0 + #0)\n\t"
256 "q0 = vand(v0, r1)\n\t"
257 "m0 = %2\n\t"
258 "v0 = vmem(%3 + #0)\n\t"
259 "v1 = vmem(%4 + #0)\n\t"
260 "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
261 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
262 "r"(half_offsets), "r"(half_values_masked)
263 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
264
265 sync_scatter(vtcm.vscatter16);
266 }
267
268 /* scatter the 32 bit elements using HVX */
void vector_scatter_32(void)
{
    /* 64 words = 256 bytes = two HVX vectors, so scatter in two halves */
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}
293
294 /* scatter-accumulate the 32 bit elements using HVX */
void vector_scatter_32_acc(void)
{
    /* two-vector split as in vector_scatter_32; "+=" accumulates */
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}
319
320 /* masked scatter the 32 bit elements using HVX */
vector_scatter_32_masked(void)321 void vector_scatter_32_masked(void)
322 {
323 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
324 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
325 HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
326 HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
327 HVX_Vector *predslo = (HVX_Vector *)word_predicates;
328 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
329
330 asm ("r1 = #-1\n\t"
331 "v0 = vmem(%0 + #0)\n\t"
332 "q0 = vand(v0, r1)\n\t"
333 "m0 = %2\n\t"
334 "v0 = vmem(%3 + #0)\n\t"
335 "v1 = vmem(%4 + #0)\n\t"
336 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
337 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
338 "r"(offsetslo), "r"(valueslo)
339 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
340 asm ("r1 = #-1\n\t"
341 "v0 = vmem(%0 + #0)\n\t"
342 "q0 = vand(v0, r1)\n\t"
343 "m0 = %2\n\t"
344 "v0 = vmem(%3 + #0)\n\t"
345 "v1 = vmem(%4 + #0)\n\t"
346 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
347 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
348 "r"(offsetshi), "r"(valueshi)
349 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
350
351 sync_scatter(vtcm.vscatter32);
352 }
353
354 /* scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32(void)
{
    /* 32-bit offsets form the vector pair v1:0; values shuffled to match */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h = v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}
369
370 /* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_acc(void)371 void vector_scatter_16_32_acc(void)
372 {
373 asm ("m0 = %1\n\t"
374 "v0 = vmem(%2 + #0)\n\t"
375 "v1 = vmem(%2 + #1)\n\t"
376 "v2 = vmem(%3 + #0)\n\t" \
377 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
378 "vscatter(%0, m0, v1:0.w).h += v2\n\t"
379 : : "r"(vtcm.vscatter16_32), "r"(region_len),
380 "r"(word_offsets), "r"(half_values_acc)
381 : "m0", "v0", "v1", "v2", "memory");
382
383 sync_scatter(vtcm.vscatter16_32);
384 }
385
386 /* masked scatter the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_masked(void)387 void vector_scatter_16_32_masked(void)
388 {
389 asm ("r1 = #-1\n\t"
390 "v0 = vmem(%0 + #0)\n\t"
391 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
392 "q0 = vand(v0, r1)\n\t"
393 "m0 = %2\n\t"
394 "v0 = vmem(%3 + #0)\n\t"
395 "v1 = vmem(%3 + #1)\n\t"
396 "v2 = vmem(%4 + #0)\n\t" \
397 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
398 "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
399 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
400 "r"(word_offsets), "r"(half_values_masked)
401 : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
402
403 sync_scatter(vtcm.vscatter16_32);
404 }
405
406 /* gather the elements from the scatter16 buffer using HVX */
void vector_gather_16(void)
{
    /* gather and new-value store must share one packet */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
         " vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16)
         : "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}
419
/* half-word fill pattern ("??") for the masked-gather destination */
static unsigned short gather_16_masked_init(void)
{
    unsigned short fill = '?';
    return (unsigned short)((fill << 8) | fill);
}
425
426 /* masked gather the elements from the scatter16 buffer using HVX */
void vector_gather_16_masked(void)
{
    unsigned short init = gather_16_masked_init();

    /* splat the init pattern first so unpredicated lanes keep it */
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
         " vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}
446
447 /* gather the elements from the scatter32 buffer using HVX */
void vector_gather_32(void)
{
    /* 64 words = two HVX vectors, so gather in two halves */
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         " vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo)
         : "m0", "v0", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         " vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi)
         : "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}
473
/* word fill pattern ("????") for the masked-gather destination */
static unsigned int gather_32_masked_init(void)
{
    unsigned int b = '?';
    return b | (b << 8) | (b << 16) | (b << 24);
}
479
480 /* masked gather the elements from the scatter32 buffer using HVX */
void vector_gather_32_masked(void)
{
    unsigned int init = gather_32_masked_init();
    /* two-vector split as in vector_gather_32 */
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    /* splat the init pattern first so unpredicated lanes keep it */
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         " vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         " vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}
519
520 /* gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32(void)
{
    /* gather with 32-bit offsets (pair v1:0), then vdeal to restore order */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
         " vmem(%3 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
         "vmem(%3 + #0) = v0\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32)
         : "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}
537
538 /* masked gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32_masked(void)
{
    unsigned short init = gather_16_masked_init();

    /* splat init, shuffle predicates to match element order, gather, vdeal */
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
         " vmem(%4 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%4 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
         "vmem(%4 + #0) = v0\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}
563
check_buffer(const char * name,void * c,void * r,size_t size)564 static void check_buffer(const char *name, void *c, void *r, size_t size)
565 {
566 char *check = (char *)c;
567 char *ref = (char *)r;
568 for (int i = 0; i < size; i++) {
569 if (check[i] != ref[i]) {
570 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
571 check[i], check[i], ref[i], ref[i]);
572 err++;
573 }
574 }
575 }
576
577 /*
578 * These scalar functions are the C equivalents of the vector functions that
579 * use HVX
580 */
581
582 /* scatter the 16 bit elements using C */
scalar_scatter_16(unsigned short * vscatter16)583 void scalar_scatter_16(unsigned short *vscatter16)
584 {
585 for (int i = 0; i < MATRIX_SIZE; ++i) {
586 vscatter16[half_offsets[i] / 2] = half_values[i];
587 }
588 }
589
check_scatter_16()590 void check_scatter_16()
591 {
592 memset(vscatter16_ref, FILL_CHAR,
593 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
594 scalar_scatter_16(vscatter16_ref);
595 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
596 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
597 }
598
599 /* scatter the 16 bit elements using C */
scalar_scatter_16_acc(unsigned short * vscatter16)600 void scalar_scatter_16_acc(unsigned short *vscatter16)
601 {
602 for (int i = 0; i < MATRIX_SIZE; ++i) {
603 vscatter16[half_offsets[i] / 2] += half_values_acc[i];
604 }
605 }
606
607 /* scatter-accumulate the 16 bit elements using C */
check_scatter_16_acc()608 void check_scatter_16_acc()
609 {
610 memset(vscatter16_ref, FILL_CHAR,
611 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612 scalar_scatter_16(vscatter16_ref);
613 scalar_scatter_16_acc(vscatter16_ref);
614 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
615 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
616 }
617
618 /* masked scatter the 16 bit elements using C */
scalar_scatter_16_masked(unsigned short * vscatter16)619 void scalar_scatter_16_masked(unsigned short *vscatter16)
620 {
621 for (int i = 0; i < MATRIX_SIZE; i++) {
622 if (half_predicates[i]) {
623 vscatter16[half_offsets[i] / 2] = half_values_masked[i];
624 }
625 }
626
627 }
628
check_scatter_16_masked()629 void check_scatter_16_masked()
630 {
631 memset(vscatter16_ref, FILL_CHAR,
632 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
633 scalar_scatter_16(vscatter16_ref);
634 scalar_scatter_16_acc(vscatter16_ref);
635 scalar_scatter_16_masked(vscatter16_ref);
636 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
637 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
638 }
639
640 /* scatter the 32 bit elements using C */
scalar_scatter_32(unsigned int * vscatter32)641 void scalar_scatter_32(unsigned int *vscatter32)
642 {
643 for (int i = 0; i < MATRIX_SIZE; ++i) {
644 vscatter32[word_offsets[i] / 4] = word_values[i];
645 }
646 }
647
check_scatter_32()648 void check_scatter_32()
649 {
650 memset(vscatter32_ref, FILL_CHAR,
651 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
652 scalar_scatter_32(vscatter32_ref);
653 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
654 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
655 }
656
657 /* scatter-accumulate the 32 bit elements using C */
scalar_scatter_32_acc(unsigned int * vscatter32)658 void scalar_scatter_32_acc(unsigned int *vscatter32)
659 {
660 for (int i = 0; i < MATRIX_SIZE; ++i) {
661 vscatter32[word_offsets[i] / 4] += word_values_acc[i];
662 }
663 }
664
check_scatter_32_acc()665 void check_scatter_32_acc()
666 {
667 memset(vscatter32_ref, FILL_CHAR,
668 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
669 scalar_scatter_32(vscatter32_ref);
670 scalar_scatter_32_acc(vscatter32_ref);
671 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
672 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
673 }
674
675 /* masked scatter the 32 bit elements using C */
scalar_scatter_32_masked(unsigned int * vscatter32)676 void scalar_scatter_32_masked(unsigned int *vscatter32)
677 {
678 for (int i = 0; i < MATRIX_SIZE; i++) {
679 if (word_predicates[i]) {
680 vscatter32[word_offsets[i] / 4] = word_values_masked[i];
681 }
682 }
683 }
684
check_scatter_32_masked()685 void check_scatter_32_masked()
686 {
687 memset(vscatter32_ref, FILL_CHAR,
688 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
689 scalar_scatter_32(vscatter32_ref);
690 scalar_scatter_32_acc(vscatter32_ref);
691 scalar_scatter_32_masked(vscatter32_ref);
692 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
693 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
694 }
695
696 /* scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32(unsigned short * vscatter16_32)697 void scalar_scatter_16_32(unsigned short *vscatter16_32)
698 {
699 for (int i = 0; i < MATRIX_SIZE; ++i) {
700 vscatter16_32[word_offsets[i] / 2] = half_values[i];
701 }
702 }
703
check_scatter_16_32()704 void check_scatter_16_32()
705 {
706 memset(vscatter16_32_ref, FILL_CHAR,
707 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
708 scalar_scatter_16_32(vscatter16_32_ref);
709 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
710 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
711 }
712
713 /* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_acc(unsigned short * vscatter16_32)714 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
715 {
716 for (int i = 0; i < MATRIX_SIZE; ++i) {
717 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
718 }
719 }
720
check_scatter_16_32_acc()721 void check_scatter_16_32_acc()
722 {
723 memset(vscatter16_32_ref, FILL_CHAR,
724 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
725 scalar_scatter_16_32(vscatter16_32_ref);
726 scalar_scatter_16_32_acc(vscatter16_32_ref);
727 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
728 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
729 }
730
731 /* masked scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_masked(unsigned short * vscatter16_32)732 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
733 {
734 for (int i = 0; i < MATRIX_SIZE; i++) {
735 if (half_predicates[i]) {
736 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
737 }
738 }
739 }
740
check_scatter_16_32_masked()741 void check_scatter_16_32_masked()
742 {
743 memset(vscatter16_32_ref, FILL_CHAR,
744 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
745 scalar_scatter_16_32(vscatter16_32_ref);
746 scalar_scatter_16_32_acc(vscatter16_32_ref);
747 scalar_scatter_16_32_masked(vscatter16_32_ref);
748 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
749 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
750 }
751
752 /* gather the elements from the scatter buffer using C */
scalar_gather_16(unsigned short * vgather16)753 void scalar_gather_16(unsigned short *vgather16)
754 {
755 for (int i = 0; i < MATRIX_SIZE; ++i) {
756 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
757 }
758 }
759
check_gather_16()760 void check_gather_16()
761 {
762 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
763 scalar_gather_16(vgather16_ref);
764 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
765 MATRIX_SIZE * sizeof(unsigned short));
766 }
767
768 /* masked gather the elements from the scatter buffer using C */
scalar_gather_16_masked(unsigned short * vgather16)769 void scalar_gather_16_masked(unsigned short *vgather16)
770 {
771 for (int i = 0; i < MATRIX_SIZE; ++i) {
772 if (half_predicates[i]) {
773 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
774 }
775 }
776 }
777
check_gather_16_masked()778 void check_gather_16_masked()
779 {
780 memset(vgather16_ref, gather_16_masked_init(),
781 MATRIX_SIZE * sizeof(unsigned short));
782 scalar_gather_16_masked(vgather16_ref);
783 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
784 MATRIX_SIZE * sizeof(unsigned short));
785 }
786
787 /* gather the elements from the scatter32 buffer using C */
scalar_gather_32(unsigned int * vgather32)788 void scalar_gather_32(unsigned int *vgather32)
789 {
790 for (int i = 0; i < MATRIX_SIZE; ++i) {
791 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
792 }
793 }
794
check_gather_32(void)795 void check_gather_32(void)
796 {
797 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
798 scalar_gather_32(vgather32_ref);
799 check_buffer(__func__, vtcm.vgather32, vgather32_ref,
800 MATRIX_SIZE * sizeof(unsigned int));
801 }
802
803 /* masked gather the elements from the scatter32 buffer using C */
scalar_gather_32_masked(unsigned int * vgather32)804 void scalar_gather_32_masked(unsigned int *vgather32)
805 {
806 for (int i = 0; i < MATRIX_SIZE; ++i) {
807 if (word_predicates[i]) {
808 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
809 }
810 }
811 }
812
check_gather_32_masked(void)813 void check_gather_32_masked(void)
814 {
815 memset(vgather32_ref, gather_32_masked_init(),
816 MATRIX_SIZE * sizeof(unsigned int));
817 scalar_gather_32_masked(vgather32_ref);
818 check_buffer(__func__, vtcm.vgather32,
819 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
820 }
821
822 /* gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32(unsigned short * vgather16_32)823 void scalar_gather_16_32(unsigned short *vgather16_32)
824 {
825 for (int i = 0; i < MATRIX_SIZE; ++i) {
826 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
827 }
828 }
829
check_gather_16_32(void)830 void check_gather_16_32(void)
831 {
832 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
833 scalar_gather_16_32(vgather16_32_ref);
834 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
835 MATRIX_SIZE * sizeof(unsigned short));
836 }
837
838 /* masked gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32_masked(unsigned short * vgather16_32)839 void scalar_gather_16_32_masked(unsigned short *vgather16_32)
840 {
841 for (int i = 0; i < MATRIX_SIZE; ++i) {
842 if (half_predicates[i]) {
843 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
844 }
845 }
846
847 }
848
check_gather_16_32_masked(void)849 void check_gather_16_32_masked(void)
850 {
851 memset(vgather16_32_ref, gather_16_masked_init(),
852 MATRIX_SIZE * sizeof(unsigned short));
853 scalar_gather_16_32_masked(vgather16_32_ref);
854 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
855 MATRIX_SIZE * sizeof(unsigned short));
856 }
857
858 /* print scatter16 buffer */
print_scatter16_buffer(void)859 void print_scatter16_buffer(void)
860 {
861 if (PRINT_DATA) {
862 printf("\n\nPrinting the 16 bit scatter buffer");
863
864 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
865 if ((i % MATRIX_SIZE) == 0) {
866 printf("\n");
867 }
868 for (int j = 0; j < 2; j++) {
869 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
870 }
871 printf(" ");
872 }
873 printf("\n");
874 }
875 }
876
877 /* print the gather 16 buffer */
print_gather_result_16(void)878 void print_gather_result_16(void)
879 {
880 if (PRINT_DATA) {
881 printf("\n\nPrinting the 16 bit gather result\n");
882
883 for (int i = 0; i < MATRIX_SIZE; i++) {
884 for (int j = 0; j < 2; j++) {
885 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
886 }
887 printf(" ");
888 }
889 printf("\n");
890 }
891 }
892
893 /* print the scatter32 buffer */
print_scatter32_buffer(void)894 void print_scatter32_buffer(void)
895 {
896 if (PRINT_DATA) {
897 printf("\n\nPrinting the 32 bit scatter buffer");
898
899 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
900 if ((i % MATRIX_SIZE) == 0) {
901 printf("\n");
902 }
903 for (int j = 0; j < 4; j++) {
904 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
905 }
906 printf(" ");
907 }
908 printf("\n");
909 }
910 }
911
912 /* print the gather 32 buffer */
print_gather_result_32(void)913 void print_gather_result_32(void)
914 {
915 if (PRINT_DATA) {
916 printf("\n\nPrinting the 32 bit gather result\n");
917
918 for (int i = 0; i < MATRIX_SIZE; i++) {
919 for (int j = 0; j < 4; j++) {
920 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
921 }
922 printf(" ");
923 }
924 printf("\n");
925 }
926 }
927
928 /* print the scatter16_32 buffer */
print_scatter16_32_buffer(void)929 void print_scatter16_32_buffer(void)
930 {
931 if (PRINT_DATA) {
932 printf("\n\nPrinting the 16_32 bit scatter buffer");
933
934 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
935 if ((i % MATRIX_SIZE) == 0) {
936 printf("\n");
937 }
938 for (int j = 0; j < 2; j++) {
939 printf("%c",
940 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
941 }
942 printf(" ");
943 }
944 printf("\n");
945 }
946 }
947
948 /* print the gather 16_32 buffer */
print_gather_result_16_32(void)949 void print_gather_result_16_32(void)
950 {
951 if (PRINT_DATA) {
952 printf("\n\nPrinting the 16_32 bit gather result\n");
953
954 for (int i = 0; i < MATRIX_SIZE; i++) {
955 for (int j = 0; j < 2; j++) {
956 printf("%c",
957 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
958 }
959 printf(" ");
960 }
961 printf("\n");
962 }
963 }
964
/* run every HVX scatter/gather variant and diff against the scalar refs */
int main()
{
    prefill_vtcm_scratch();

    /* 16 bit elements with 16 bit offsets */
    create_offsets_values_preds_16();

    vector_scatter_16();
    print_scatter16_buffer();
    check_scatter_16();

    vector_gather_16();
    print_gather_result_16();
    check_gather_16();

    vector_gather_16_masked();
    print_gather_result_16();
    check_gather_16_masked();

    vector_scatter_16_acc();
    print_scatter16_buffer();
    check_scatter_16_acc();

    vector_scatter_16_masked();
    print_scatter16_buffer();
    check_scatter_16_masked();

    /* 32 bit elements with 32 bit offsets */
    create_offsets_values_preds_32();

    vector_scatter_32();
    print_scatter32_buffer();
    check_scatter_32();

    vector_gather_32();
    print_gather_result_32();
    check_gather_32();

    vector_gather_32_masked();
    print_gather_result_32();
    check_gather_32_masked();

    vector_scatter_32_acc();
    print_scatter32_buffer();
    check_scatter_32_acc();

    vector_scatter_32_masked();
    print_scatter32_buffer();
    check_scatter_32_masked();

    /* 16 bit elements with 32 bit offsets */
    create_offsets_values_preds_16_32();

    vector_scatter_16_32();
    print_scatter16_32_buffer();
    check_scatter_16_32();

    vector_gather_16_32();
    print_gather_result_16_32();
    check_gather_16_32();

    vector_gather_16_32_masked();
    print_gather_result_16_32();
    check_gather_16_32_masked();

    vector_scatter_16_32_acc();
    print_scatter16_32_buffer();
    check_scatter_16_32_acc();

    vector_scatter_16_32_masked();
    print_scatter16_32_buffer();
    check_scatter_16_32_masked();

    puts(err ? "FAIL" : "PASS");
    return err;
}
1041