1 /*
2  *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 /*
19  * This example tests the HVX scatter/gather instructions
20  *
21  * See section 5.13 of the V68 HVX Programmer's Reference
22  *
 * There are 3 main classes of operations
24  *     _16                 16-bit elements and 16-bit offsets
25  *     _32                 32-bit elements and 32-bit offsets
26  *     _16_32              16-bit elements and 32-bit offsets
27  *
28  * There are also masked and accumulate versions
29  */
30 
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <inttypes.h>
35 
/*
 * Vector types used with the HVX builtins below.
 * Each is a GCC/clang vector of long with the stated total size in bytes
 * and a required alignment of 128 bytes.
 */
/* 128-byte vector */
typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
/* 256-byte vector pair */
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
/* 128-byte type used for the mask (predicate) operands of the builtins */
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
42 
/*
 * Wrappers around the HVX scatter builtins.
 *
 *   BASE  address of the scatter region (converted to int for the builtin)
 *   RGN   length value passed to the builtin for the region
 *   OFF   vector (or vector pair for the _16_32 forms) of byte offsets
 *   VALS  vector of values to scatter
 *   MASK  predicate selecting which lanes are written (masked forms only)
 *
 * The _ACC forms map to the "_add" builtins, which accumulate into memory.
 * Every argument is parenthesized so that expressions such as "p + n" are
 * cast and passed as a whole.
 */
#define VSCATTER_16(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                          (VALS))
#define VSCATTER_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermwq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                          (VALS))
#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhwq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                           (VALS))
#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_add_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_add_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)(BASE), (RGN), (OFF), \
                                              (VALS))
61 
/*
 * Wrappers around the HVX gather builtins.
 *
 *   DSTADDR  address the gathered vector is written to
 *   BASE     address of the region to gather from (converted to int)
 *   RGN      length value passed to the builtin for the region
 *   OFF      vector (or vector pair for the _16_32 forms) of byte offsets
 *   MASK     predicate selecting which lanes are gathered (masked forms only)
 *
 * Every argument is parenthesized so that expressions such as "p + n" are
 * cast and passed as a whole.
 */
#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermh_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                          (RGN), (OFF))
74 
/* helper wrappers around the remaining HVX builtins used by this test */

/* shuffle the halfwords of V (applied before the 16_32 scatters) */
#define VSHUFF_H(V) \
    __builtin_HEXAGON_V6_vshuffh_128B(V)
/* build a vector by splatting the halfword X */
#define VSPLAT_H(X) \
    __builtin_HEXAGON_V6_lvsplath_128B(X)
/* produce a predicate from vector PRED and scalar VAL (vandvrt builtin) */
#define VAND_VAL(PRED, VAL) \
    __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
/* deal the halfwords of V (applied after the 16_32 gathers) */
#define VDEAL_H(V) \
    __builtin_HEXAGON_V6_vdealh_128B(V)
83 
/* count of mismatches found by check_buffer(); main() reports FAIL if nonzero */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer (in elements) */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
91 
/*
 * fake vtcm - put buffers together and force alignment
 *
 * One scatter buffer (SCATTER_BUFFER_SIZE elements) and one gather result
 * buffer (MATRIX_SIZE elements) for each of the three operation classes.
 * The whole struct is aligned to 0x10000 bytes.
 */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
101 
/*
 * declare the arrays of reference values
 *
 * These mirror the vtcm buffers; the check_* functions fill them with the
 * scalar C model and compare them against the HVX results.
 */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];
109 
110 /* declare the arrays of offsets */
111 unsigned short half_offsets[MATRIX_SIZE];
112 unsigned int   word_offsets[MATRIX_SIZE];
113 
114 /* declare the arrays of values */
115 unsigned short half_values[MATRIX_SIZE];
116 unsigned short half_values_acc[MATRIX_SIZE];
117 unsigned short half_values_masked[MATRIX_SIZE];
118 unsigned int   word_values[MATRIX_SIZE];
119 unsigned int   word_values_acc[MATRIX_SIZE];
120 unsigned int   word_values_masked[MATRIX_SIZE];
121 
122 /* declare the arrays of predicates */
123 unsigned short half_predicates[MATRIX_SIZE];
124 unsigned int   word_predicates[MATRIX_SIZE];
125 
/*
 * make this big enough for all the intrinsics
 * (the size of the whole vtcm struct is passed as the region length to
 * every scatter/gather)
 */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions (1 = enabled, 0 = sync_* are no-ops) */
#define SYNC_VECTOR 1
131 
/*
 * Synchronize after a scatter to the buffer at addr.
 *
 * addr is dereferenced as an HVX_Vector.  Compiles to nothing when
 * SYNC_VECTOR is 0.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
145 
/*
 * Synchronize after a gather by performing a dummy vector load from addr
 * (the gather destination in the callers).
 *
 * addr is dereferenced as an HVX_Vector.  Compiles to nothing when
 * SYNC_VECTOR is 0.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
153 
/* optionally print the results (set to 1 to enable the print_* functions) */
#define PRINT_DATA 0

/* byte used to prefill the vtcm buffers and the scatter reference buffers */
#define FILL_CHAR       '.'
158 
159 /* fill vtcm scratch with ee */
160 void prefill_vtcm_scratch(void)
161 {
162     memset(&vtcm, FILL_CHAR, sizeof(vtcm));
163 }
164 
165 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
166 void create_offsets_values_preds_16(void)
167 {
168     unsigned short half_element = 0;
169     unsigned short half_element_masked = 0;
170     char letter = 'A';
171     char letter_masked = '@';
172 
173     for (int i = 0; i < MATRIX_SIZE; i++) {
174         half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
175 
176         half_element = 0;
177         half_element_masked = 0;
178         for (int j = 0; j < 2; j++) {
179             half_element |= letter << j * 8;
180             half_element_masked |= letter_masked << j * 8;
181         }
182 
183         half_values[i] = half_element;
184         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
185         half_values_masked[i] = half_element_masked;
186 
187         letter++;
188         /* reset to 'A' */
189         if (letter == 'M') {
190             letter = 'A';
191         }
192 
193         half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
194     }
195 }
196 
197 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
198 void create_offsets_values_preds_32(void)
199 {
200     unsigned int word_element = 0;
201     unsigned int word_element_masked = 0;
202     char letter = 'A';
203     char letter_masked = '&';
204 
205     for (int i = 0; i < MATRIX_SIZE; i++) {
206         word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
207 
208         word_element = 0;
209         word_element_masked = 0;
210         for (int j = 0; j < 4; j++) {
211             word_element |= letter << j * 8;
212             word_element_masked |= letter_masked << j * 8;
213         }
214 
215         word_values[i] = word_element;
216         word_values_acc[i] = ((i % 10) << 8) + (i % 10);
217         word_values_masked[i] = word_element_masked;
218 
219         letter++;
220         /* reset to 'A' */
221         if (letter == 'M') {
222             letter = 'A';
223         }
224 
225         word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
226     }
227 }
228 
229 /*
230  * create byte offsets to be a diagonal of the matrix with 16 bit elements
231  * and 32 bit offsets
232  */
233 void create_offsets_values_preds_16_32(void)
234 {
235     unsigned short half_element = 0;
236     unsigned short half_element_masked = 0;
237     char letter = 'D';
238     char letter_masked = '$';
239 
240     for (int i = 0; i < MATRIX_SIZE; i++) {
241         word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
242 
243         half_element = 0;
244         half_element_masked = 0;
245         for (int j = 0; j < 2; j++) {
246             half_element |= letter << j * 8;
247             half_element_masked |= letter_masked << j * 8;
248         }
249 
250         half_values[i] = half_element;
251         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
252         half_values_masked[i] = half_element_masked;
253 
254         letter++;
255         /* reset to 'A' */
256         if (letter == 'P') {
257             letter = 'D';
258         }
259 
260         half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
261     }
262 }
263 
264 /* scatter the 16 bit elements using intrinsics */
265 void vector_scatter_16(void)
266 {
267     /* copy the offsets and values to vectors */
268     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
269     HVX_Vector values = *(HVX_Vector *)half_values;
270 
271     VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
272 
273     sync_scatter(vtcm.vscatter16);
274 }
275 
276 /* scatter-accumulate the 16 bit elements using intrinsics */
277 void vector_scatter_16_acc(void)
278 {
279     /* copy the offsets and values to vectors */
280     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
281     HVX_Vector values = *(HVX_Vector *)half_values_acc;
282 
283     VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
284 
285     sync_scatter(vtcm.vscatter16);
286 }
287 
288 /* scatter the 16 bit elements using intrinsics */
289 void vector_scatter_16_masked(void)
290 {
291     /* copy the offsets and values to vectors */
292     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
293     HVX_Vector values = *(HVX_Vector *)half_values_masked;
294     HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
295     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
296 
297     VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
298 
299     sync_scatter(vtcm.vscatter16);
300 }
301 
302 /* scatter the 32 bit elements using intrinsics */
303 void vector_scatter_32(void)
304 {
305     /* copy the offsets and values to vectors */
306     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
307     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
308     HVX_Vector valueslo = *(HVX_Vector *)word_values;
309     HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
310 
311     VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
312     VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
313 
314     sync_scatter(vtcm.vscatter32);
315 }
316 
317 /* scatter-acc the 32 bit elements using intrinsics */
318 void vector_scatter_32_acc(void)
319 {
320     /* copy the offsets and values to vectors */
321     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
322     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
323     HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
324     HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
325 
326     VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
327     VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
328 
329     sync_scatter(vtcm.vscatter32);
330 }
331 
332 /* scatter the 32 bit elements using intrinsics */
333 void vector_scatter_32_masked(void)
334 {
335     /* copy the offsets and values to vectors */
336     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
337     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
338     HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
339     HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
340     HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
341     HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
342     HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
343     HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
344 
345     VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
346                        valueslo);
347     VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
348                        valueshi);
349 
350     sync_scatter(vtcm.vscatter16);
351 }
352 
353 /* scatter the 16 bit elements with 32 bit offsets using intrinsics */
354 void vector_scatter_16_32(void)
355 {
356     HVX_VectorPair offsets;
357     HVX_Vector values;
358 
359     /* get the word offsets in a vector pair */
360     offsets = *(HVX_VectorPair *)word_offsets;
361 
362     /* these values need to be shuffled for the scatter */
363     values = *(HVX_Vector *)half_values;
364     values = VSHUFF_H(values);
365 
366     VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
367 
368     sync_scatter(vtcm.vscatter16_32);
369 }
370 
371 /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
372 void vector_scatter_16_32_acc(void)
373 {
374     HVX_VectorPair offsets;
375     HVX_Vector values;
376 
377     /* get the word offsets in a vector pair */
378     offsets = *(HVX_VectorPair *)word_offsets;
379 
380     /* these values need to be shuffled for the scatter */
381     values = *(HVX_Vector *)half_values_acc;
382     values = VSHUFF_H(values);
383 
384     VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
385 
386     sync_scatter(vtcm.vscatter16_32);
387 }
388 
389 /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
390 void vector_scatter_16_32_masked(void)
391 {
392     HVX_VectorPair offsets;
393     HVX_Vector values;
394     HVX_Vector pred_reg;
395 
396     /* get the word offsets in a vector pair */
397     offsets = *(HVX_VectorPair *)word_offsets;
398 
399     /* these values need to be shuffled for the scatter */
400     values = *(HVX_Vector *)half_values_masked;
401     values = VSHUFF_H(values);
402 
403     pred_reg = *(HVX_Vector *)half_predicates;
404     pred_reg = VSHUFF_H(pred_reg);
405     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
406 
407     VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
408                           values);
409 
410     sync_scatter(vtcm.vscatter16_32);
411 }
412 
413 /* gather the elements from the scatter16 buffer */
414 void vector_gather_16(void)
415 {
416     HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
417     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
418 
419     VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
420 
421     sync_gather(vgather);
422 }
423 
/*
 * Initial value for the 16 bit masked gather destination:
 * the character '?' in both bytes of a halfword
 */
static unsigned short gather_16_masked_init(void)
{
    const unsigned short fill = '?';

    return (unsigned short)((fill << 8) | fill);
}
429 
430 void vector_gather_16_masked(void)
431 {
432     HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
433     HVX_Vector offsets = *(HVX_Vector *)half_offsets;
434     HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
435     HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
436 
437     *vgather = VSPLAT_H(gather_16_masked_init());
438     VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
439 
440     sync_gather(vgather);
441 }
442 
443 /* gather the elements from the scatter32 buffer */
444 void vector_gather_32(void)
445 {
446     HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
447     HVX_Vector *vgatherhi =
448         (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
449     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
450     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
451 
452     VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
453     VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
454 
455     sync_gather(vgatherhi);
456 }
457 
/*
 * Initial value for the 32 bit masked gather destination:
 * the character '?' in all four bytes of a word
 */
static unsigned int gather_32_masked_init(void)
{
    unsigned int pattern = 0;

    /* shift in one '?' per byte */
    for (int byte = 0; byte < 4; byte++) {
        pattern = (pattern << 8) | '?';
    }
    return pattern;
}
463 
464 void vector_gather_32_masked(void)
465 {
466     HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
467     HVX_Vector *vgatherhi =
468         (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
469     HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
470     HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
471     HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
472     HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
473     HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
474     HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
475 
476     *vgatherlo = VSPLAT_H(gather_32_masked_init());
477     *vgatherhi = VSPLAT_H(gather_32_masked_init());
478     VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
479                       offsetslo);
480     VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
481                       offsetshi);
482 
483     sync_gather(vgatherlo);
484     sync_gather(vgatherhi);
485 }
486 
487 /* gather the elements from the scatter16_32 buffer */
488 void vector_gather_16_32(void)
489 {
490     HVX_Vector *vgather;
491     HVX_VectorPair offsets;
492     HVX_Vector values;
493 
494     /* get the vtcm address to gather from */
495     vgather = (HVX_Vector *)&vtcm.vgather16_32;
496 
497     /* get the word offsets in a vector pair */
498     offsets = *(HVX_VectorPair *)word_offsets;
499 
500     VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
501 
502     /* deal the elements to get the order back */
503     values = *(HVX_Vector *)vgather;
504     values = VDEAL_H(values);
505 
506     /* write it back to vtcm address */
507     *(HVX_Vector *)vgather = values;
508 }
509 
510 void vector_gather_16_32_masked(void)
511 {
512     HVX_Vector *vgather;
513     HVX_VectorPair offsets;
514     HVX_Vector pred_reg;
515     HVX_VectorPred preds;
516     HVX_Vector values;
517 
518     /* get the vtcm address to gather from */
519     vgather = (HVX_Vector *)&vtcm.vgather16_32;
520 
521     /* get the word offsets in a vector pair */
522     offsets = *(HVX_VectorPair *)word_offsets;
523     pred_reg = *(HVX_Vector *)half_predicates;
524     pred_reg = VSHUFF_H(pred_reg);
525     preds = VAND_VAL(pred_reg, ~0);
526 
527    *vgather = VSPLAT_H(gather_16_masked_init());
528    VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
529                         offsets);
530 
531     /* deal the elements to get the order back */
532     values = *(HVX_Vector *)vgather;
533     values = VDEAL_H(values);
534 
535     /* write it back to vtcm address */
536     *(HVX_Vector *)vgather = values;
537 }
538 
539 static void check_buffer(const char *name, void *c, void *r, size_t size)
540 {
541     char *check = (char *)c;
542     char *ref = (char *)r;
543     for (int i = 0; i < size; i++) {
544         if (check[i] != ref[i]) {
545             printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
546                    check[i], check[i], ref[i], ref[i]);
547             err++;
548         }
549     }
550 }
551 
552 /*
553  * These scalar functions are the C equivalents of the vector functions that
554  * use HVX
555  */
556 
557 /* scatter the 16 bit elements using C */
558 void scalar_scatter_16(unsigned short *vscatter16)
559 {
560     for (int i = 0; i < MATRIX_SIZE; ++i) {
561         vscatter16[half_offsets[i] / 2] = half_values[i];
562     }
563 }
564 
565 void check_scatter_16()
566 {
567     memset(vscatter16_ref, FILL_CHAR,
568            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
569     scalar_scatter_16(vscatter16_ref);
570     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
571                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
572 }
573 
574 /* scatter the 16 bit elements using C */
575 void scalar_scatter_16_acc(unsigned short *vscatter16)
576 {
577     for (int i = 0; i < MATRIX_SIZE; ++i) {
578         vscatter16[half_offsets[i] / 2] += half_values_acc[i];
579     }
580 }
581 
582 void check_scatter_16_acc()
583 {
584     memset(vscatter16_ref, FILL_CHAR,
585            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
586     scalar_scatter_16(vscatter16_ref);
587     scalar_scatter_16_acc(vscatter16_ref);
588     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
589                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
590 }
591 
592 /* scatter the 16 bit elements using C */
593 void scalar_scatter_16_masked(unsigned short *vscatter16)
594 {
595     for (int i = 0; i < MATRIX_SIZE; i++) {
596         if (half_predicates[i]) {
597             vscatter16[half_offsets[i] / 2] = half_values_masked[i];
598         }
599     }
600 
601 }
602 
603 void check_scatter_16_masked()
604 {
605     memset(vscatter16_ref, FILL_CHAR,
606            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
607     scalar_scatter_16(vscatter16_ref);
608     scalar_scatter_16_acc(vscatter16_ref);
609     scalar_scatter_16_masked(vscatter16_ref);
610     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
611                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612 }
613 
614 /* scatter the 32 bit elements using C */
615 void scalar_scatter_32(unsigned int *vscatter32)
616 {
617     for (int i = 0; i < MATRIX_SIZE; ++i) {
618         vscatter32[word_offsets[i] / 4] = word_values[i];
619     }
620 }
621 
622 void check_scatter_32()
623 {
624     memset(vscatter32_ref, FILL_CHAR,
625            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
626     scalar_scatter_32(vscatter32_ref);
627     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
628                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
629 }
630 
631 /* scatter the 32 bit elements using C */
632 void scalar_scatter_32_acc(unsigned int *vscatter32)
633 {
634     for (int i = 0; i < MATRIX_SIZE; ++i) {
635         vscatter32[word_offsets[i] / 4] += word_values_acc[i];
636     }
637 }
638 
639 void check_scatter_32_acc()
640 {
641     memset(vscatter32_ref, FILL_CHAR,
642            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
643     scalar_scatter_32(vscatter32_ref);
644     scalar_scatter_32_acc(vscatter32_ref);
645     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
646                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
647 }
648 
649 /* scatter the 32 bit elements using C */
650 void scalar_scatter_32_masked(unsigned int *vscatter32)
651 {
652     for (int i = 0; i < MATRIX_SIZE; i++) {
653         if (word_predicates[i]) {
654             vscatter32[word_offsets[i] / 4] = word_values_masked[i];
655         }
656     }
657 }
658 
659 void check_scatter_32_masked()
660 {
661     memset(vscatter32_ref, FILL_CHAR,
662            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
663     scalar_scatter_32(vscatter32_ref);
664     scalar_scatter_32_acc(vscatter32_ref);
665     scalar_scatter_32_masked(vscatter32_ref);
666     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
667                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
668 }
669 
670 /* scatter the 32 bit elements using C */
671 void scalar_scatter_16_32(unsigned short *vscatter16_32)
672 {
673     for (int i = 0; i < MATRIX_SIZE; ++i) {
674         vscatter16_32[word_offsets[i] / 2] = half_values[i];
675     }
676 }
677 
678 void check_scatter_16_32()
679 {
680     memset(vscatter16_32_ref, FILL_CHAR,
681            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
682     scalar_scatter_16_32(vscatter16_32_ref);
683     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
684                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
685 }
686 
687 /* scatter the 32 bit elements using C */
688 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
689 {
690     for (int i = 0; i < MATRIX_SIZE; ++i) {
691         vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
692     }
693 }
694 
695 void check_scatter_16_32_acc()
696 {
697     memset(vscatter16_32_ref, FILL_CHAR,
698            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
699     scalar_scatter_16_32(vscatter16_32_ref);
700     scalar_scatter_16_32_acc(vscatter16_32_ref);
701     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
702                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
703 }
704 
705 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
706 {
707     for (int i = 0; i < MATRIX_SIZE; i++) {
708         if (half_predicates[i]) {
709             vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
710         }
711     }
712 }
713 
714 void check_scatter_16_32_masked()
715 {
716     memset(vscatter16_32_ref, FILL_CHAR,
717            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
718     scalar_scatter_16_32(vscatter16_32_ref);
719     scalar_scatter_16_32_acc(vscatter16_32_ref);
720     scalar_scatter_16_32_masked(vscatter16_32_ref);
721     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
722                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
723 }
724 
725 /* gather the elements from the scatter buffer using C */
726 void scalar_gather_16(unsigned short *vgather16)
727 {
728     for (int i = 0; i < MATRIX_SIZE; ++i) {
729         vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
730     }
731 }
732 
733 void check_gather_16()
734 {
735       memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
736       scalar_gather_16(vgather16_ref);
737       check_buffer(__func__, vtcm.vgather16, vgather16_ref,
738                    MATRIX_SIZE * sizeof(unsigned short));
739 }
740 
741 void scalar_gather_16_masked(unsigned short *vgather16)
742 {
743     for (int i = 0; i < MATRIX_SIZE; ++i) {
744         if (half_predicates[i]) {
745             vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
746         }
747     }
748 }
749 
750 void check_gather_16_masked()
751 {
752     memset(vgather16_ref, gather_16_masked_init(),
753            MATRIX_SIZE * sizeof(unsigned short));
754     scalar_gather_16_masked(vgather16_ref);
755     check_buffer(__func__, vtcm.vgather16, vgather16_ref,
756                  MATRIX_SIZE * sizeof(unsigned short));
757 }
758 
759 /* gather the elements from the scatter buffer using C */
760 void scalar_gather_32(unsigned int *vgather32)
761 {
762     for (int i = 0; i < MATRIX_SIZE; ++i) {
763         vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
764     }
765 }
766 
767 void check_gather_32(void)
768 {
769     memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
770     scalar_gather_32(vgather32_ref);
771     check_buffer(__func__, vtcm.vgather32, vgather32_ref,
772                  MATRIX_SIZE * sizeof(unsigned int));
773 }
774 
775 void scalar_gather_32_masked(unsigned int *vgather32)
776 {
777     for (int i = 0; i < MATRIX_SIZE; ++i) {
778         if (word_predicates[i]) {
779             vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
780         }
781     }
782 }
783 
784 
785 void check_gather_32_masked(void)
786 {
787     memset(vgather32_ref, gather_32_masked_init(),
788            MATRIX_SIZE * sizeof(unsigned int));
789     scalar_gather_32_masked(vgather32_ref);
790     check_buffer(__func__, vtcm.vgather32,
791                  vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
792 }
793 
794 /* gather the elements from the scatter buffer using C */
795 void scalar_gather_16_32(unsigned short *vgather16_32)
796 {
797     for (int i = 0; i < MATRIX_SIZE; ++i) {
798         vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
799     }
800 }
801 
802 void check_gather_16_32(void)
803 {
804     memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
805     scalar_gather_16_32(vgather16_32_ref);
806     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
807                  MATRIX_SIZE * sizeof(unsigned short));
808 }
809 
810 void scalar_gather_16_32_masked(unsigned short *vgather16_32)
811 {
812     for (int i = 0; i < MATRIX_SIZE; ++i) {
813         if (half_predicates[i]) {
814             vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
815         }
816     }
817 
818 }
819 
820 void check_gather_16_32_masked(void)
821 {
822     memset(vgather16_32_ref, gather_16_masked_init(),
823            MATRIX_SIZE * sizeof(unsigned short));
824     scalar_gather_16_32_masked(vgather16_32_ref);
825     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
826                  MATRIX_SIZE * sizeof(unsigned short));
827 }
828 
829 /* print scatter16 buffer */
830 void print_scatter16_buffer(void)
831 {
832     if (PRINT_DATA) {
833         printf("\n\nPrinting the 16 bit scatter buffer");
834 
835         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
836             if ((i % MATRIX_SIZE) == 0) {
837                 printf("\n");
838             }
839             for (int j = 0; j < 2; j++) {
840                 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
841             }
842             printf(" ");
843         }
844         printf("\n");
845     }
846 }
847 
848 /* print the gather 16 buffer */
849 void print_gather_result_16(void)
850 {
851     if (PRINT_DATA) {
852         printf("\n\nPrinting the 16 bit gather result\n");
853 
854         for (int i = 0; i < MATRIX_SIZE; i++) {
855             for (int j = 0; j < 2; j++) {
856                 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
857             }
858             printf(" ");
859         }
860         printf("\n");
861     }
862 }
863 
864 /* print the scatter32 buffer */
865 void print_scatter32_buffer(void)
866 {
867     if (PRINT_DATA) {
868         printf("\n\nPrinting the 32 bit scatter buffer");
869 
870         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
871             if ((i % MATRIX_SIZE) == 0) {
872                 printf("\n");
873             }
874             for (int j = 0; j < 4; j++) {
875                 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
876             }
877             printf(" ");
878         }
879         printf("\n");
880     }
881 }
882 
883 /* print the gather 32 buffer */
884 void print_gather_result_32(void)
885 {
886     if (PRINT_DATA) {
887         printf("\n\nPrinting the 32 bit gather result\n");
888 
889         for (int i = 0; i < MATRIX_SIZE; i++) {
890             for (int j = 0; j < 4; j++) {
891                 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
892             }
893             printf(" ");
894         }
895         printf("\n");
896     }
897 }
898 
899 /* print the scatter16_32 buffer */
900 void print_scatter16_32_buffer(void)
901 {
902     if (PRINT_DATA) {
903         printf("\n\nPrinting the 16_32 bit scatter buffer");
904 
905         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
906             if ((i % MATRIX_SIZE) == 0) {
907                 printf("\n");
908             }
909             for (int j = 0; j < 2; j++) {
910                 printf("%c",
911                       (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
912             }
913             printf(" ");
914         }
915         printf("\n");
916     }
917 }
918 
919 /* print the gather 16_32 buffer */
920 void print_gather_result_16_32(void)
921 {
922     if (PRINT_DATA) {
923         printf("\n\nPrinting the 16_32 bit gather result\n");
924 
925         for (int i = 0; i < MATRIX_SIZE; i++) {
926             for (int j = 0; j < 2; j++) {
927                 printf("%c",
928                        (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
929             }
930             printf(" ");
931         }
932         printf("\n");
933     }
934 }
935 
936 int main()
937 {
938     prefill_vtcm_scratch();
939 
940     /* 16 bit elements with 16 bit offsets */
941     create_offsets_values_preds_16();
942 
943     vector_scatter_16();
944     print_scatter16_buffer();
945     check_scatter_16();
946 
947     vector_gather_16();
948     print_gather_result_16();
949     check_gather_16();
950 
951     vector_gather_16_masked();
952     print_gather_result_16();
953     check_gather_16_masked();
954 
955     vector_scatter_16_acc();
956     print_scatter16_buffer();
957     check_scatter_16_acc();
958 
959     vector_scatter_16_masked();
960     print_scatter16_buffer();
961     check_scatter_16_masked();
962 
963     /* 32 bit elements with 32 bit offsets */
964     create_offsets_values_preds_32();
965 
966     vector_scatter_32();
967     print_scatter32_buffer();
968     check_scatter_32();
969 
970     vector_gather_32();
971     print_gather_result_32();
972     check_gather_32();
973 
974     vector_gather_32_masked();
975     print_gather_result_32();
976     check_gather_32_masked();
977 
978     vector_scatter_32_acc();
979     print_scatter32_buffer();
980     check_scatter_32_acc();
981 
982     vector_scatter_32_masked();
983     print_scatter32_buffer();
984     check_scatter_32_masked();
985 
986     /* 16 bit elements with 32 bit offsets */
987     create_offsets_values_preds_16_32();
988 
989     vector_scatter_16_32();
990     print_scatter16_32_buffer();
991     check_scatter_16_32();
992 
993     vector_gather_16_32();
994     print_gather_result_16_32();
995     check_gather_16_32();
996 
997     vector_gather_16_32_masked();
998     print_gather_result_16_32();
999     check_gather_16_32_masked();
1000 
1001     vector_scatter_16_32_acc();
1002     print_scatter16_32_buffer();
1003     check_scatter_16_32_acc();
1004 
1005     vector_scatter_16_32_masked();
1006     print_scatter16_32_buffer();
1007     check_scatter_16_32_masked();
1008 
1009     puts(err ? "FAIL" : "PASS");
1010     return err;
1011 }
1012