/*
 *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * This example tests the HVX scatter/gather instructions
 *
 * See section 5.13 of the V68 HVX Programmer's Reference
 *
 * There are 3 main classes of operations
 *     _16                 16-bit elements and 16-bit offsets
 *     _32                 32-bit elements and 32-bit offsets
 *     _16_32              16-bit elements and 32-bit offsets
 *
 * There are also masked and accumulate versions
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>

typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
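/*
 * Note: these are 128-byte vectors (HVX 128B mode), so a single vector
 * holds 64 halfword elements or 32 word elements.
 */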

int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* fake vtcm - put buffers together and force alignment */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
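/*
 * On hardware the HVX scatter/gather instructions target VTCM (vector
 * tightly coupled memory); this 64K-aligned static block merely stands in
 * for a VTCM allocation for the purposes of the test.
 */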

/* declare the arrays of reference values */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets */
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of values */
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of predicates */
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));

/* make this big enough for all the operations */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions */
#define SYNC_VECTOR 1

static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}

static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
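/*
 * A gather is synchronized simply by reading its destination: the volatile
 * vector load above cannot complete until the gather's write to the VTCM
 * area has landed.  Scatters additionally need the scatter_release store.
 */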

/* optionally print the results */
#define PRINT_DATA 0

#define FILL_CHAR       '.'

/* fill the vtcm scratch area with the fill character */
void prefill_vtcm_scratch(void)
{
    memset(&vtcm, FILL_CHAR, sizeof(vtcm));
}

/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
void create_offsets_values_preds_16(void)
{
    unsigned short half_element = 0;
    unsigned short half_element_masked = 0;
    char letter = 'A';
    char letter_masked = '@';

    for (int i = 0; i < MATRIX_SIZE; i++) {
        half_offsets[i] = i * (2 * MATRIX_SIZE + 2);

        half_element = 0;
        half_element_masked = 0;
        for (int j = 0; j < 2; j++) {
            half_element |= letter << j * 8;
            half_element_masked |= letter_masked << j * 8;
        }

        half_values[i] = half_element;
        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
        half_values_masked[i] = half_element_masked;

        letter++;
        /* reset to 'A' */
        if (letter == 'M') {
            letter = 'A';
        }

        half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
    }
}
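/*
 * Worked example: with MATRIX_SIZE == 64, half_offsets[i] = i * 130, the
 * byte offset of element (i, i) in a row-major 64x64 matrix of 16-bit
 * values (128 bytes per row plus 2 bytes per column).
 */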

/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
void create_offsets_values_preds_32(void)
{
    unsigned int word_element = 0;
    unsigned int word_element_masked = 0;
    char letter = 'A';
    char letter_masked = '&';

    for (int i = 0; i < MATRIX_SIZE; i++) {
        word_offsets[i] = i * (4 * MATRIX_SIZE + 4);

        word_element = 0;
        word_element_masked = 0;
        for (int j = 0; j < 4; j++) {
            word_element |= letter << j * 8;
            word_element_masked |= letter_masked << j * 8;
        }

        word_values[i] = word_element;
        word_values_acc[i] = ((i % 10) << 8) + (i % 10);
        word_values_masked[i] = word_element_masked;

        letter++;
        /* reset to 'A' */
        if (letter == 'M') {
            letter = 'A';
        }

        word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
    }
}
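/*
 * As above, but for 32-bit elements: word_offsets[i] = i * 260, the byte
 * offset of element (i, i) in a row-major 64x64 matrix of 32-bit values.
 */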

/*
 * create byte offsets to be a diagonal of the matrix with 16 bit elements
 * and 32 bit offsets
 */
void create_offsets_values_preds_16_32(void)
{
    unsigned short half_element = 0;
    unsigned short half_element_masked = 0;
    char letter = 'D';
    char letter_masked = '$';

    for (int i = 0; i < MATRIX_SIZE; i++) {
        word_offsets[i] = i * (2 * MATRIX_SIZE + 2);

        half_element = 0;
        half_element_masked = 0;
        for (int j = 0; j < 2; j++) {
            half_element |= letter << j * 8;
            half_element_masked |= letter_masked << j * 8;
        }

        half_values[i] = half_element;
        half_values_acc[i] = ((i % 10) << 8) + (i % 10);
        half_values_masked[i] = half_element_masked;

        letter++;
        /* reset to 'D' */
        if (letter == 'P') {
            letter = 'D';
        }

        half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
    }
}
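/*
 * Same 16-bit element diagonal as create_offsets_values_preds_16(), but the
 * byte offsets are stored in word_offsets[] because the _16_32 operations
 * take 32-bit offsets.
 */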

/* scatter the 16 bit elements using HVX */
void vector_scatter_16(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h = v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

/* scatter-accumulate the 16 bit elements using HVX */
void vector_scatter_16_acc(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h += v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

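/*
 * The masked variants below all share the same predicate setup: the
 * per-element masks (all ones or all zeros) are loaded into v0 and
 * converted to the vector predicate q0 via "q0 = vand(v0, r1)" with r1 = -1.
 */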
/* masked scatter the 16 bit elements using HVX */
void vector_scatter_16_masked(void)
{
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(half_values_masked)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}

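/*
 * A 128-byte vector holds only 32 words, so the word-sized scatters and
 * gathers process the 64-entry offset/value arrays in lo/hi halves.
 */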
/* scatter the 32 bit elements using HVX */
void vector_scatter_32(void)
{
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

/* scatter-accumulate the 32 bit elements using HVX */
void vector_scatter_32_acc(void)
{
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

/* masked scatter the 32 bit elements using HVX */
void vector_scatter_32_masked(void)
{
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(valueslo)
         : "r1", "q0", "m0", "v0", "v1", "memory");
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%4 + #0)\n\t"
         "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(valueshi)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}

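/*
 * For the _16_32 forms the 32-bit offsets occupy a vector pair (v1:0) while
 * the 16-bit values fit in a single vector.  The values are pre-shuffled
 * (vshuff) into the element order the scatter expects, and the 16_32 gather
 * result is dealt (vdeal) back into the original order.
 */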
/* scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h = v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

/* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32_acc(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h += v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

/* masked scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32_masked(void)
{
    asm ("r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "v2 = vmem(%4 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
         "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(half_values_masked)
         : "r1", "q0", "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}

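/*
 * The gathers below return their result through the temporary register
 * vtmp: the vgather and the vmem store of vtmp.new are issued in the same
 * packet, and the store destination (the vgather* buffer) also lives in the
 * fake VTCM area.
 */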
/* gather the elements from the scatter16 buffer using HVX */
void vector_gather_16(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16)
         : "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}

static unsigned short gather_16_masked_init(void)
{
    char letter = '?';
    return letter | (letter << 8);
}

/* masked gather the elements from the scatter16 buffer using HVX */
void vector_gather_16_masked(void)
{
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
             "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}

/* gather the elements from the scatter32 buffer using HVX */
void vector_gather_32(void)
{
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo)
         : "m0", "v0", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi)
         : "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}

static unsigned int gather_32_masked_init(void)
{
    char letter = '?';
    return letter | (letter << 8) | (letter << 16) | (letter << 24);
}

/* masked gather the elements from the scatter32 buffer using HVX */
void vector_gather_32_masked(void)
{
    unsigned int init = gather_32_masked_init();
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetslo), "r"(vgatherlo), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
             "r"(offsetshi), "r"(vgatherhi), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}

/* gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32(void)
{
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
         "vmem(%3 + #0) = v0\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32)
         : "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}

/* masked gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32_masked(void)
{
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%4 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
         "vmem(%4 + #0) = v0\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
             "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}

static void check_buffer(const char *name, void *c, void *r, size_t size)
{
    char *check = (char *)c;
    char *ref = (char *)r;
    for (size_t i = 0; i < size; i++) {
        if (check[i] != ref[i]) {
            printf("ERROR %s [%zu]: 0x%x (%c) != 0x%x (%c)\n", name, i,
                   check[i], check[i], ref[i], ref[i]);
            err++;
        }
    }
}

/*
 * These scalar functions are the C equivalents of the vector functions that
 * use HVX
 */

/* scatter the 16 bit elements using C */
void scalar_scatter_16(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16[half_offsets[i] / 2] = half_values[i];
    }
}

void check_scatter_16()
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter-accumulate the 16 bit elements using C */
void scalar_scatter_16_acc(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16[half_offsets[i] / 2] += half_values_acc[i];
    }
}

/* the reference buffer replays the operations applied to vscatter16 so far */
void check_scatter_16_acc()
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    scalar_scatter_16_acc(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* masked scatter the 16 bit elements using C */
void scalar_scatter_16_masked(unsigned short *vscatter16)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (half_predicates[i]) {
            vscatter16[half_offsets[i] / 2] = half_values_masked[i];
        }
    }
}

void check_scatter_16_masked()
{
    memset(vscatter16_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16(vscatter16_ref);
    scalar_scatter_16_acc(vscatter16_ref);
    scalar_scatter_16_masked(vscatter16_ref);
    check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter the 32 bit elements using C */
void scalar_scatter_32(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter32[word_offsets[i] / 4] = word_values[i];
    }
}

void check_scatter_32()
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* scatter-accumulate the 32 bit elements using C */
void scalar_scatter_32_acc(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter32[word_offsets[i] / 4] += word_values_acc[i];
    }
}

void check_scatter_32_acc()
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    scalar_scatter_32_acc(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* masked scatter the 32 bit elements using C */
void scalar_scatter_32_masked(unsigned int *vscatter32)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (word_predicates[i]) {
            vscatter32[word_offsets[i] / 4] = word_values_masked[i];
        }
    }
}

void check_scatter_32_masked()
{
    memset(vscatter32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned int));
    scalar_scatter_32(vscatter32_ref);
    scalar_scatter_32_acc(vscatter32_ref);
    scalar_scatter_32_masked(vscatter32_ref);
    check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
}

/* scatter the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16_32[word_offsets[i] / 2] = half_values[i];
    }
}

void check_scatter_16_32()
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
    }
}

void check_scatter_16_32_acc()
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    scalar_scatter_16_32_acc(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* masked scatter the 16 bit elements with 32 bit offsets using C */
void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        if (half_predicates[i]) {
            vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
        }
    }
}

void check_scatter_16_32_masked()
{
    memset(vscatter16_32_ref, FILL_CHAR,
           SCATTER_BUFFER_SIZE * sizeof(unsigned short));
    scalar_scatter_16_32(vscatter16_32_ref);
    scalar_scatter_16_32_acc(vscatter16_32_ref);
    scalar_scatter_16_32_masked(vscatter16_32_ref);
    check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
                 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
}

/* gather the elements from the scatter buffer using C */
void scalar_gather_16(unsigned short *vgather16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
    }
}

void check_gather_16()
{
    memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16(vgather16_ref);
    check_buffer(__func__, vtcm.vgather16, vgather16_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* masked gather the elements from the scatter buffer using C */
void scalar_gather_16_masked(unsigned short *vgather16)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (half_predicates[i]) {
            vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
        }
    }
}

void check_gather_16_masked()
{
    memset(vgather16_ref, gather_16_masked_init(),
           MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_masked(vgather16_ref);
    check_buffer(__func__, vtcm.vgather16, vgather16_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* gather the elements from the scatter32 buffer using C */
void scalar_gather_32(unsigned int *vgather32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
    }
}

void check_gather_32(void)
{
    memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
    scalar_gather_32(vgather32_ref);
    check_buffer(__func__, vtcm.vgather32, vgather32_ref,
                 MATRIX_SIZE * sizeof(unsigned int));
}

/* masked gather the elements from the scatter32 buffer using C */
void scalar_gather_32_masked(unsigned int *vgather32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (word_predicates[i]) {
            vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
        }
    }
}

void check_gather_32_masked(void)
{
    memset(vgather32_ref, gather_32_masked_init(),
           MATRIX_SIZE * sizeof(unsigned int));
    scalar_gather_32_masked(vgather32_ref);
    check_buffer(__func__, vtcm.vgather32,
                 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
}

/* gather the elements from the scatter16_32 buffer using C */
void scalar_gather_16_32(unsigned short *vgather16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
    }
}

void check_gather_16_32(void)
{
    memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_32(vgather16_32_ref);
    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* masked gather the elements from the scatter16_32 buffer using C */
void scalar_gather_16_32_masked(unsigned short *vgather16_32)
{
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        if (half_predicates[i]) {
            vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
        }
    }
}

void check_gather_16_32_masked(void)
{
    memset(vgather16_32_ref, gather_16_masked_init(),
           MATRIX_SIZE * sizeof(unsigned short));
    scalar_gather_16_32_masked(vgather16_32_ref);
    check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
                 MATRIX_SIZE * sizeof(unsigned short));
}

/* print scatter16 buffer */
void print_scatter16_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 2; j++) {
                printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 16 buffer */
void print_gather_result_16(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 2; j++) {
                printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the scatter32 buffer */
void print_scatter32_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 32 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 4; j++) {
                printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 32 buffer */
void print_gather_result_32(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 32 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 4; j++) {
                printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the scatter16_32 buffer */
void print_scatter16_32_buffer(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16_32 bit scatter buffer");

        for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
            if ((i % MATRIX_SIZE) == 0) {
                printf("\n");
            }
            for (int j = 0; j < 2; j++) {
                printf("%c",
                       (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

/* print the gather 16_32 buffer */
void print_gather_result_16_32(void)
{
    if (PRINT_DATA) {
        printf("\n\nPrinting the 16_32 bit gather result\n");

        for (int i = 0; i < MATRIX_SIZE; i++) {
            for (int j = 0; j < 2; j++) {
                printf("%c",
                       (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
            }
            printf(" ");
        }
        printf("\n");
    }
}

int main()
{
    prefill_vtcm_scratch();

    /* 16 bit elements with 16 bit offsets */
    create_offsets_values_preds_16();

    vector_scatter_16();
    print_scatter16_buffer();
    check_scatter_16();

    vector_gather_16();
    print_gather_result_16();
    check_gather_16();

    vector_gather_16_masked();
    print_gather_result_16();
    check_gather_16_masked();

    vector_scatter_16_acc();
    print_scatter16_buffer();
    check_scatter_16_acc();

    vector_scatter_16_masked();
    print_scatter16_buffer();
    check_scatter_16_masked();

    /* 32 bit elements with 32 bit offsets */
    create_offsets_values_preds_32();

    vector_scatter_32();
    print_scatter32_buffer();
    check_scatter_32();

    vector_gather_32();
    print_gather_result_32();
    check_gather_32();

    vector_gather_32_masked();
    print_gather_result_32();
    check_gather_32_masked();

    vector_scatter_32_acc();
    print_scatter32_buffer();
    check_scatter_32_acc();

    vector_scatter_32_masked();
    print_scatter32_buffer();
    check_scatter_32_masked();

    /* 16 bit elements with 32 bit offsets */
    create_offsets_values_preds_16_32();

    vector_scatter_16_32();
    print_scatter16_32_buffer();
    check_scatter_16_32();

    vector_gather_16_32();
    print_gather_result_16_32();
    check_gather_16_32();

    vector_gather_16_32_masked();
    print_gather_result_16_32();
    check_gather_16_32_masked();

    vector_scatter_16_32_acc();
    print_scatter16_32_buffer();
    check_scatter_16_32_acc();

    vector_scatter_16_32_masked();
    print_scatter16_32_buffer();
    check_scatter_16_32_masked();

    puts(err ? "FAIL" : "PASS");
    return err;
}