xref: /openbmc/qemu/tests/tcg/hexagon/scatter_gather.c (revision aa4af821296ceaf3abb856920b0f3f0822c2d8fd)
1  /*
2   *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
3   *
4   *  This program is free software; you can redistribute it and/or modify
5   *  it under the terms of the GNU General Public License as published by
6   *  the Free Software Foundation; either version 2 of the License, or
7   *  (at your option) any later version.
8   *
9   *  This program is distributed in the hope that it will be useful,
10   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   *  GNU General Public License for more details.
13   *
14   *  You should have received a copy of the GNU General Public License
15   *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16   */
17  
/*
 * This example tests the HVX scatter/gather instructions
 *
 * See section 5.13 of the V68 HVX Programmer's Reference
 *
 * There are 3 main classes of operations
 *     _16                 16-bit elements and 16-bit offsets
 *     _32                 32-bit elements and 32-bit offsets
 *     _16_32              16-bit elements and 32-bit offsets
 *
 * There are also masked and accumulate versions
 */
30  
31  #include <stdio.h>
32  #include <string.h>
33  #include <stdlib.h>
34  #include <inttypes.h>
35  
/*
 * HVX register types, all aligned to 128 bytes:
 *     HVX_Vector      128-byte vector
 *     HVX_VectorPair  256-byte vector pair
 *     HVX_VectorPred  128-byte predicate
 */
typedef long HVX_Vector       __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPair   __attribute__((__vector_size__(256)))
                              __attribute__((aligned(128)));
typedef long HVX_VectorPred   __attribute__((__vector_size__(128)))
                              __attribute__((aligned(128)));
42  
/* mismatch counter; check_buffer() increments it once per differing byte */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* fake vtcm - put buffers together and force alignment */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));

/* declare the arrays of reference values (filled by the scalar_* code) */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int   vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets (byte offsets into a scatter buffer) */
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of values */
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of predicates (non-zero element == lane enabled) */
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int   word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));

/* make this big enough for all the operations */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions */
#define SYNC_VECTOR 1
90  
/*
 * Synchronize after a scatter into the buffer at addr.
 * The body is compiled out when SYNC_VECTOR is 0.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization.  Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr;
    /* presumably here only to mark vDummy as used */
    vDummy = vDummy;
#endif
}
104  
/*
 * Synchronize after a gather by reading back one vector from addr.
 * The body is compiled out when SYNC_VECTOR is 0.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr;
    /* presumably here only to mark vDummy as used */
    vDummy = vDummy;
#endif
}
112  
/* optionally print the results */
#define PRINT_DATA 0

/* byte value used to pre-fill the vtcm and the scatter reference buffers */
#define FILL_CHAR       '.'
117  
118  /* fill vtcm scratch with ee */
prefill_vtcm_scratch(void)119  void prefill_vtcm_scratch(void)
120  {
121      memset(&vtcm, FILL_CHAR, sizeof(vtcm));
122  }
123  
124  /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
create_offsets_values_preds_16(void)125  void create_offsets_values_preds_16(void)
126  {
127      unsigned short half_element = 0;
128      unsigned short half_element_masked = 0;
129      char letter = 'A';
130      char letter_masked = '@';
131  
132      for (int i = 0; i < MATRIX_SIZE; i++) {
133          half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
134  
135          half_element = 0;
136          half_element_masked = 0;
137          for (int j = 0; j < 2; j++) {
138              half_element |= letter << j * 8;
139              half_element_masked |= letter_masked << j * 8;
140          }
141  
142          half_values[i] = half_element;
143          half_values_acc[i] = ((i % 10) << 8) + (i % 10);
144          half_values_masked[i] = half_element_masked;
145  
146          letter++;
147          /* reset to 'A' */
148          if (letter == 'M') {
149              letter = 'A';
150          }
151  
152          half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
153      }
154  }
155  
156  /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
create_offsets_values_preds_32(void)157  void create_offsets_values_preds_32(void)
158  {
159      unsigned int word_element = 0;
160      unsigned int word_element_masked = 0;
161      char letter = 'A';
162      char letter_masked = '&';
163  
164      for (int i = 0; i < MATRIX_SIZE; i++) {
165          word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
166  
167          word_element = 0;
168          word_element_masked = 0;
169          for (int j = 0; j < 4; j++) {
170              word_element |= letter << j * 8;
171              word_element_masked |= letter_masked << j * 8;
172          }
173  
174          word_values[i] = word_element;
175          word_values_acc[i] = ((i % 10) << 8) + (i % 10);
176          word_values_masked[i] = word_element_masked;
177  
178          letter++;
179          /* reset to 'A' */
180          if (letter == 'M') {
181              letter = 'A';
182          }
183  
184          word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
185      }
186  }
187  
188  /*
189   * create byte offsets to be a diagonal of the matrix with 16 bit elements
190   * and 32 bit offsets
191   */
create_offsets_values_preds_16_32(void)192  void create_offsets_values_preds_16_32(void)
193  {
194      unsigned short half_element = 0;
195      unsigned short half_element_masked = 0;
196      char letter = 'D';
197      char letter_masked = '$';
198  
199      for (int i = 0; i < MATRIX_SIZE; i++) {
200          word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
201  
202          half_element = 0;
203          half_element_masked = 0;
204          for (int j = 0; j < 2; j++) {
205              half_element |= letter << j * 8;
206              half_element_masked |= letter_masked << j * 8;
207          }
208  
209          half_values[i] = half_element;
210          half_values_acc[i] = ((i % 10) << 8) + (i % 10);
211          half_values_masked[i] = half_element_masked;
212  
213          letter++;
214          /* reset to 'A' */
215          if (letter == 'P') {
216              letter = 'D';
217          }
218  
219          half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
220      }
221  }
222  
223  /* scatter the 16 bit elements using HVX */
vector_scatter_16(void)224  void vector_scatter_16(void)
225  {
226      asm ("m0 = %1\n\t"
227           "v0 = vmem(%2 + #0)\n\t"
228           "v1 = vmem(%3 + #0)\n\t"
229           "vscatter(%0, m0, v0.h).h = v1\n\t"
230           : : "r"(vtcm.vscatter16), "r"(region_len),
231               "r"(half_offsets), "r"(half_values)
232           : "m0", "v0", "v1", "memory");
233  
234      sync_scatter(vtcm.vscatter16);
235  }
236  
237  /* scatter-accumulate the 16 bit elements using HVX */
vector_scatter_16_acc(void)238  void vector_scatter_16_acc(void)
239  {
240      asm ("m0 = %1\n\t"
241           "v0 = vmem(%2 + #0)\n\t"
242           "v1 = vmem(%3 + #0)\n\t"
243           "vscatter(%0, m0, v0.h).h += v1\n\t"
244           : : "r"(vtcm.vscatter16), "r"(region_len),
245               "r"(half_offsets), "r"(half_values_acc)
246           : "m0", "v0", "v1", "memory");
247  
248      sync_scatter(vtcm.vscatter16);
249  }
250  
251  /* masked scatter the 16 bit elements using HVX */
vector_scatter_16_masked(void)252  void vector_scatter_16_masked(void)
253  {
254      asm ("r1 = #-1\n\t"
255           "v0 = vmem(%0 + #0)\n\t"
256           "q0 = vand(v0, r1)\n\t"
257           "m0 = %2\n\t"
258           "v0 = vmem(%3 + #0)\n\t"
259           "v1 = vmem(%4 + #0)\n\t"
260           "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
261           : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
262               "r"(half_offsets), "r"(half_values_masked)
263           : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
264  
265      sync_scatter(vtcm.vscatter16);
266  }
267  
268  /* scatter the 32 bit elements using HVX */
vector_scatter_32(void)269  void vector_scatter_32(void)
270  {
271      HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
272      HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
273      HVX_Vector *valueslo = (HVX_Vector *)word_values;
274      HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
275  
276      asm ("m0 = %1\n\t"
277           "v0 = vmem(%2 + #0)\n\t"
278           "v1 = vmem(%3 + #0)\n\t"
279           "vscatter(%0, m0, v0.w).w = v1\n\t"
280           : : "r"(vtcm.vscatter32), "r"(region_len),
281               "r"(offsetslo), "r"(valueslo)
282           : "m0", "v0", "v1", "memory");
283      asm ("m0 = %1\n\t"
284           "v0 = vmem(%2 + #0)\n\t"
285           "v1 = vmem(%3 + #0)\n\t"
286           "vscatter(%0, m0, v0.w).w = v1\n\t"
287           : : "r"(vtcm.vscatter32), "r"(region_len),
288               "r"(offsetshi), "r"(valueshi)
289           : "m0", "v0", "v1", "memory");
290  
291      sync_scatter(vtcm.vscatter32);
292  }
293  
294  /* scatter-accumulate the 32 bit elements using HVX */
vector_scatter_32_acc(void)295  void vector_scatter_32_acc(void)
296  {
297      HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
298      HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
299      HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
300      HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
301  
302      asm ("m0 = %1\n\t"
303           "v0 = vmem(%2 + #0)\n\t"
304           "v1 = vmem(%3 + #0)\n\t"
305           "vscatter(%0, m0, v0.w).w += v1\n\t"
306           : : "r"(vtcm.vscatter32), "r"(region_len),
307               "r"(offsetslo), "r"(valueslo)
308           : "m0", "v0", "v1", "memory");
309      asm ("m0 = %1\n\t"
310           "v0 = vmem(%2 + #0)\n\t"
311           "v1 = vmem(%3 + #0)\n\t"
312           "vscatter(%0, m0, v0.w).w += v1\n\t"
313           : : "r"(vtcm.vscatter32), "r"(region_len),
314               "r"(offsetshi), "r"(valueshi)
315           : "m0", "v0", "v1", "memory");
316  
317      sync_scatter(vtcm.vscatter32);
318  }
319  
320  /* masked scatter the 32 bit elements using HVX */
vector_scatter_32_masked(void)321  void vector_scatter_32_masked(void)
322  {
323      HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
324      HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
325      HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
326      HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
327      HVX_Vector *predslo = (HVX_Vector *)word_predicates;
328      HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
329  
330      asm ("r1 = #-1\n\t"
331           "v0 = vmem(%0 + #0)\n\t"
332           "q0 = vand(v0, r1)\n\t"
333           "m0 = %2\n\t"
334           "v0 = vmem(%3 + #0)\n\t"
335           "v1 = vmem(%4 + #0)\n\t"
336           "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
337           : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
338               "r"(offsetslo), "r"(valueslo)
339           : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
340      asm ("r1 = #-1\n\t"
341           "v0 = vmem(%0 + #0)\n\t"
342           "q0 = vand(v0, r1)\n\t"
343           "m0 = %2\n\t"
344           "v0 = vmem(%3 + #0)\n\t"
345           "v1 = vmem(%4 + #0)\n\t"
346           "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
347           : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
348               "r"(offsetshi), "r"(valueshi)
349           : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
350  
351      sync_scatter(vtcm.vscatter32);
352  }
353  
354  /* scatter the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32(void)355  void vector_scatter_16_32(void)
356  {
357      asm ("m0 = %1\n\t"
358           "v0 = vmem(%2 + #0)\n\t"
359           "v1 = vmem(%2 + #1)\n\t"
360           "v2 = vmem(%3 + #0)\n\t"
361           "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
362           "vscatter(%0, m0, v1:0.w).h = v2\n\t"
363           : : "r"(vtcm.vscatter16_32), "r"(region_len),
364               "r"(word_offsets), "r"(half_values)
365           : "m0", "v0", "v1", "v2", "memory");
366  
367      sync_scatter(vtcm.vscatter16_32);
368  }
369  
370  /* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_acc(void)371  void vector_scatter_16_32_acc(void)
372  {
373      asm ("m0 = %1\n\t"
374           "v0 = vmem(%2 + #0)\n\t"
375           "v1 = vmem(%2 + #1)\n\t"
376           "v2 = vmem(%3 + #0)\n\t" \
377           "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
378           "vscatter(%0, m0, v1:0.w).h += v2\n\t"
379           : : "r"(vtcm.vscatter16_32), "r"(region_len),
380               "r"(word_offsets), "r"(half_values_acc)
381           : "m0", "v0", "v1", "v2", "memory");
382  
383      sync_scatter(vtcm.vscatter16_32);
384  }
385  
386  /* masked scatter the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_masked(void)387  void vector_scatter_16_32_masked(void)
388  {
389      asm ("r1 = #-1\n\t"
390           "v0 = vmem(%0 + #0)\n\t"
391           "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
392           "q0 = vand(v0, r1)\n\t"
393           "m0 = %2\n\t"
394           "v0 = vmem(%3 + #0)\n\t"
395           "v1 = vmem(%3 + #1)\n\t"
396           "v2 = vmem(%4 + #0)\n\t" \
397           "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
398           "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
399           : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
400               "r"(word_offsets), "r"(half_values_masked)
401           : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
402  
403      sync_scatter(vtcm.vscatter16_32);
404  }
405  
406  /* gather the elements from the scatter16 buffer using HVX */
vector_gather_16(void)407  void vector_gather_16(void)
408  {
409      asm ("m0 = %1\n\t"
410           "v0 = vmem(%2 + #0)\n\t"
411           "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
412           "  vmem(%3 + #0) = vtmp.new }\n\t"
413           : : "r"(vtcm.vscatter16), "r"(region_len),
414               "r"(half_offsets), "r"(vtcm.vgather16)
415           : "m0", "v0", "memory");
416  
417      sync_gather(vtcm.vgather16);
418  }
419  
/*
 * Return the 16-bit pattern used to pre-fill a masked-gather destination:
 * the character '?' replicated into both bytes (0x3f3f).
 */
static unsigned short gather_16_masked_init(void)
{
    /* unsigned so the shift never operates on a (possibly signed) char */
    const unsigned int letter = '?';

    return (unsigned short)(letter | (letter << 8));
}
425  
426  /* masked gather the elements from the scatter16 buffer using HVX */
vector_gather_16_masked(void)427  void vector_gather_16_masked(void)
428  {
429      unsigned short init = gather_16_masked_init();
430  
431      asm ("v0.h = vsplat(%5)\n\t"
432           "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
433           "r1 = #-1\n\t"
434           "v0 = vmem(%0 + #0)\n\t"
435           "q0 = vand(v0, r1)\n\t"
436           "m0 = %2\n\t"
437           "v0 = vmem(%3 + #0)\n\t"
438           "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
439           "  vmem(%4 + #0) = vtmp.new }\n\t"
440           : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
441               "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
442           : "r1", "q0", "m0", "v0", "memory");
443  
444      sync_gather(vtcm.vgather16);
445  }
446  
447  /* gather the elements from the scatter32 buffer using HVX */
vector_gather_32(void)448  void vector_gather_32(void)
449  {
450      HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
451      HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
452      HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
453      HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
454  
455      asm ("m0 = %1\n\t"
456           "v0 = vmem(%2 + #0)\n\t"
457           "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
458           "  vmem(%3 + #0) = vtmp.new }\n\t"
459           : : "r"(vtcm.vscatter32), "r"(region_len),
460               "r"(offsetslo), "r"(vgatherlo)
461           : "m0", "v0", "memory");
462      asm ("m0 = %1\n\t"
463           "v0 = vmem(%2 + #0)\n\t"
464           "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
465           "  vmem(%3 + #0) = vtmp.new }\n\t"
466           : : "r"(vtcm.vscatter32), "r"(region_len),
467               "r"(offsetshi), "r"(vgatherhi)
468           : "m0", "v0", "memory");
469  
470      sync_gather(vgatherlo);
471      sync_gather(vgatherhi);
472  }
473  
/*
 * Return the 32-bit pattern used to pre-fill a masked-gather destination:
 * the character '?' replicated into all four bytes (0x3f3f3f3f).
 */
static unsigned int gather_32_masked_init(void)
{
    /* unsigned so the shift by 24 never operates on a signed value */
    const unsigned int letter = '?';

    return letter | (letter << 8) | (letter << 16) | (letter << 24);
}
479  
480  /* masked gather the elements from the scatter32 buffer using HVX */
vector_gather_32_masked(void)481  void vector_gather_32_masked(void)
482  {
483      unsigned int init = gather_32_masked_init();
484      HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
485      HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
486      HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
487      HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
488      HVX_Vector *predslo = (HVX_Vector *)word_predicates;
489      HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
490  
491      asm ("v0.h = vsplat(%5)\n\t"
492           "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
493           "r1 = #-1\n\t"
494           "v0 = vmem(%0 + #0)\n\t"
495           "q0 = vand(v0, r1)\n\t"
496           "m0 = %2\n\t"
497           "v0 = vmem(%3 + #0)\n\t"
498           "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
499           "  vmem(%4 + #0) = vtmp.new }\n\t"
500           : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
501               "r"(offsetslo), "r"(vgatherlo), "r"(init)
502           : "r1", "q0", "m0", "v0", "memory");
503      asm ("v0.h = vsplat(%5)\n\t"
504           "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
505           "r1 = #-1\n\t"
506           "v0 = vmem(%0 + #0)\n\t"
507           "q0 = vand(v0, r1)\n\t"
508           "m0 = %2\n\t"
509           "v0 = vmem(%3 + #0)\n\t"
510           "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
511           "  vmem(%4 + #0) = vtmp.new }\n\t"
512           : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
513               "r"(offsetshi), "r"(vgatherhi), "r"(init)
514           : "r1", "q0", "m0", "v0", "memory");
515  
516      sync_gather(vgatherlo);
517      sync_gather(vgatherhi);
518  }
519  
520  /* gather the elements from the scatter16_32 buffer using HVX */
vector_gather_16_32(void)521  void vector_gather_16_32(void)
522  {
523      asm ("m0 = %1\n\t"
524           "v0 = vmem(%2 + #0)\n\t"
525           "v1 = vmem(%2 + #1)\n\t"
526           "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
527           "  vmem(%3 + #0) = vtmp.new }\n\t"
528           "v0 = vmem(%3 + #0)\n\t"
529           "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
530           "vmem(%3 + #0) = v0\n\t"
531           : : "r"(vtcm.vscatter16_32), "r"(region_len),
532               "r"(word_offsets), "r"(vtcm.vgather16_32)
533           : "m0", "v0", "v1", "memory");
534  
535      sync_gather(vtcm.vgather16_32);
536  }
537  
538  /* masked gather the elements from the scatter16_32 buffer using HVX */
vector_gather_16_32_masked(void)539  void vector_gather_16_32_masked(void)
540  {
541      unsigned short init = gather_16_masked_init();
542  
543      asm ("v0.h = vsplat(%5)\n\t"
544           "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
545           "r1 = #-1\n\t"
546           "v0 = vmem(%0 + #0)\n\t"
547           "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
548           "q0 = vand(v0, r1)\n\t"
549           "m0 = %2\n\t"
550           "v0 = vmem(%3 + #0)\n\t"
551           "v1 = vmem(%3 + #1)\n\t"
552           "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
553           "  vmem(%4 + #0) = vtmp.new }\n\t"
554           "v0 = vmem(%4 + #0)\n\t"
555           "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
556           "vmem(%4 + #0) = v0\n\t"
557           : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
558               "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
559           : "r1", "q0", "m0", "v0", "v1", "memory");
560  
561      sync_gather(vtcm.vgather16_32);
562  }
563  
check_buffer(const char * name,void * c,void * r,size_t size)564  static void check_buffer(const char *name, void *c, void *r, size_t size)
565  {
566      char *check = (char *)c;
567      char *ref = (char *)r;
568      for (int i = 0; i < size; i++) {
569          if (check[i] != ref[i]) {
570              printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
571                     check[i], check[i], ref[i], ref[i]);
572              err++;
573          }
574      }
575  }
576  
577  /*
578   * These scalar functions are the C equivalents of the vector functions that
579   * use HVX
580   */
581  
582  /* scatter the 16 bit elements using C */
scalar_scatter_16(unsigned short * vscatter16)583  void scalar_scatter_16(unsigned short *vscatter16)
584  {
585      for (int i = 0; i < MATRIX_SIZE; ++i) {
586          vscatter16[half_offsets[i] / 2] = half_values[i];
587      }
588  }
589  
check_scatter_16()590  void check_scatter_16()
591  {
592      memset(vscatter16_ref, FILL_CHAR,
593             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
594      scalar_scatter_16(vscatter16_ref);
595      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
596                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
597  }
598  
599  /* scatter the 16 bit elements using C */
scalar_scatter_16_acc(unsigned short * vscatter16)600  void scalar_scatter_16_acc(unsigned short *vscatter16)
601  {
602      for (int i = 0; i < MATRIX_SIZE; ++i) {
603          vscatter16[half_offsets[i] / 2] += half_values_acc[i];
604      }
605  }
606  
607  /* scatter-accumulate the 16 bit elements using C */
check_scatter_16_acc()608  void check_scatter_16_acc()
609  {
610      memset(vscatter16_ref, FILL_CHAR,
611             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612      scalar_scatter_16(vscatter16_ref);
613      scalar_scatter_16_acc(vscatter16_ref);
614      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
615                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
616  }
617  
618  /* masked scatter the 16 bit elements using C */
scalar_scatter_16_masked(unsigned short * vscatter16)619  void scalar_scatter_16_masked(unsigned short *vscatter16)
620  {
621      for (int i = 0; i < MATRIX_SIZE; i++) {
622          if (half_predicates[i]) {
623              vscatter16[half_offsets[i] / 2] = half_values_masked[i];
624          }
625      }
626  
627  }
628  
check_scatter_16_masked()629  void check_scatter_16_masked()
630  {
631      memset(vscatter16_ref, FILL_CHAR,
632             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
633      scalar_scatter_16(vscatter16_ref);
634      scalar_scatter_16_acc(vscatter16_ref);
635      scalar_scatter_16_masked(vscatter16_ref);
636      check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
637                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
638  }
639  
640  /* scatter the 32 bit elements using C */
scalar_scatter_32(unsigned int * vscatter32)641  void scalar_scatter_32(unsigned int *vscatter32)
642  {
643      for (int i = 0; i < MATRIX_SIZE; ++i) {
644          vscatter32[word_offsets[i] / 4] = word_values[i];
645      }
646  }
647  
check_scatter_32()648  void check_scatter_32()
649  {
650      memset(vscatter32_ref, FILL_CHAR,
651             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
652      scalar_scatter_32(vscatter32_ref);
653      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
654                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
655  }
656  
657  /* scatter-accumulate the 32 bit elements using C */
scalar_scatter_32_acc(unsigned int * vscatter32)658  void scalar_scatter_32_acc(unsigned int *vscatter32)
659  {
660      for (int i = 0; i < MATRIX_SIZE; ++i) {
661          vscatter32[word_offsets[i] / 4] += word_values_acc[i];
662      }
663  }
664  
check_scatter_32_acc()665  void check_scatter_32_acc()
666  {
667      memset(vscatter32_ref, FILL_CHAR,
668             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
669      scalar_scatter_32(vscatter32_ref);
670      scalar_scatter_32_acc(vscatter32_ref);
671      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
672                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
673  }
674  
675  /* masked scatter the 32 bit elements using C */
scalar_scatter_32_masked(unsigned int * vscatter32)676  void scalar_scatter_32_masked(unsigned int *vscatter32)
677  {
678      for (int i = 0; i < MATRIX_SIZE; i++) {
679          if (word_predicates[i]) {
680              vscatter32[word_offsets[i] / 4] = word_values_masked[i];
681          }
682      }
683  }
684  
check_scatter_32_masked()685  void check_scatter_32_masked()
686  {
687      memset(vscatter32_ref, FILL_CHAR,
688             SCATTER_BUFFER_SIZE * sizeof(unsigned int));
689      scalar_scatter_32(vscatter32_ref);
690      scalar_scatter_32_acc(vscatter32_ref);
691      scalar_scatter_32_masked(vscatter32_ref);
692      check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
693                    SCATTER_BUFFER_SIZE * sizeof(unsigned int));
694  }
695  
696  /* scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32(unsigned short * vscatter16_32)697  void scalar_scatter_16_32(unsigned short *vscatter16_32)
698  {
699      for (int i = 0; i < MATRIX_SIZE; ++i) {
700          vscatter16_32[word_offsets[i] / 2] = half_values[i];
701      }
702  }
703  
check_scatter_16_32()704  void check_scatter_16_32()
705  {
706      memset(vscatter16_32_ref, FILL_CHAR,
707             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
708      scalar_scatter_16_32(vscatter16_32_ref);
709      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
710                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
711  }
712  
713  /* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_acc(unsigned short * vscatter16_32)714  void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
715  {
716      for (int i = 0; i < MATRIX_SIZE; ++i) {
717          vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
718      }
719  }
720  
check_scatter_16_32_acc()721  void check_scatter_16_32_acc()
722  {
723      memset(vscatter16_32_ref, FILL_CHAR,
724             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
725      scalar_scatter_16_32(vscatter16_32_ref);
726      scalar_scatter_16_32_acc(vscatter16_32_ref);
727      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
728                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
729  }
730  
731  /* masked scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_masked(unsigned short * vscatter16_32)732  void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
733  {
734      for (int i = 0; i < MATRIX_SIZE; i++) {
735          if (half_predicates[i]) {
736              vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
737          }
738      }
739  }
740  
check_scatter_16_32_masked()741  void check_scatter_16_32_masked()
742  {
743      memset(vscatter16_32_ref, FILL_CHAR,
744             SCATTER_BUFFER_SIZE * sizeof(unsigned short));
745      scalar_scatter_16_32(vscatter16_32_ref);
746      scalar_scatter_16_32_acc(vscatter16_32_ref);
747      scalar_scatter_16_32_masked(vscatter16_32_ref);
748      check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
749                   SCATTER_BUFFER_SIZE * sizeof(unsigned short));
750  }
751  
752  /* gather the elements from the scatter buffer using C */
scalar_gather_16(unsigned short * vgather16)753  void scalar_gather_16(unsigned short *vgather16)
754  {
755      for (int i = 0; i < MATRIX_SIZE; ++i) {
756          vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
757      }
758  }
759  
check_gather_16()760  void check_gather_16()
761  {
762        memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
763        scalar_gather_16(vgather16_ref);
764        check_buffer(__func__, vtcm.vgather16, vgather16_ref,
765                     MATRIX_SIZE * sizeof(unsigned short));
766  }
767  
768  /* masked gather the elements from the scatter buffer using C */
scalar_gather_16_masked(unsigned short * vgather16)769  void scalar_gather_16_masked(unsigned short *vgather16)
770  {
771      for (int i = 0; i < MATRIX_SIZE; ++i) {
772          if (half_predicates[i]) {
773              vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
774          }
775      }
776  }
777  
check_gather_16_masked()778  void check_gather_16_masked()
779  {
780      memset(vgather16_ref, gather_16_masked_init(),
781             MATRIX_SIZE * sizeof(unsigned short));
782      scalar_gather_16_masked(vgather16_ref);
783      check_buffer(__func__, vtcm.vgather16, vgather16_ref,
784                   MATRIX_SIZE * sizeof(unsigned short));
785  }
786  
787  /* gather the elements from the scatter32 buffer using C */
scalar_gather_32(unsigned int * vgather32)788  void scalar_gather_32(unsigned int *vgather32)
789  {
790      for (int i = 0; i < MATRIX_SIZE; ++i) {
791          vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
792      }
793  }
794  
check_gather_32(void)795  void check_gather_32(void)
796  {
797      memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
798      scalar_gather_32(vgather32_ref);
799      check_buffer(__func__, vtcm.vgather32, vgather32_ref,
800                   MATRIX_SIZE * sizeof(unsigned int));
801  }
802  
803  /* masked gather the elements from the scatter32 buffer using C */
scalar_gather_32_masked(unsigned int * vgather32)804  void scalar_gather_32_masked(unsigned int *vgather32)
805  {
806      for (int i = 0; i < MATRIX_SIZE; ++i) {
807          if (word_predicates[i]) {
808              vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
809          }
810      }
811  }
812  
check_gather_32_masked(void)813  void check_gather_32_masked(void)
814  {
815      memset(vgather32_ref, gather_32_masked_init(),
816             MATRIX_SIZE * sizeof(unsigned int));
817      scalar_gather_32_masked(vgather32_ref);
818      check_buffer(__func__, vtcm.vgather32,
819                   vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
820  }
821  
822  /* gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32(unsigned short * vgather16_32)823  void scalar_gather_16_32(unsigned short *vgather16_32)
824  {
825      for (int i = 0; i < MATRIX_SIZE; ++i) {
826          vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
827      }
828  }
829  
check_gather_16_32(void)830  void check_gather_16_32(void)
831  {
832      memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
833      scalar_gather_16_32(vgather16_32_ref);
834      check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
835                   MATRIX_SIZE * sizeof(unsigned short));
836  }
837  
838  /* masked gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32_masked(unsigned short * vgather16_32)839  void scalar_gather_16_32_masked(unsigned short *vgather16_32)
840  {
841      for (int i = 0; i < MATRIX_SIZE; ++i) {
842          if (half_predicates[i]) {
843              vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
844          }
845      }
846  
847  }
848  
check_gather_16_32_masked(void)849  void check_gather_16_32_masked(void)
850  {
851      memset(vgather16_32_ref, gather_16_masked_init(),
852             MATRIX_SIZE * sizeof(unsigned short));
853      scalar_gather_16_32_masked(vgather16_32_ref);
854      check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
855                   MATRIX_SIZE * sizeof(unsigned short));
856  }
857  
858  /* print scatter16 buffer */
print_scatter16_buffer(void)859  void print_scatter16_buffer(void)
860  {
861      if (PRINT_DATA) {
862          printf("\n\nPrinting the 16 bit scatter buffer");
863  
864          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
865              if ((i % MATRIX_SIZE) == 0) {
866                  printf("\n");
867              }
868              for (int j = 0; j < 2; j++) {
869                  printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
870              }
871              printf(" ");
872          }
873          printf("\n");
874      }
875  }
876  
877  /* print the gather 16 buffer */
print_gather_result_16(void)878  void print_gather_result_16(void)
879  {
880      if (PRINT_DATA) {
881          printf("\n\nPrinting the 16 bit gather result\n");
882  
883          for (int i = 0; i < MATRIX_SIZE; i++) {
884              for (int j = 0; j < 2; j++) {
885                  printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
886              }
887              printf(" ");
888          }
889          printf("\n");
890      }
891  }
892  
893  /* print the scatter32 buffer */
print_scatter32_buffer(void)894  void print_scatter32_buffer(void)
895  {
896      if (PRINT_DATA) {
897          printf("\n\nPrinting the 32 bit scatter buffer");
898  
899          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
900              if ((i % MATRIX_SIZE) == 0) {
901                  printf("\n");
902              }
903              for (int j = 0; j < 4; j++) {
904                  printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
905              }
906              printf(" ");
907          }
908          printf("\n");
909      }
910  }
911  
912  /* print the gather 32 buffer */
print_gather_result_32(void)913  void print_gather_result_32(void)
914  {
915      if (PRINT_DATA) {
916          printf("\n\nPrinting the 32 bit gather result\n");
917  
918          for (int i = 0; i < MATRIX_SIZE; i++) {
919              for (int j = 0; j < 4; j++) {
920                  printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
921              }
922              printf(" ");
923          }
924          printf("\n");
925      }
926  }
927  
928  /* print the scatter16_32 buffer */
print_scatter16_32_buffer(void)929  void print_scatter16_32_buffer(void)
930  {
931      if (PRINT_DATA) {
932          printf("\n\nPrinting the 16_32 bit scatter buffer");
933  
934          for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
935              if ((i % MATRIX_SIZE) == 0) {
936                  printf("\n");
937              }
938              for (int j = 0; j < 2; j++) {
939                  printf("%c",
940                        (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
941              }
942              printf(" ");
943          }
944          printf("\n");
945      }
946  }
947  
948  /* print the gather 16_32 buffer */
print_gather_result_16_32(void)949  void print_gather_result_16_32(void)
950  {
951      if (PRINT_DATA) {
952          printf("\n\nPrinting the 16_32 bit gather result\n");
953  
954          for (int i = 0; i < MATRIX_SIZE; i++) {
955              for (int j = 0; j < 2; j++) {
956                  printf("%c",
957                         (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
958              }
959              printf(" ");
960          }
961          printf("\n");
962      }
963  }
964  
main()965  int main()
966  {
967      prefill_vtcm_scratch();
968  
969      /* 16 bit elements with 16 bit offsets */
970      create_offsets_values_preds_16();
971  
972      vector_scatter_16();
973      print_scatter16_buffer();
974      check_scatter_16();
975  
976      vector_gather_16();
977      print_gather_result_16();
978      check_gather_16();
979  
980      vector_gather_16_masked();
981      print_gather_result_16();
982      check_gather_16_masked();
983  
984      vector_scatter_16_acc();
985      print_scatter16_buffer();
986      check_scatter_16_acc();
987  
988      vector_scatter_16_masked();
989      print_scatter16_buffer();
990      check_scatter_16_masked();
991  
992      /* 32 bit elements with 32 bit offsets */
993      create_offsets_values_preds_32();
994  
995      vector_scatter_32();
996      print_scatter32_buffer();
997      check_scatter_32();
998  
999      vector_gather_32();
1000      print_gather_result_32();
1001      check_gather_32();
1002  
1003      vector_gather_32_masked();
1004      print_gather_result_32();
1005      check_gather_32_masked();
1006  
1007      vector_scatter_32_acc();
1008      print_scatter32_buffer();
1009      check_scatter_32_acc();
1010  
1011      vector_scatter_32_masked();
1012      print_scatter32_buffer();
1013      check_scatter_32_masked();
1014  
1015      /* 16 bit elements with 32 bit offsets */
1016      create_offsets_values_preds_16_32();
1017  
1018      vector_scatter_16_32();
1019      print_scatter16_32_buffer();
1020      check_scatter_16_32();
1021  
1022      vector_gather_16_32();
1023      print_gather_result_16_32();
1024      check_gather_16_32();
1025  
1026      vector_gather_16_32_masked();
1027      print_gather_result_16_32();
1028      check_gather_16_32_masked();
1029  
1030      vector_scatter_16_32_acc();
1031      print_scatter16_32_buffer();
1032      check_scatter_16_32_acc();
1033  
1034      vector_scatter_16_32_masked();
1035      print_scatter16_32_buffer();
1036      check_scatter_16_32_masked();
1037  
1038      puts(err ? "FAIL" : "PASS");
1039      return err;
1040  }
1041