162e93b08STaylor Simpson /*
2*c3679385STaylor Simpson  *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
362e93b08STaylor Simpson  *
462e93b08STaylor Simpson  *  This program is free software; you can redistribute it and/or modify
562e93b08STaylor Simpson  *  it under the terms of the GNU General Public License as published by
662e93b08STaylor Simpson  *  the Free Software Foundation; either version 2 of the License, or
762e93b08STaylor Simpson  *  (at your option) any later version.
862e93b08STaylor Simpson  *
962e93b08STaylor Simpson  *  This program is distributed in the hope that it will be useful,
1062e93b08STaylor Simpson  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
1162e93b08STaylor Simpson  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1262e93b08STaylor Simpson  *  GNU General Public License for more details.
1362e93b08STaylor Simpson  *
1462e93b08STaylor Simpson  *  You should have received a copy of the GNU General Public License
1562e93b08STaylor Simpson  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
1662e93b08STaylor Simpson  */
1762e93b08STaylor Simpson 
/*
 * This example tests the HVX scatter/gather instructions
 *
 * See section 5.13 of the V68 HVX Programmer's Reference
 *
 * There are 3 main classes of operations
 *     _16                 16-bit elements and 16-bit offsets
 *     _32                 32-bit elements and 32-bit offsets
 *     _16_32              16-bit elements and 32-bit offsets
 *
 * There are also masked and accumulate versions
 */
3062e93b08STaylor Simpson 
3162e93b08STaylor Simpson #include <stdio.h>
3262e93b08STaylor Simpson #include <string.h>
3362e93b08STaylor Simpson #include <stdlib.h>
3462e93b08STaylor Simpson #include <inttypes.h>
3562e93b08STaylor Simpson 
/*
 * Local definitions of the HVX register types: a 128-byte vector, a
 * 256-byte vector pair and a 128-byte predicate, all 128-byte aligned.
 */
typedef long HVX_Vector
    __attribute__((__vector_size__(128), aligned(128)));
typedef long HVX_VectorPair
    __attribute__((__vector_size__(256), aligned(128)));
typedef long HVX_VectorPred
    __attribute__((__vector_size__(128), aligned(128)));
4262e93b08STaylor Simpson 
/* global error indicator (presumably bumped by the checkers - confirm) */
int err;

/* number of rows (and columns) of the square matrix */
#define MATRIX_SIZE 64

/* the scatter buffer holds one element per matrix entry */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
5062e93b08STaylor Simpson 
5162e93b08STaylor Simpson /* fake vtcm - put buffers together and force alignment */
5262e93b08STaylor Simpson static struct {
5362e93b08STaylor Simpson     unsigned short vscatter16[SCATTER_BUFFER_SIZE];
5462e93b08STaylor Simpson     unsigned short vgather16[MATRIX_SIZE];
5562e93b08STaylor Simpson     unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
5662e93b08STaylor Simpson     unsigned int   vgather32[MATRIX_SIZE];
5762e93b08STaylor Simpson     unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
5862e93b08STaylor Simpson     unsigned short vgather16_32[MATRIX_SIZE];
5962e93b08STaylor Simpson } vtcm __attribute__((aligned(0x10000)));
6062e93b08STaylor Simpson 
6162e93b08STaylor Simpson /* declare the arrays of reference values */
6262e93b08STaylor Simpson unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
6362e93b08STaylor Simpson unsigned short vgather16_ref[MATRIX_SIZE];
6462e93b08STaylor Simpson unsigned int   vscatter32_ref[SCATTER_BUFFER_SIZE];
6562e93b08STaylor Simpson unsigned int   vgather32_ref[MATRIX_SIZE];
6662e93b08STaylor Simpson unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
6762e93b08STaylor Simpson unsigned short vgather16_32_ref[MATRIX_SIZE];
6862e93b08STaylor Simpson 
6962e93b08STaylor Simpson /* declare the arrays of offsets */
70*c3679385STaylor Simpson unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
71*c3679385STaylor Simpson unsigned int   word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
7262e93b08STaylor Simpson 
7362e93b08STaylor Simpson /* declare the arrays of values */
74*c3679385STaylor Simpson unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
75*c3679385STaylor Simpson unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
76*c3679385STaylor Simpson unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
77*c3679385STaylor Simpson unsigned int   word_values[MATRIX_SIZE] __attribute__((aligned(128)));
78*c3679385STaylor Simpson unsigned int   word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
79*c3679385STaylor Simpson unsigned int   word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
8062e93b08STaylor Simpson 
8162e93b08STaylor Simpson /* declare the arrays of predicates */
82*c3679385STaylor Simpson unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
83*c3679385STaylor Simpson unsigned int   word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
8462e93b08STaylor Simpson 
85*c3679385STaylor Simpson /* make this big enough for all the operations */
8662e93b08STaylor Simpson const size_t region_len = sizeof(vtcm);
8762e93b08STaylor Simpson 
8862e93b08STaylor Simpson /* optionally add sync instructions */
8962e93b08STaylor Simpson #define SYNC_VECTOR 1
9062e93b08STaylor Simpson 
sync_scatter(void * addr)9162e93b08STaylor Simpson static void sync_scatter(void *addr)
9262e93b08STaylor Simpson {
9362e93b08STaylor Simpson #if SYNC_VECTOR
9462e93b08STaylor Simpson     /*
9562e93b08STaylor Simpson      * Do the scatter release followed by a dummy load to complete the
9662e93b08STaylor Simpson      * synchronization.  Normally the dummy load would be deferred as
9762e93b08STaylor Simpson      * long as possible to minimize stalls.
9862e93b08STaylor Simpson      */
9962e93b08STaylor Simpson     asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
10062e93b08STaylor Simpson     /* use volatile to force the load */
10162e93b08STaylor Simpson     volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
10262e93b08STaylor Simpson #endif
10362e93b08STaylor Simpson }
10462e93b08STaylor Simpson 
/*
 * Read back the first vector at addr (the destination of a gather) so the
 * gather result is observed before the caller continues.
 *
 * addr: start of the buffer that was gathered into.
 *
 * Compiles to nothing when SYNC_VECTOR is 0.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* load through a volatile lvalue so the read cannot be optimized away */
    volatile HVX_Vector vDummy = *(volatile HVX_Vector *)addr;
    (void)vDummy;
#endif
}
11262e93b08STaylor Simpson 
/* set to 1 to print the results */
#define PRINT_DATA 0

/* byte used to pre-fill the vtcm buffers */
#define FILL_CHAR       '.'
11762e93b08STaylor Simpson 
11862e93b08STaylor Simpson /* fill vtcm scratch with ee */
prefill_vtcm_scratch(void)11962e93b08STaylor Simpson void prefill_vtcm_scratch(void)
12062e93b08STaylor Simpson {
12162e93b08STaylor Simpson     memset(&vtcm, FILL_CHAR, sizeof(vtcm));
12262e93b08STaylor Simpson }
12362e93b08STaylor Simpson 
12462e93b08STaylor Simpson /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
create_offsets_values_preds_16(void)12562e93b08STaylor Simpson void create_offsets_values_preds_16(void)
12662e93b08STaylor Simpson {
12762e93b08STaylor Simpson     unsigned short half_element = 0;
12862e93b08STaylor Simpson     unsigned short half_element_masked = 0;
12962e93b08STaylor Simpson     char letter = 'A';
13062e93b08STaylor Simpson     char letter_masked = '@';
13162e93b08STaylor Simpson 
13262e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
13362e93b08STaylor Simpson         half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
13462e93b08STaylor Simpson 
13562e93b08STaylor Simpson         half_element = 0;
13662e93b08STaylor Simpson         half_element_masked = 0;
13762e93b08STaylor Simpson         for (int j = 0; j < 2; j++) {
13862e93b08STaylor Simpson             half_element |= letter << j * 8;
13962e93b08STaylor Simpson             half_element_masked |= letter_masked << j * 8;
14062e93b08STaylor Simpson         }
14162e93b08STaylor Simpson 
14262e93b08STaylor Simpson         half_values[i] = half_element;
14362e93b08STaylor Simpson         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
14462e93b08STaylor Simpson         half_values_masked[i] = half_element_masked;
14562e93b08STaylor Simpson 
14662e93b08STaylor Simpson         letter++;
14762e93b08STaylor Simpson         /* reset to 'A' */
14862e93b08STaylor Simpson         if (letter == 'M') {
14962e93b08STaylor Simpson             letter = 'A';
15062e93b08STaylor Simpson         }
15162e93b08STaylor Simpson 
15262e93b08STaylor Simpson         half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
15362e93b08STaylor Simpson     }
15462e93b08STaylor Simpson }
15562e93b08STaylor Simpson 
15662e93b08STaylor Simpson /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
create_offsets_values_preds_32(void)15762e93b08STaylor Simpson void create_offsets_values_preds_32(void)
15862e93b08STaylor Simpson {
15962e93b08STaylor Simpson     unsigned int word_element = 0;
16062e93b08STaylor Simpson     unsigned int word_element_masked = 0;
16162e93b08STaylor Simpson     char letter = 'A';
16262e93b08STaylor Simpson     char letter_masked = '&';
16362e93b08STaylor Simpson 
16462e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
16562e93b08STaylor Simpson         word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
16662e93b08STaylor Simpson 
16762e93b08STaylor Simpson         word_element = 0;
16862e93b08STaylor Simpson         word_element_masked = 0;
16962e93b08STaylor Simpson         for (int j = 0; j < 4; j++) {
17062e93b08STaylor Simpson             word_element |= letter << j * 8;
17162e93b08STaylor Simpson             word_element_masked |= letter_masked << j * 8;
17262e93b08STaylor Simpson         }
17362e93b08STaylor Simpson 
17462e93b08STaylor Simpson         word_values[i] = word_element;
17562e93b08STaylor Simpson         word_values_acc[i] = ((i % 10) << 8) + (i % 10);
17662e93b08STaylor Simpson         word_values_masked[i] = word_element_masked;
17762e93b08STaylor Simpson 
17862e93b08STaylor Simpson         letter++;
17962e93b08STaylor Simpson         /* reset to 'A' */
18062e93b08STaylor Simpson         if (letter == 'M') {
18162e93b08STaylor Simpson             letter = 'A';
18262e93b08STaylor Simpson         }
18362e93b08STaylor Simpson 
18462e93b08STaylor Simpson         word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
18562e93b08STaylor Simpson     }
18662e93b08STaylor Simpson }
18762e93b08STaylor Simpson 
18862e93b08STaylor Simpson /*
18962e93b08STaylor Simpson  * create byte offsets to be a diagonal of the matrix with 16 bit elements
19062e93b08STaylor Simpson  * and 32 bit offsets
19162e93b08STaylor Simpson  */
create_offsets_values_preds_16_32(void)19262e93b08STaylor Simpson void create_offsets_values_preds_16_32(void)
19362e93b08STaylor Simpson {
19462e93b08STaylor Simpson     unsigned short half_element = 0;
19562e93b08STaylor Simpson     unsigned short half_element_masked = 0;
19662e93b08STaylor Simpson     char letter = 'D';
19762e93b08STaylor Simpson     char letter_masked = '$';
19862e93b08STaylor Simpson 
19962e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
20062e93b08STaylor Simpson         word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
20162e93b08STaylor Simpson 
20262e93b08STaylor Simpson         half_element = 0;
20362e93b08STaylor Simpson         half_element_masked = 0;
20462e93b08STaylor Simpson         for (int j = 0; j < 2; j++) {
20562e93b08STaylor Simpson             half_element |= letter << j * 8;
20662e93b08STaylor Simpson             half_element_masked |= letter_masked << j * 8;
20762e93b08STaylor Simpson         }
20862e93b08STaylor Simpson 
20962e93b08STaylor Simpson         half_values[i] = half_element;
21062e93b08STaylor Simpson         half_values_acc[i] = ((i % 10) << 8) + (i % 10);
21162e93b08STaylor Simpson         half_values_masked[i] = half_element_masked;
21262e93b08STaylor Simpson 
21362e93b08STaylor Simpson         letter++;
21462e93b08STaylor Simpson         /* reset to 'A' */
21562e93b08STaylor Simpson         if (letter == 'P') {
21662e93b08STaylor Simpson             letter = 'D';
21762e93b08STaylor Simpson         }
21862e93b08STaylor Simpson 
21962e93b08STaylor Simpson         half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
22062e93b08STaylor Simpson     }
22162e93b08STaylor Simpson }
22262e93b08STaylor Simpson 
223*c3679385STaylor Simpson /* scatter the 16 bit elements using HVX */
vector_scatter_16(void)22462e93b08STaylor Simpson void vector_scatter_16(void)
22562e93b08STaylor Simpson {
226*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
227*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
228*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
229*c3679385STaylor Simpson          "vscatter(%0, m0, v0.h).h = v1\n\t"
230*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16), "r"(region_len),
231*c3679385STaylor Simpson              "r"(half_offsets), "r"(half_values)
232*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
23362e93b08STaylor Simpson 
23462e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16);
23562e93b08STaylor Simpson }
23662e93b08STaylor Simpson 
237*c3679385STaylor Simpson /* scatter-accumulate the 16 bit elements using HVX */
vector_scatter_16_acc(void)23862e93b08STaylor Simpson void vector_scatter_16_acc(void)
23962e93b08STaylor Simpson {
240*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
241*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
242*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
243*c3679385STaylor Simpson          "vscatter(%0, m0, v0.h).h += v1\n\t"
244*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16), "r"(region_len),
245*c3679385STaylor Simpson              "r"(half_offsets), "r"(half_values_acc)
246*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
24762e93b08STaylor Simpson 
24862e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16);
24962e93b08STaylor Simpson }
25062e93b08STaylor Simpson 
251*c3679385STaylor Simpson /* masked scatter the 16 bit elements using HVX */
vector_scatter_16_masked(void)25262e93b08STaylor Simpson void vector_scatter_16_masked(void)
25362e93b08STaylor Simpson {
254*c3679385STaylor Simpson     asm ("r1 = #-1\n\t"
255*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
256*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
257*c3679385STaylor Simpson          "m0 = %2\n\t"
258*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
259*c3679385STaylor Simpson          "v1 = vmem(%4 + #0)\n\t"
260*c3679385STaylor Simpson          "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
261*c3679385STaylor Simpson          : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
262*c3679385STaylor Simpson              "r"(half_offsets), "r"(half_values_masked)
263*c3679385STaylor Simpson          : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
26462e93b08STaylor Simpson 
26562e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16);
26662e93b08STaylor Simpson }
26762e93b08STaylor Simpson 
268*c3679385STaylor Simpson /* scatter the 32 bit elements using HVX */
vector_scatter_32(void)26962e93b08STaylor Simpson void vector_scatter_32(void)
27062e93b08STaylor Simpson {
271*c3679385STaylor Simpson     HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
272*c3679385STaylor Simpson     HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
273*c3679385STaylor Simpson     HVX_Vector *valueslo = (HVX_Vector *)word_values;
274*c3679385STaylor Simpson     HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
27562e93b08STaylor Simpson 
276*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
277*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
278*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
279*c3679385STaylor Simpson          "vscatter(%0, m0, v0.w).w = v1\n\t"
280*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
281*c3679385STaylor Simpson              "r"(offsetslo), "r"(valueslo)
282*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
283*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
284*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
285*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
286*c3679385STaylor Simpson          "vscatter(%0, m0, v0.w).w = v1\n\t"
287*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
288*c3679385STaylor Simpson              "r"(offsetshi), "r"(valueshi)
289*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
29062e93b08STaylor Simpson 
29162e93b08STaylor Simpson     sync_scatter(vtcm.vscatter32);
29262e93b08STaylor Simpson }
29362e93b08STaylor Simpson 
294*c3679385STaylor Simpson /* scatter-accumulate the 32 bit elements using HVX */
vector_scatter_32_acc(void)29562e93b08STaylor Simpson void vector_scatter_32_acc(void)
29662e93b08STaylor Simpson {
297*c3679385STaylor Simpson     HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
298*c3679385STaylor Simpson     HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
299*c3679385STaylor Simpson     HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
300*c3679385STaylor Simpson     HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
30162e93b08STaylor Simpson 
302*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
303*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
304*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
305*c3679385STaylor Simpson          "vscatter(%0, m0, v0.w).w += v1\n\t"
306*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
307*c3679385STaylor Simpson              "r"(offsetslo), "r"(valueslo)
308*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
309*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
310*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
311*c3679385STaylor Simpson          "v1 = vmem(%3 + #0)\n\t"
312*c3679385STaylor Simpson          "vscatter(%0, m0, v0.w).w += v1\n\t"
313*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
314*c3679385STaylor Simpson              "r"(offsetshi), "r"(valueshi)
315*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
31662e93b08STaylor Simpson 
31762e93b08STaylor Simpson     sync_scatter(vtcm.vscatter32);
31862e93b08STaylor Simpson }
31962e93b08STaylor Simpson 
320*c3679385STaylor Simpson /* masked scatter the 32 bit elements using HVX */
vector_scatter_32_masked(void)32162e93b08STaylor Simpson void vector_scatter_32_masked(void)
32262e93b08STaylor Simpson {
323*c3679385STaylor Simpson     HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
324*c3679385STaylor Simpson     HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
325*c3679385STaylor Simpson     HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
326*c3679385STaylor Simpson     HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
327*c3679385STaylor Simpson     HVX_Vector *predslo = (HVX_Vector *)word_predicates;
328*c3679385STaylor Simpson     HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
32962e93b08STaylor Simpson 
330*c3679385STaylor Simpson     asm ("r1 = #-1\n\t"
331*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
332*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
333*c3679385STaylor Simpson          "m0 = %2\n\t"
334*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
335*c3679385STaylor Simpson          "v1 = vmem(%4 + #0)\n\t"
336*c3679385STaylor Simpson          "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
337*c3679385STaylor Simpson          : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
338*c3679385STaylor Simpson              "r"(offsetslo), "r"(valueslo)
339*c3679385STaylor Simpson          : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
340*c3679385STaylor Simpson     asm ("r1 = #-1\n\t"
341*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
342*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
343*c3679385STaylor Simpson          "m0 = %2\n\t"
344*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
345*c3679385STaylor Simpson          "v1 = vmem(%4 + #0)\n\t"
346*c3679385STaylor Simpson          "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
347*c3679385STaylor Simpson          : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
348*c3679385STaylor Simpson              "r"(offsetshi), "r"(valueshi)
349*c3679385STaylor Simpson          : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
35062e93b08STaylor Simpson 
351*c3679385STaylor Simpson     sync_scatter(vtcm.vscatter32);
35262e93b08STaylor Simpson }
35362e93b08STaylor Simpson 
354*c3679385STaylor Simpson /* scatter the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32(void)35562e93b08STaylor Simpson void vector_scatter_16_32(void)
35662e93b08STaylor Simpson {
357*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
358*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
359*c3679385STaylor Simpson          "v1 = vmem(%2 + #1)\n\t"
360*c3679385STaylor Simpson          "v2 = vmem(%3 + #0)\n\t"
361*c3679385STaylor Simpson          "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
362*c3679385STaylor Simpson          "vscatter(%0, m0, v1:0.w).h = v2\n\t"
363*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16_32), "r"(region_len),
364*c3679385STaylor Simpson              "r"(word_offsets), "r"(half_values)
365*c3679385STaylor Simpson          : "m0", "v0", "v1", "v2", "memory");
36662e93b08STaylor Simpson 
36762e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16_32);
36862e93b08STaylor Simpson }
36962e93b08STaylor Simpson 
370*c3679385STaylor Simpson /* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_acc(void)37162e93b08STaylor Simpson void vector_scatter_16_32_acc(void)
37262e93b08STaylor Simpson {
373*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
374*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
375*c3679385STaylor Simpson          "v1 = vmem(%2 + #1)\n\t"
376*c3679385STaylor Simpson          "v2 = vmem(%3 + #0)\n\t" \
377*c3679385STaylor Simpson          "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
378*c3679385STaylor Simpson          "vscatter(%0, m0, v1:0.w).h += v2\n\t"
379*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16_32), "r"(region_len),
380*c3679385STaylor Simpson              "r"(word_offsets), "r"(half_values_acc)
381*c3679385STaylor Simpson          : "m0", "v0", "v1", "v2", "memory");
38262e93b08STaylor Simpson 
38362e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16_32);
38462e93b08STaylor Simpson }
38562e93b08STaylor Simpson 
386*c3679385STaylor Simpson /* masked scatter the 16 bit elements with 32 bit offsets using HVX */
vector_scatter_16_32_masked(void)38762e93b08STaylor Simpson void vector_scatter_16_32_masked(void)
38862e93b08STaylor Simpson {
389*c3679385STaylor Simpson     asm ("r1 = #-1\n\t"
390*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
391*c3679385STaylor Simpson          "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
392*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
393*c3679385STaylor Simpson          "m0 = %2\n\t"
394*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
395*c3679385STaylor Simpson          "v1 = vmem(%3 + #1)\n\t"
396*c3679385STaylor Simpson          "v2 = vmem(%4 + #0)\n\t" \
397*c3679385STaylor Simpson          "v2.h = vshuff(v2.h)\n\t"  /* shuffle the values for the scatter */
398*c3679385STaylor Simpson          "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
399*c3679385STaylor Simpson          : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
400*c3679385STaylor Simpson              "r"(word_offsets), "r"(half_values_masked)
401*c3679385STaylor Simpson          : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
40262e93b08STaylor Simpson 
40362e93b08STaylor Simpson     sync_scatter(vtcm.vscatter16_32);
40462e93b08STaylor Simpson }
40562e93b08STaylor Simpson 
406*c3679385STaylor Simpson /* gather the elements from the scatter16 buffer using HVX */
vector_gather_16(void)40762e93b08STaylor Simpson void vector_gather_16(void)
40862e93b08STaylor Simpson {
409*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
410*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
411*c3679385STaylor Simpson          "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
412*c3679385STaylor Simpson          "  vmem(%3 + #0) = vtmp.new }\n\t"
413*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16), "r"(region_len),
414*c3679385STaylor Simpson              "r"(half_offsets), "r"(vtcm.vgather16)
415*c3679385STaylor Simpson          : "m0", "v0", "memory");
41662e93b08STaylor Simpson 
417*c3679385STaylor Simpson     sync_gather(vtcm.vgather16);
41862e93b08STaylor Simpson }
41962e93b08STaylor Simpson 
/* initial value for the masked 16-bit gathers: '?' in both bytes */
static unsigned short gather_16_masked_init(void)
{
    const unsigned short fill = '?';

    return (fill << 8) | fill;
}
42562e93b08STaylor Simpson 
426*c3679385STaylor Simpson /* masked gather the elements from the scatter16 buffer using HVX */
vector_gather_16_masked(void)42762e93b08STaylor Simpson void vector_gather_16_masked(void)
42862e93b08STaylor Simpson {
429*c3679385STaylor Simpson     unsigned short init = gather_16_masked_init();
43062e93b08STaylor Simpson 
431*c3679385STaylor Simpson     asm ("v0.h = vsplat(%5)\n\t"
432*c3679385STaylor Simpson          "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
433*c3679385STaylor Simpson          "r1 = #-1\n\t"
434*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
435*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
436*c3679385STaylor Simpson          "m0 = %2\n\t"
437*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
438*c3679385STaylor Simpson          "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
439*c3679385STaylor Simpson          "  vmem(%4 + #0) = vtmp.new }\n\t"
440*c3679385STaylor Simpson          : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
441*c3679385STaylor Simpson              "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
442*c3679385STaylor Simpson          : "r1", "q0", "m0", "v0", "memory");
44362e93b08STaylor Simpson 
444*c3679385STaylor Simpson     sync_gather(vtcm.vgather16);
44562e93b08STaylor Simpson }
44662e93b08STaylor Simpson 
447*c3679385STaylor Simpson /* gather the elements from the scatter32 buffer using HVX */
vector_gather_32(void)44862e93b08STaylor Simpson void vector_gather_32(void)
44962e93b08STaylor Simpson {
450*c3679385STaylor Simpson     HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
451*c3679385STaylor Simpson     HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
452*c3679385STaylor Simpson     HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
453*c3679385STaylor Simpson     HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
45462e93b08STaylor Simpson 
455*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
456*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
457*c3679385STaylor Simpson          "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
458*c3679385STaylor Simpson          "  vmem(%3 + #0) = vtmp.new }\n\t"
459*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
460*c3679385STaylor Simpson              "r"(offsetslo), "r"(vgatherlo)
461*c3679385STaylor Simpson          : "m0", "v0", "memory");
462*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
463*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
464*c3679385STaylor Simpson          "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
465*c3679385STaylor Simpson          "  vmem(%3 + #0) = vtmp.new }\n\t"
466*c3679385STaylor Simpson          : : "r"(vtcm.vscatter32), "r"(region_len),
467*c3679385STaylor Simpson              "r"(offsetshi), "r"(vgatherhi)
468*c3679385STaylor Simpson          : "m0", "v0", "memory");
46962e93b08STaylor Simpson 
470*c3679385STaylor Simpson     sync_gather(vgatherlo);
47162e93b08STaylor Simpson     sync_gather(vgatherhi);
47262e93b08STaylor Simpson }
47362e93b08STaylor Simpson 
/* initial value for the masked 32-bit gathers: '?' in all four bytes */
static unsigned int gather_32_masked_init(void)
{
    /* multiplying a byte by 0x01010101 copies it into every byte lane */
    return (unsigned int)'?' * 0x01010101u;
}
47962e93b08STaylor Simpson 
480*c3679385STaylor Simpson /* masked gather the elements from the scatter32 buffer using HVX */
vector_gather_32_masked(void)48162e93b08STaylor Simpson void vector_gather_32_masked(void)
48262e93b08STaylor Simpson {
483*c3679385STaylor Simpson     unsigned int init = gather_32_masked_init();
484*c3679385STaylor Simpson     HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
485*c3679385STaylor Simpson     HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
486*c3679385STaylor Simpson     HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
487*c3679385STaylor Simpson     HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
488*c3679385STaylor Simpson     HVX_Vector *predslo = (HVX_Vector *)word_predicates;
489*c3679385STaylor Simpson     HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
49062e93b08STaylor Simpson 
491*c3679385STaylor Simpson     asm ("v0.h = vsplat(%5)\n\t"
492*c3679385STaylor Simpson          "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
493*c3679385STaylor Simpson          "r1 = #-1\n\t"
494*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
495*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
496*c3679385STaylor Simpson          "m0 = %2\n\t"
497*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
498*c3679385STaylor Simpson          "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
499*c3679385STaylor Simpson          "  vmem(%4 + #0) = vtmp.new }\n\t"
500*c3679385STaylor Simpson          : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
501*c3679385STaylor Simpson              "r"(offsetslo), "r"(vgatherlo), "r"(init)
502*c3679385STaylor Simpson          : "r1", "q0", "m0", "v0", "memory");
503*c3679385STaylor Simpson     asm ("v0.h = vsplat(%5)\n\t"
504*c3679385STaylor Simpson          "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
505*c3679385STaylor Simpson          "r1 = #-1\n\t"
506*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
507*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
508*c3679385STaylor Simpson          "m0 = %2\n\t"
509*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
510*c3679385STaylor Simpson          "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
511*c3679385STaylor Simpson          "  vmem(%4 + #0) = vtmp.new }\n\t"
512*c3679385STaylor Simpson          : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
513*c3679385STaylor Simpson              "r"(offsetshi), "r"(vgatherhi), "r"(init)
514*c3679385STaylor Simpson          : "r1", "q0", "m0", "v0", "memory");
51562e93b08STaylor Simpson 
51662e93b08STaylor Simpson     sync_gather(vgatherlo);
51762e93b08STaylor Simpson     sync_gather(vgatherhi);
51862e93b08STaylor Simpson }
51962e93b08STaylor Simpson 
520*c3679385STaylor Simpson /* gather the elements from the scatter16_32 buffer using HVX */
vector_gather_16_32(void)52162e93b08STaylor Simpson void vector_gather_16_32(void)
52262e93b08STaylor Simpson {
523*c3679385STaylor Simpson     asm ("m0 = %1\n\t"
524*c3679385STaylor Simpson          "v0 = vmem(%2 + #0)\n\t"
525*c3679385STaylor Simpson          "v1 = vmem(%2 + #1)\n\t"
526*c3679385STaylor Simpson          "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
527*c3679385STaylor Simpson          "  vmem(%3 + #0) = vtmp.new }\n\t"
528*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
529*c3679385STaylor Simpson          "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
530*c3679385STaylor Simpson          "vmem(%3 + #0) = v0\n\t"
531*c3679385STaylor Simpson          : : "r"(vtcm.vscatter16_32), "r"(region_len),
532*c3679385STaylor Simpson              "r"(word_offsets), "r"(vtcm.vgather16_32)
533*c3679385STaylor Simpson          : "m0", "v0", "v1", "memory");
53462e93b08STaylor Simpson 
535*c3679385STaylor Simpson     sync_gather(vtcm.vgather16_32);
53662e93b08STaylor Simpson }
53762e93b08STaylor Simpson 
538*c3679385STaylor Simpson /* masked gather the elements from the scatter16_32 buffer using HVX */
vector_gather_16_32_masked(void)53962e93b08STaylor Simpson void vector_gather_16_32_masked(void)
54062e93b08STaylor Simpson {
541*c3679385STaylor Simpson     unsigned short init = gather_16_masked_init();
54262e93b08STaylor Simpson 
543*c3679385STaylor Simpson     asm ("v0.h = vsplat(%5)\n\t"
544*c3679385STaylor Simpson          "vmem(%4 + #0) = v0\n\t"  /* initialize the write area */
545*c3679385STaylor Simpson          "r1 = #-1\n\t"
546*c3679385STaylor Simpson          "v0 = vmem(%0 + #0)\n\t"
547*c3679385STaylor Simpson          "v0.h = vshuff(v0.h)\n\t"  /* shuffle the predicates */
548*c3679385STaylor Simpson          "q0 = vand(v0, r1)\n\t"
549*c3679385STaylor Simpson          "m0 = %2\n\t"
550*c3679385STaylor Simpson          "v0 = vmem(%3 + #0)\n\t"
551*c3679385STaylor Simpson          "v1 = vmem(%3 + #1)\n\t"
552*c3679385STaylor Simpson          "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
553*c3679385STaylor Simpson          "  vmem(%4 + #0) = vtmp.new }\n\t"
554*c3679385STaylor Simpson          "v0 = vmem(%4 + #0)\n\t"
555*c3679385STaylor Simpson          "v0.h = vdeal(v0.h)\n\t"  /* deal the elements to get the order back */
556*c3679385STaylor Simpson          "vmem(%4 + #0) = v0\n\t"
557*c3679385STaylor Simpson          : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
558*c3679385STaylor Simpson              "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
559*c3679385STaylor Simpson          : "r1", "q0", "m0", "v0", "v1", "memory");
56062e93b08STaylor Simpson 
561*c3679385STaylor Simpson     sync_gather(vtcm.vgather16_32);
56262e93b08STaylor Simpson }
56362e93b08STaylor Simpson 
check_buffer(const char * name,void * c,void * r,size_t size)56462e93b08STaylor Simpson static void check_buffer(const char *name, void *c, void *r, size_t size)
56562e93b08STaylor Simpson {
56662e93b08STaylor Simpson     char *check = (char *)c;
56762e93b08STaylor Simpson     char *ref = (char *)r;
56862e93b08STaylor Simpson     for (int i = 0; i < size; i++) {
56962e93b08STaylor Simpson         if (check[i] != ref[i]) {
57062e93b08STaylor Simpson             printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
57162e93b08STaylor Simpson                    check[i], check[i], ref[i], ref[i]);
57262e93b08STaylor Simpson             err++;
57362e93b08STaylor Simpson         }
57462e93b08STaylor Simpson     }
57562e93b08STaylor Simpson }
57662e93b08STaylor Simpson 
57762e93b08STaylor Simpson /*
57862e93b08STaylor Simpson  * These scalar functions are the C equivalents of the vector functions that
57962e93b08STaylor Simpson  * use HVX
58062e93b08STaylor Simpson  */
58162e93b08STaylor Simpson 
58262e93b08STaylor Simpson /* scatter the 16 bit elements using C */
scalar_scatter_16(unsigned short * vscatter16)58362e93b08STaylor Simpson void scalar_scatter_16(unsigned short *vscatter16)
58462e93b08STaylor Simpson {
58562e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
58662e93b08STaylor Simpson         vscatter16[half_offsets[i] / 2] = half_values[i];
58762e93b08STaylor Simpson     }
58862e93b08STaylor Simpson }
58962e93b08STaylor Simpson 
check_scatter_16()59062e93b08STaylor Simpson void check_scatter_16()
59162e93b08STaylor Simpson {
59262e93b08STaylor Simpson     memset(vscatter16_ref, FILL_CHAR,
59362e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
59462e93b08STaylor Simpson     scalar_scatter_16(vscatter16_ref);
59562e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
59662e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
59762e93b08STaylor Simpson }
59862e93b08STaylor Simpson 
59962e93b08STaylor Simpson /* scatter the 16 bit elements using C */
scalar_scatter_16_acc(unsigned short * vscatter16)60062e93b08STaylor Simpson void scalar_scatter_16_acc(unsigned short *vscatter16)
60162e93b08STaylor Simpson {
60262e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
60362e93b08STaylor Simpson         vscatter16[half_offsets[i] / 2] += half_values_acc[i];
60462e93b08STaylor Simpson     }
60562e93b08STaylor Simpson }
60662e93b08STaylor Simpson 
607*c3679385STaylor Simpson /* scatter-accumulate the 16 bit elements using C */
check_scatter_16_acc()60862e93b08STaylor Simpson void check_scatter_16_acc()
60962e93b08STaylor Simpson {
61062e93b08STaylor Simpson     memset(vscatter16_ref, FILL_CHAR,
61162e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
61262e93b08STaylor Simpson     scalar_scatter_16(vscatter16_ref);
61362e93b08STaylor Simpson     scalar_scatter_16_acc(vscatter16_ref);
61462e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
61562e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
61662e93b08STaylor Simpson }
61762e93b08STaylor Simpson 
618*c3679385STaylor Simpson /* masked scatter the 16 bit elements using C */
scalar_scatter_16_masked(unsigned short * vscatter16)61962e93b08STaylor Simpson void scalar_scatter_16_masked(unsigned short *vscatter16)
62062e93b08STaylor Simpson {
62162e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
62262e93b08STaylor Simpson         if (half_predicates[i]) {
62362e93b08STaylor Simpson             vscatter16[half_offsets[i] / 2] = half_values_masked[i];
62462e93b08STaylor Simpson         }
62562e93b08STaylor Simpson     }
62662e93b08STaylor Simpson 
62762e93b08STaylor Simpson }
62862e93b08STaylor Simpson 
check_scatter_16_masked()62962e93b08STaylor Simpson void check_scatter_16_masked()
63062e93b08STaylor Simpson {
63162e93b08STaylor Simpson     memset(vscatter16_ref, FILL_CHAR,
63262e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
63362e93b08STaylor Simpson     scalar_scatter_16(vscatter16_ref);
63462e93b08STaylor Simpson     scalar_scatter_16_acc(vscatter16_ref);
63562e93b08STaylor Simpson     scalar_scatter_16_masked(vscatter16_ref);
63662e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
63762e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
63862e93b08STaylor Simpson }
63962e93b08STaylor Simpson 
64062e93b08STaylor Simpson /* scatter the 32 bit elements using C */
scalar_scatter_32(unsigned int * vscatter32)64162e93b08STaylor Simpson void scalar_scatter_32(unsigned int *vscatter32)
64262e93b08STaylor Simpson {
64362e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
64462e93b08STaylor Simpson         vscatter32[word_offsets[i] / 4] = word_values[i];
64562e93b08STaylor Simpson     }
64662e93b08STaylor Simpson }
64762e93b08STaylor Simpson 
check_scatter_32()64862e93b08STaylor Simpson void check_scatter_32()
64962e93b08STaylor Simpson {
65062e93b08STaylor Simpson     memset(vscatter32_ref, FILL_CHAR,
65162e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
65262e93b08STaylor Simpson     scalar_scatter_32(vscatter32_ref);
65362e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
65462e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
65562e93b08STaylor Simpson }
65662e93b08STaylor Simpson 
657*c3679385STaylor Simpson /* scatter-accumulate the 32 bit elements using C */
scalar_scatter_32_acc(unsigned int * vscatter32)65862e93b08STaylor Simpson void scalar_scatter_32_acc(unsigned int *vscatter32)
65962e93b08STaylor Simpson {
66062e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
66162e93b08STaylor Simpson         vscatter32[word_offsets[i] / 4] += word_values_acc[i];
66262e93b08STaylor Simpson     }
66362e93b08STaylor Simpson }
66462e93b08STaylor Simpson 
check_scatter_32_acc()66562e93b08STaylor Simpson void check_scatter_32_acc()
66662e93b08STaylor Simpson {
66762e93b08STaylor Simpson     memset(vscatter32_ref, FILL_CHAR,
66862e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
66962e93b08STaylor Simpson     scalar_scatter_32(vscatter32_ref);
67062e93b08STaylor Simpson     scalar_scatter_32_acc(vscatter32_ref);
67162e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
67262e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned int));
67362e93b08STaylor Simpson }
67462e93b08STaylor Simpson 
675*c3679385STaylor Simpson /* masked scatter the 32 bit elements using C */
scalar_scatter_32_masked(unsigned int * vscatter32)67662e93b08STaylor Simpson void scalar_scatter_32_masked(unsigned int *vscatter32)
67762e93b08STaylor Simpson {
67862e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
67962e93b08STaylor Simpson         if (word_predicates[i]) {
68062e93b08STaylor Simpson             vscatter32[word_offsets[i] / 4] = word_values_masked[i];
68162e93b08STaylor Simpson         }
68262e93b08STaylor Simpson     }
68362e93b08STaylor Simpson }
68462e93b08STaylor Simpson 
check_scatter_32_masked()68562e93b08STaylor Simpson void check_scatter_32_masked()
68662e93b08STaylor Simpson {
68762e93b08STaylor Simpson     memset(vscatter32_ref, FILL_CHAR,
68862e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned int));
68962e93b08STaylor Simpson     scalar_scatter_32(vscatter32_ref);
69062e93b08STaylor Simpson     scalar_scatter_32_acc(vscatter32_ref);
69162e93b08STaylor Simpson     scalar_scatter_32_masked(vscatter32_ref);
69262e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
69362e93b08STaylor Simpson                   SCATTER_BUFFER_SIZE * sizeof(unsigned int));
69462e93b08STaylor Simpson }
69562e93b08STaylor Simpson 
696*c3679385STaylor Simpson /* scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32(unsigned short * vscatter16_32)69762e93b08STaylor Simpson void scalar_scatter_16_32(unsigned short *vscatter16_32)
69862e93b08STaylor Simpson {
69962e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
70062e93b08STaylor Simpson         vscatter16_32[word_offsets[i] / 2] = half_values[i];
70162e93b08STaylor Simpson     }
70262e93b08STaylor Simpson }
70362e93b08STaylor Simpson 
check_scatter_16_32()70462e93b08STaylor Simpson void check_scatter_16_32()
70562e93b08STaylor Simpson {
70662e93b08STaylor Simpson     memset(vscatter16_32_ref, FILL_CHAR,
70762e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
70862e93b08STaylor Simpson     scalar_scatter_16_32(vscatter16_32_ref);
70962e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
71062e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
71162e93b08STaylor Simpson }
71262e93b08STaylor Simpson 
713*c3679385STaylor Simpson /* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_acc(unsigned short * vscatter16_32)71462e93b08STaylor Simpson void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
71562e93b08STaylor Simpson {
71662e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
71762e93b08STaylor Simpson         vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
71862e93b08STaylor Simpson     }
71962e93b08STaylor Simpson }
72062e93b08STaylor Simpson 
check_scatter_16_32_acc()72162e93b08STaylor Simpson void check_scatter_16_32_acc()
72262e93b08STaylor Simpson {
72362e93b08STaylor Simpson     memset(vscatter16_32_ref, FILL_CHAR,
72462e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
72562e93b08STaylor Simpson     scalar_scatter_16_32(vscatter16_32_ref);
72662e93b08STaylor Simpson     scalar_scatter_16_32_acc(vscatter16_32_ref);
72762e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
72862e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
72962e93b08STaylor Simpson }
73062e93b08STaylor Simpson 
731*c3679385STaylor Simpson /* masked scatter the 16 bit elements with 32 bit offsets using C */
scalar_scatter_16_32_masked(unsigned short * vscatter16_32)73262e93b08STaylor Simpson void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
73362e93b08STaylor Simpson {
73462e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; i++) {
73562e93b08STaylor Simpson         if (half_predicates[i]) {
73662e93b08STaylor Simpson             vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
73762e93b08STaylor Simpson         }
73862e93b08STaylor Simpson     }
73962e93b08STaylor Simpson }
74062e93b08STaylor Simpson 
check_scatter_16_32_masked()74162e93b08STaylor Simpson void check_scatter_16_32_masked()
74262e93b08STaylor Simpson {
74362e93b08STaylor Simpson     memset(vscatter16_32_ref, FILL_CHAR,
74462e93b08STaylor Simpson            SCATTER_BUFFER_SIZE * sizeof(unsigned short));
74562e93b08STaylor Simpson     scalar_scatter_16_32(vscatter16_32_ref);
74662e93b08STaylor Simpson     scalar_scatter_16_32_acc(vscatter16_32_ref);
74762e93b08STaylor Simpson     scalar_scatter_16_32_masked(vscatter16_32_ref);
74862e93b08STaylor Simpson     check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
74962e93b08STaylor Simpson                  SCATTER_BUFFER_SIZE * sizeof(unsigned short));
75062e93b08STaylor Simpson }
75162e93b08STaylor Simpson 
75262e93b08STaylor Simpson /* gather the elements from the scatter buffer using C */
scalar_gather_16(unsigned short * vgather16)75362e93b08STaylor Simpson void scalar_gather_16(unsigned short *vgather16)
75462e93b08STaylor Simpson {
75562e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
75662e93b08STaylor Simpson         vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
75762e93b08STaylor Simpson     }
75862e93b08STaylor Simpson }
75962e93b08STaylor Simpson 
check_gather_16()76062e93b08STaylor Simpson void check_gather_16()
76162e93b08STaylor Simpson {
76262e93b08STaylor Simpson       memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
76362e93b08STaylor Simpson       scalar_gather_16(vgather16_ref);
76462e93b08STaylor Simpson       check_buffer(__func__, vtcm.vgather16, vgather16_ref,
76562e93b08STaylor Simpson                    MATRIX_SIZE * sizeof(unsigned short));
76662e93b08STaylor Simpson }
76762e93b08STaylor Simpson 
768*c3679385STaylor Simpson /* masked gather the elements from the scatter buffer using C */
scalar_gather_16_masked(unsigned short * vgather16)76962e93b08STaylor Simpson void scalar_gather_16_masked(unsigned short *vgather16)
77062e93b08STaylor Simpson {
77162e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
77262e93b08STaylor Simpson         if (half_predicates[i]) {
77362e93b08STaylor Simpson             vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
77462e93b08STaylor Simpson         }
77562e93b08STaylor Simpson     }
77662e93b08STaylor Simpson }
77762e93b08STaylor Simpson 
check_gather_16_masked()77862e93b08STaylor Simpson void check_gather_16_masked()
77962e93b08STaylor Simpson {
78062e93b08STaylor Simpson     memset(vgather16_ref, gather_16_masked_init(),
78162e93b08STaylor Simpson            MATRIX_SIZE * sizeof(unsigned short));
78262e93b08STaylor Simpson     scalar_gather_16_masked(vgather16_ref);
78362e93b08STaylor Simpson     check_buffer(__func__, vtcm.vgather16, vgather16_ref,
78462e93b08STaylor Simpson                  MATRIX_SIZE * sizeof(unsigned short));
78562e93b08STaylor Simpson }
78662e93b08STaylor Simpson 
787*c3679385STaylor Simpson /* gather the elements from the scatter32 buffer using C */
scalar_gather_32(unsigned int * vgather32)78862e93b08STaylor Simpson void scalar_gather_32(unsigned int *vgather32)
78962e93b08STaylor Simpson {
79062e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
79162e93b08STaylor Simpson         vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
79262e93b08STaylor Simpson     }
79362e93b08STaylor Simpson }
79462e93b08STaylor Simpson 
check_gather_32(void)79562e93b08STaylor Simpson void check_gather_32(void)
79662e93b08STaylor Simpson {
79762e93b08STaylor Simpson     memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
79862e93b08STaylor Simpson     scalar_gather_32(vgather32_ref);
79962e93b08STaylor Simpson     check_buffer(__func__, vtcm.vgather32, vgather32_ref,
80062e93b08STaylor Simpson                  MATRIX_SIZE * sizeof(unsigned int));
80162e93b08STaylor Simpson }
80262e93b08STaylor Simpson 
803*c3679385STaylor Simpson /* masked gather the elements from the scatter32 buffer using C */
scalar_gather_32_masked(unsigned int * vgather32)80462e93b08STaylor Simpson void scalar_gather_32_masked(unsigned int *vgather32)
80562e93b08STaylor Simpson {
80662e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
80762e93b08STaylor Simpson         if (word_predicates[i]) {
80862e93b08STaylor Simpson             vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
80962e93b08STaylor Simpson         }
81062e93b08STaylor Simpson     }
81162e93b08STaylor Simpson }
81262e93b08STaylor Simpson 
check_gather_32_masked(void)81362e93b08STaylor Simpson void check_gather_32_masked(void)
81462e93b08STaylor Simpson {
81562e93b08STaylor Simpson     memset(vgather32_ref, gather_32_masked_init(),
81662e93b08STaylor Simpson            MATRIX_SIZE * sizeof(unsigned int));
81762e93b08STaylor Simpson     scalar_gather_32_masked(vgather32_ref);
81862e93b08STaylor Simpson     check_buffer(__func__, vtcm.vgather32,
81962e93b08STaylor Simpson                  vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
82062e93b08STaylor Simpson }
82162e93b08STaylor Simpson 
822*c3679385STaylor Simpson /* gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32(unsigned short * vgather16_32)82362e93b08STaylor Simpson void scalar_gather_16_32(unsigned short *vgather16_32)
82462e93b08STaylor Simpson {
82562e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
82662e93b08STaylor Simpson         vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
82762e93b08STaylor Simpson     }
82862e93b08STaylor Simpson }
82962e93b08STaylor Simpson 
check_gather_16_32(void)83062e93b08STaylor Simpson void check_gather_16_32(void)
83162e93b08STaylor Simpson {
83262e93b08STaylor Simpson     memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
83362e93b08STaylor Simpson     scalar_gather_16_32(vgather16_32_ref);
83462e93b08STaylor Simpson     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
83562e93b08STaylor Simpson                  MATRIX_SIZE * sizeof(unsigned short));
83662e93b08STaylor Simpson }
83762e93b08STaylor Simpson 
838*c3679385STaylor Simpson /* masked gather the elements from the scatter16_32 buffer using C */
scalar_gather_16_32_masked(unsigned short * vgather16_32)83962e93b08STaylor Simpson void scalar_gather_16_32_masked(unsigned short *vgather16_32)
84062e93b08STaylor Simpson {
84162e93b08STaylor Simpson     for (int i = 0; i < MATRIX_SIZE; ++i) {
84262e93b08STaylor Simpson         if (half_predicates[i]) {
84362e93b08STaylor Simpson             vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
84462e93b08STaylor Simpson         }
84562e93b08STaylor Simpson     }
84662e93b08STaylor Simpson 
84762e93b08STaylor Simpson }
84862e93b08STaylor Simpson 
check_gather_16_32_masked(void)84962e93b08STaylor Simpson void check_gather_16_32_masked(void)
85062e93b08STaylor Simpson {
85162e93b08STaylor Simpson     memset(vgather16_32_ref, gather_16_masked_init(),
85262e93b08STaylor Simpson            MATRIX_SIZE * sizeof(unsigned short));
85362e93b08STaylor Simpson     scalar_gather_16_32_masked(vgather16_32_ref);
85462e93b08STaylor Simpson     check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
85562e93b08STaylor Simpson                  MATRIX_SIZE * sizeof(unsigned short));
85662e93b08STaylor Simpson }
85762e93b08STaylor Simpson 
85862e93b08STaylor Simpson /* print scatter16 buffer */
print_scatter16_buffer(void)85962e93b08STaylor Simpson void print_scatter16_buffer(void)
86062e93b08STaylor Simpson {
86162e93b08STaylor Simpson     if (PRINT_DATA) {
86262e93b08STaylor Simpson         printf("\n\nPrinting the 16 bit scatter buffer");
86362e93b08STaylor Simpson 
86462e93b08STaylor Simpson         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
86562e93b08STaylor Simpson             if ((i % MATRIX_SIZE) == 0) {
86662e93b08STaylor Simpson                 printf("\n");
86762e93b08STaylor Simpson             }
86862e93b08STaylor Simpson             for (int j = 0; j < 2; j++) {
86962e93b08STaylor Simpson                 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
87062e93b08STaylor Simpson             }
87162e93b08STaylor Simpson             printf(" ");
87262e93b08STaylor Simpson         }
87362e93b08STaylor Simpson         printf("\n");
87462e93b08STaylor Simpson     }
87562e93b08STaylor Simpson }
87662e93b08STaylor Simpson 
87762e93b08STaylor Simpson /* print the gather 16 buffer */
print_gather_result_16(void)87862e93b08STaylor Simpson void print_gather_result_16(void)
87962e93b08STaylor Simpson {
88062e93b08STaylor Simpson     if (PRINT_DATA) {
88162e93b08STaylor Simpson         printf("\n\nPrinting the 16 bit gather result\n");
88262e93b08STaylor Simpson 
88362e93b08STaylor Simpson         for (int i = 0; i < MATRIX_SIZE; i++) {
88462e93b08STaylor Simpson             for (int j = 0; j < 2; j++) {
88562e93b08STaylor Simpson                 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
88662e93b08STaylor Simpson             }
88762e93b08STaylor Simpson             printf(" ");
88862e93b08STaylor Simpson         }
88962e93b08STaylor Simpson         printf("\n");
89062e93b08STaylor Simpson     }
89162e93b08STaylor Simpson }
89262e93b08STaylor Simpson 
89362e93b08STaylor Simpson /* print the scatter32 buffer */
print_scatter32_buffer(void)89462e93b08STaylor Simpson void print_scatter32_buffer(void)
89562e93b08STaylor Simpson {
89662e93b08STaylor Simpson     if (PRINT_DATA) {
89762e93b08STaylor Simpson         printf("\n\nPrinting the 32 bit scatter buffer");
89862e93b08STaylor Simpson 
89962e93b08STaylor Simpson         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
90062e93b08STaylor Simpson             if ((i % MATRIX_SIZE) == 0) {
90162e93b08STaylor Simpson                 printf("\n");
90262e93b08STaylor Simpson             }
90362e93b08STaylor Simpson             for (int j = 0; j < 4; j++) {
90462e93b08STaylor Simpson                 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
90562e93b08STaylor Simpson             }
90662e93b08STaylor Simpson             printf(" ");
90762e93b08STaylor Simpson         }
90862e93b08STaylor Simpson         printf("\n");
90962e93b08STaylor Simpson     }
91062e93b08STaylor Simpson }
91162e93b08STaylor Simpson 
91262e93b08STaylor Simpson /* print the gather 32 buffer */
print_gather_result_32(void)91362e93b08STaylor Simpson void print_gather_result_32(void)
91462e93b08STaylor Simpson {
91562e93b08STaylor Simpson     if (PRINT_DATA) {
91662e93b08STaylor Simpson         printf("\n\nPrinting the 32 bit gather result\n");
91762e93b08STaylor Simpson 
91862e93b08STaylor Simpson         for (int i = 0; i < MATRIX_SIZE; i++) {
91962e93b08STaylor Simpson             for (int j = 0; j < 4; j++) {
92062e93b08STaylor Simpson                 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
92162e93b08STaylor Simpson             }
92262e93b08STaylor Simpson             printf(" ");
92362e93b08STaylor Simpson         }
92462e93b08STaylor Simpson         printf("\n");
92562e93b08STaylor Simpson     }
92662e93b08STaylor Simpson }
92762e93b08STaylor Simpson 
92862e93b08STaylor Simpson /* print the scatter16_32 buffer */
print_scatter16_32_buffer(void)92962e93b08STaylor Simpson void print_scatter16_32_buffer(void)
93062e93b08STaylor Simpson {
93162e93b08STaylor Simpson     if (PRINT_DATA) {
93262e93b08STaylor Simpson         printf("\n\nPrinting the 16_32 bit scatter buffer");
93362e93b08STaylor Simpson 
93462e93b08STaylor Simpson         for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
93562e93b08STaylor Simpson             if ((i % MATRIX_SIZE) == 0) {
93662e93b08STaylor Simpson                 printf("\n");
93762e93b08STaylor Simpson             }
93862e93b08STaylor Simpson             for (int j = 0; j < 2; j++) {
93962e93b08STaylor Simpson                 printf("%c",
94062e93b08STaylor Simpson                       (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
94162e93b08STaylor Simpson             }
94262e93b08STaylor Simpson             printf(" ");
94362e93b08STaylor Simpson         }
94462e93b08STaylor Simpson         printf("\n");
94562e93b08STaylor Simpson     }
94662e93b08STaylor Simpson }
94762e93b08STaylor Simpson 
94862e93b08STaylor Simpson /* print the gather 16_32 buffer */
print_gather_result_16_32(void)94962e93b08STaylor Simpson void print_gather_result_16_32(void)
95062e93b08STaylor Simpson {
95162e93b08STaylor Simpson     if (PRINT_DATA) {
95262e93b08STaylor Simpson         printf("\n\nPrinting the 16_32 bit gather result\n");
95362e93b08STaylor Simpson 
95462e93b08STaylor Simpson         for (int i = 0; i < MATRIX_SIZE; i++) {
95562e93b08STaylor Simpson             for (int j = 0; j < 2; j++) {
95662e93b08STaylor Simpson                 printf("%c",
95762e93b08STaylor Simpson                        (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
95862e93b08STaylor Simpson             }
95962e93b08STaylor Simpson             printf(" ");
96062e93b08STaylor Simpson         }
96162e93b08STaylor Simpson         printf("\n");
96262e93b08STaylor Simpson     }
96362e93b08STaylor Simpson }
96462e93b08STaylor Simpson 
main()96562e93b08STaylor Simpson int main()
96662e93b08STaylor Simpson {
96762e93b08STaylor Simpson     prefill_vtcm_scratch();
96862e93b08STaylor Simpson 
96962e93b08STaylor Simpson     /* 16 bit elements with 16 bit offsets */
97062e93b08STaylor Simpson     create_offsets_values_preds_16();
97162e93b08STaylor Simpson 
97262e93b08STaylor Simpson     vector_scatter_16();
97362e93b08STaylor Simpson     print_scatter16_buffer();
97462e93b08STaylor Simpson     check_scatter_16();
97562e93b08STaylor Simpson 
97662e93b08STaylor Simpson     vector_gather_16();
97762e93b08STaylor Simpson     print_gather_result_16();
97862e93b08STaylor Simpson     check_gather_16();
97962e93b08STaylor Simpson 
98062e93b08STaylor Simpson     vector_gather_16_masked();
98162e93b08STaylor Simpson     print_gather_result_16();
98262e93b08STaylor Simpson     check_gather_16_masked();
98362e93b08STaylor Simpson 
98462e93b08STaylor Simpson     vector_scatter_16_acc();
98562e93b08STaylor Simpson     print_scatter16_buffer();
98662e93b08STaylor Simpson     check_scatter_16_acc();
98762e93b08STaylor Simpson 
98862e93b08STaylor Simpson     vector_scatter_16_masked();
98962e93b08STaylor Simpson     print_scatter16_buffer();
99062e93b08STaylor Simpson     check_scatter_16_masked();
99162e93b08STaylor Simpson 
99262e93b08STaylor Simpson     /* 32 bit elements with 32 bit offsets */
99362e93b08STaylor Simpson     create_offsets_values_preds_32();
99462e93b08STaylor Simpson 
99562e93b08STaylor Simpson     vector_scatter_32();
99662e93b08STaylor Simpson     print_scatter32_buffer();
99762e93b08STaylor Simpson     check_scatter_32();
99862e93b08STaylor Simpson 
99962e93b08STaylor Simpson     vector_gather_32();
100062e93b08STaylor Simpson     print_gather_result_32();
100162e93b08STaylor Simpson     check_gather_32();
100262e93b08STaylor Simpson 
100362e93b08STaylor Simpson     vector_gather_32_masked();
100462e93b08STaylor Simpson     print_gather_result_32();
100562e93b08STaylor Simpson     check_gather_32_masked();
100662e93b08STaylor Simpson 
100762e93b08STaylor Simpson     vector_scatter_32_acc();
100862e93b08STaylor Simpson     print_scatter32_buffer();
100962e93b08STaylor Simpson     check_scatter_32_acc();
101062e93b08STaylor Simpson 
101162e93b08STaylor Simpson     vector_scatter_32_masked();
101262e93b08STaylor Simpson     print_scatter32_buffer();
101362e93b08STaylor Simpson     check_scatter_32_masked();
101462e93b08STaylor Simpson 
101562e93b08STaylor Simpson     /* 16 bit elements with 32 bit offsets */
101662e93b08STaylor Simpson     create_offsets_values_preds_16_32();
101762e93b08STaylor Simpson 
101862e93b08STaylor Simpson     vector_scatter_16_32();
101962e93b08STaylor Simpson     print_scatter16_32_buffer();
102062e93b08STaylor Simpson     check_scatter_16_32();
102162e93b08STaylor Simpson 
102262e93b08STaylor Simpson     vector_gather_16_32();
102362e93b08STaylor Simpson     print_gather_result_16_32();
102462e93b08STaylor Simpson     check_gather_16_32();
102562e93b08STaylor Simpson 
102662e93b08STaylor Simpson     vector_gather_16_32_masked();
102762e93b08STaylor Simpson     print_gather_result_16_32();
102862e93b08STaylor Simpson     check_gather_16_32_masked();
102962e93b08STaylor Simpson 
103062e93b08STaylor Simpson     vector_scatter_16_32_acc();
103162e93b08STaylor Simpson     print_scatter16_32_buffer();
103262e93b08STaylor Simpson     check_scatter_16_32_acc();
103362e93b08STaylor Simpson 
103462e93b08STaylor Simpson     vector_scatter_16_32_masked();
103562e93b08STaylor Simpson     print_scatter16_32_buffer();
103662e93b08STaylor Simpson     check_scatter_16_32_masked();
103762e93b08STaylor Simpson 
103862e93b08STaylor Simpson     puts(err ? "FAIL" : "PASS");
103962e93b08STaylor Simpson     return err;
104062e93b08STaylor Simpson }
1041