xref: /openbmc/qemu/tests/tcg/hexagon/hvx_misc.c (revision b3eb5b86)
1 /*
2  *  Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdint.h>
20 #include <stdbool.h>
21 #include <string.h>
22 #include <limits.h>
23 
/* Number of mismatches recorded so far; main() turns it into PASS/FAIL */
int err;

/*
 * Compare one produced value against its expected value.
 *
 *   line   - source line number printed in the diagnostic
 *   i, j   - indices printed in the diagnostic as [i][j]
 *   result - value that was produced
 *   expect - value that was expected
 *
 * On a mismatch a diagnostic is printed and err is incremented; on a match
 * nothing happens.
 */
static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
    if (result != expect) {
        /*
         * uint64_t is not guaranteed to be unsigned long long, so cast the
         * arguments to match the %llx conversions exactly.
         */
        printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
               line, i, j,
               (unsigned long long)result, (unsigned long long)expect);
        err++;
    }
}

/*
 * Convenience wrapper for checking a single value that has no vector or
 * element position: both indices are reported as 0.
 */
#define check(RES, EXP) __check(__LINE__, 0, 0, RES, EXP)
36 
/* Size in bytes of the storage described by MMVector */
#define MAX_VEC_SIZE_BYTES         128

/*
 * One vector's worth of storage.  The same MAX_VEC_SIZE_BYTES bytes can be
 * viewed as unsigned or signed elements of 8, 4, 2 or 1 bytes.
 */
typedef union {
    /* 8-byte elements */
    uint64_t ud[MAX_VEC_SIZE_BYTES / sizeof(uint64_t)];
    int64_t   d[MAX_VEC_SIZE_BYTES / sizeof(int64_t)];
    /* 4-byte elements */
    uint32_t uw[MAX_VEC_SIZE_BYTES / sizeof(uint32_t)];
    int32_t   w[MAX_VEC_SIZE_BYTES / sizeof(int32_t)];
    /* 2-byte elements */
    uint16_t uh[MAX_VEC_SIZE_BYTES / sizeof(uint16_t)];
    int16_t   h[MAX_VEC_SIZE_BYTES / sizeof(int16_t)];
    /* 1-byte elements */
    uint8_t  ub[MAX_VEC_SIZE_BYTES / sizeof(uint8_t)];
    int8_t    b[MAX_VEC_SIZE_BYTES / sizeof(int8_t)];
} MMVector;
49 
/* Number of vectors in each input buffer and in the mask buffer */
#define BUFSIZE      16
/* Number of vectors in the output and expect buffers */
#define OUTSIZE      16
/* Modulus used when generating the mask pattern (see init_buffers) */
#define MASKMOD      3

/*
 * Global test data, each aligned to MAX_VEC_SIZE_BYTES:
 *   buffer0, buffer1 - input vectors, filled by init_buffers()
 *   mask             - per-word 0/1 pattern, filled by init_buffers()
 *   output           - written by the code under test
 *   expect           - reference result that output is compared against
 */
MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
59 
60 #define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
61 static void check_output_##FIELD(int line, size_t num_vectors) \
62 { \
63     for (int i = 0; i < num_vectors; i++) { \
64         for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
65             __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
66         } \
67     } \
68 }
69 
70 CHECK_OUTPUT_FUNC(d,  8)
71 CHECK_OUTPUT_FUNC(w,  4)
72 CHECK_OUTPUT_FUNC(h,  2)
73 CHECK_OUTPUT_FUNC(b,  1)
74 
75 static void init_buffers(void)
76 {
77     int counter0 = 0;
78     int counter1 = 17;
79     for (int i = 0; i < BUFSIZE; i++) {
80         for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
81             buffer0[i].b[j] = counter0++;
82             buffer1[i].b[j] = counter1++;
83         }
84         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
85             mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
86         }
87     }
88 }
89 
90 static void test_load_tmp(void)
91 {
92     void *p0 = buffer0;
93     void *p1 = buffer1;
94     void *pout = output;
95 
96     for (int i = 0; i < BUFSIZE; i++) {
97         /*
98          * Load into v12 as .tmp, then use it in the next packet
99          * Should get the new value within the same packet and
100          * the old value in the next packet
101          */
102         asm("v3 = vmem(%0 + #0)\n\t"
103             "r1 = #1\n\t"
104             "v12 = vsplat(r1)\n\t"
105             "{\n\t"
106             "    v12.tmp = vmem(%1 + #0)\n\t"
107             "    v4.w = vadd(v12.w, v3.w)\n\t"
108             "}\n\t"
109             "v4.w = vadd(v4.w, v12.w)\n\t"
110             "vmem(%2 + #0) = v4\n\t"
111             : : "r"(p0), "r"(p1), "r"(pout)
112             : "r1", "v12", "v3", "v4", "v6", "memory");
113         p0 += sizeof(MMVector);
114         p1 += sizeof(MMVector);
115         pout += sizeof(MMVector);
116 
117         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
118             expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
119         }
120     }
121 
122     check_output_w(__LINE__, BUFSIZE);
123 }
124 
125 static void test_load_cur(void)
126 {
127     void *p0 = buffer0;
128     void *pout = output;
129 
130     for (int i = 0; i < BUFSIZE; i++) {
131         asm("{\n\t"
132             "    v2.cur = vmem(%0 + #0)\n\t"
133             "    vmem(%1 + #0) = v2\n\t"
134             "}\n\t"
135             : : "r"(p0), "r"(pout) : "v2", "memory");
136         p0 += sizeof(MMVector);
137         pout += sizeof(MMVector);
138 
139         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
140             expect[i].uw[j] = buffer0[i].uw[j];
141         }
142     }
143 
144     check_output_w(__LINE__, BUFSIZE);
145 }
146 
147 static void test_load_aligned(void)
148 {
149     /* Aligned loads ignore the low bits of the address */
150     void *p0 = buffer0;
151     void *pout = output;
152     const size_t offset = 13;
153 
154     p0 += offset;    /* Create an unaligned address */
155     asm("v2 = vmem(%0 + #0)\n\t"
156         "vmem(%1 + #0) = v2\n\t"
157         : : "r"(p0), "r"(pout) : "v2", "memory");
158 
159     expect[0] = buffer0[0];
160 
161     check_output_w(__LINE__, 1);
162 }
163 
164 static void test_load_unaligned(void)
165 {
166     void *p0 = buffer0;
167     void *pout = output;
168     const size_t offset = 12;
169 
170     p0 += offset;    /* Create an unaligned address */
171     asm("v2 = vmemu(%0 + #0)\n\t"
172         "vmem(%1 + #0) = v2\n\t"
173         : : "r"(p0), "r"(pout) : "v2", "memory");
174 
175     memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
176 
177     check_output_w(__LINE__, 1);
178 }
179 
180 static void test_store_aligned(void)
181 {
182     /* Aligned stores ignore the low bits of the address */
183     void *p0 = buffer0;
184     void *pout = output;
185     const size_t offset = 13;
186 
187     pout += offset;    /* Create an unaligned address */
188     asm("v2 = vmem(%0 + #0)\n\t"
189         "vmem(%1 + #0) = v2\n\t"
190         : : "r"(p0), "r"(pout) : "v2", "memory");
191 
192     expect[0] = buffer0[0];
193 
194     check_output_w(__LINE__, 1);
195 }
196 
197 static void test_store_unaligned(void)
198 {
199     void *p0 = buffer0;
200     void *pout = output;
201     const size_t offset = 12;
202 
203     pout += offset;    /* Create an unaligned address */
204     asm("v2 = vmem(%0 + #0)\n\t"
205         "vmemu(%1 + #0) = v2\n\t"
206         : : "r"(p0), "r"(pout) : "v2", "memory");
207 
208     memcpy(expect, buffer0, 2 * sizeof(MMVector));
209     memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
210 
211     check_output_w(__LINE__, 2);
212 }
213 
214 static void test_masked_store(bool invert)
215 {
216     void *p0 = buffer0;
217     void *pmask = mask;
218     void *pout = output;
219 
220     memset(expect, 0xff, sizeof(expect));
221     memset(output, 0xff, sizeof(expect));
222 
223     for (int i = 0; i < BUFSIZE; i++) {
224         if (invert) {
225             asm("r4 = #0\n\t"
226                 "v4 = vsplat(r4)\n\t"
227                 "v5 = vmem(%0 + #0)\n\t"
228                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
229                 "v5 = vmem(%1)\n\t"
230                 "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
231                 : : "r"(pmask), "r"(p0), "r"(pout)
232                 : "r4", "v4", "v5", "q0", "memory");
233         } else {
234             asm("r4 = #0\n\t"
235                 "v4 = vsplat(r4)\n\t"
236                 "v5 = vmem(%0 + #0)\n\t"
237                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
238                 "v5 = vmem(%1)\n\t"
239                 "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
240                 : : "r"(pmask), "r"(p0), "r"(pout)
241                 : "r4", "v4", "v5", "q0", "memory");
242         }
243         p0 += sizeof(MMVector);
244         pmask += sizeof(MMVector);
245         pout += sizeof(MMVector);
246 
247         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
248             if (invert) {
249                 if (i + j % MASKMOD != 0) {
250                     expect[i].w[j] = buffer0[i].w[j];
251                 }
252             } else {
253                 if (i + j % MASKMOD == 0) {
254                     expect[i].w[j] = buffer0[i].w[j];
255                 }
256             }
257         }
258     }
259 
260     check_output_w(__LINE__, BUFSIZE);
261 }
262 
263 static void test_new_value_store(void)
264 {
265     void *p0 = buffer0;
266     void *pout = output;
267 
268     asm("{\n\t"
269         "    v2 = vmem(%0 + #0)\n\t"
270         "    vmem(%1 + #0) = v2.new\n\t"
271         "}\n\t"
272         : : "r"(p0), "r"(pout) : "v2", "memory");
273 
274     expect[0] = buffer0[0];
275 
276     check_output_w(__LINE__, 1);
277 }
278 
279 static void test_max_temps()
280 {
281     void *p0 = buffer0;
282     void *pout = output;
283 
284     asm("v0 = vmem(%0 + #0)\n\t"
285         "v1 = vmem(%0 + #1)\n\t"
286         "v2 = vmem(%0 + #2)\n\t"
287         "v3 = vmem(%0 + #3)\n\t"
288         "v4 = vmem(%0 + #4)\n\t"
289         "{\n\t"
290         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
291         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
292         "    v3.w = vadd(v1.w, v4.w)\n\t"
293         "    v4.tmp = vmem(%0 + #5)\n\t"
294         "}\n\t"
295         "vmem(%1 + #0) = v0\n\t"
296         "vmem(%1 + #1) = v1\n\t"
297         "vmem(%1 + #2) = v2\n\t"
298         "vmem(%1 + #3) = v3\n\t"
299         "vmem(%1 + #4) = v4\n\t"
300         : : "r"(p0), "r"(pout) : "memory");
301 
302         /* The first two vectors come from the vadd-pair instruction */
303         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
304             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
305             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
306         }
307         /* The third vector comes from the vshuffe instruction */
308         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
309             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
310                               (buffer0[3].uh[i] & 0xff) << 8;
311         }
312         /* The fourth vector comes from the vadd-single instruction */
313         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
314             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
315         }
316         /*
317          * The fifth vector comes from the load to v4
318          * make sure the .tmp is dropped
319          */
320         expect[4] = buffer0[4];
321 
322         check_output_b(__LINE__, 5);
323 }
324 
/*
 * Emit asm for a one-operand vector operation.
 *
 *   ASM - instruction mnemonic, pasted into the asm text
 *   EL  - element suffix appended to the register names (may be empty)
 *   IN  - address the input vector is loaded from
 *   OUT - address the result vector is stored to
 *
 * Loads v2 from IN, applies ASM to v2 in place and stores v2 to OUT.
 */
#define VEC_OP1(ASM, EL, IN, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
        "vmem(%1 + #0) = v2\n\t" \
        : : "r"(IN), "r"(OUT) : "v2", "memory")

/*
 * Emit asm for a two-operand vector operation.
 *
 *   ASM      - instruction mnemonic, pasted into the asm text
 *   EL       - element suffix appended to the register names (may be empty)
 *   IN0, IN1 - addresses the two input vectors are loaded from
 *   OUT      - address the result vector is stored to
 *
 * Loads v2 from IN0 and v3 from IN1, computes v2 = ASM(v2, v3) and stores
 * v2 to OUT.
 */
#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
        "vmem(%2 + #0) = v2\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")
337 
/*
 * Generate test_<NAME>(), a test for a one-operand vector instruction.
 *
 *   NAME    - suffix of the generated function name
 *   ASM, EL - mnemonic and element suffix handed to VEC_OP1
 *   FIELD   - MMVector member used for the reference computation and check
 *   FIELDSZ - size in bytes of one FIELD element
 *   OP      - C unary operator that models the instruction
 *
 * Each buffer0 vector is run through the instruction into output[], the
 * reference OP buffer0 is written to expect[], and the two are compared.
 */
#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    for (int vec = 0; vec < BUFSIZE; vec++) { \
        VEC_OP1(ASM, EL, &buffer0[vec], &output[vec]); \
        for (int el = 0; el < MAX_VEC_SIZE_BYTES / FIELDSZ; el++) { \
            expect[vec].FIELD[el] = OP buffer0[vec].FIELD[el]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}
355 
/*
 * Generate test_<NAME>(), a test for a two-operand vector instruction.
 *
 *   NAME    - suffix of the generated function name
 *   ASM, EL - mnemonic and element suffix handed to VEC_OP2
 *   FIELD   - MMVector member used for the reference computation and check
 *   FIELDSZ - size in bytes of one FIELD element
 *   OP      - C binary operator that models the instruction
 *
 * Each buffer0/buffer1 vector pair is run through the instruction into
 * output[], the reference buffer0 OP buffer1 is written to expect[], and the
 * two are compared.
 */
#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    for (int vec = 0; vec < BUFSIZE; vec++) { \
        VEC_OP2(ASM, EL, &buffer0[vec], &buffer1[vec], &output[vec]); \
        for (int el = 0; el < MAX_VEC_SIZE_BYTES / FIELDSZ; el++) { \
            expect[vec].FIELD[el] = \
                buffer0[vec].FIELD[el] OP buffer1[vec].FIELD[el]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}
375 
/* Bytes greater than this value set their predicate bit in PRED_OP2 */
#define THRESHOLD        31

/*
 * Emit asm for a two-operand predicate operation.
 *
 *   ASM      - predicate instruction mnemonic, pasted into the asm text
 *   IN0, IN1 - addresses the two input vectors are loaded from
 *   OUT      - address of the vector that is conditionally written
 *   INV      - string literal placed in front of the second predicate
 *              operand ("" or "!")
 *
 * q0 and q1 are produced by comparing the bytes of IN0 and IN1 against a
 * splat of THRESHOLD (vcmp.gt), q2 = ASM(q0, INV q1), and a vector of 0xff
 * bytes is then stored to OUT under q2.  Bytes not selected by q2 are left
 * as they were in OUT.
 */
#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
    asm("r4 = #%3\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "v2 = vmem(%0 + #0)\n\t" \
        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
        "r4 = #0xff\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "if (q2) vmem(%2 + #0) = v1\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")
391 
/*
 * Generate test_<NAME>(invert), a test for a two-operand predicate
 * instruction.
 *
 *   NAME - suffix of the generated function name
 *   ASM  - predicate instruction mnemonic handed to PRED_OP2
 *   OP   - C operator that models the instruction
 *   INV  - "" or "!", handed to PRED_OP2 for the second predicate operand
 *
 * The generated function's invert argument tells the reference computation
 * whether the second predicate is negated; callers must pass a value that
 * agrees with INV.
 *
 * output[] is zeroed first, so bytes not selected by the resulting predicate
 * stay 0x00 while selected bytes become 0xff.
 */
#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    memset(output, 0, sizeof(output)); \
    for (int i = 0; i < BUFSIZE; i++) { \
        PRED_OP2(ASM, p0, p1, pout, INV); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
            bool gt0 = (buffer0[i].b[j] > THRESHOLD); \
            bool gt1 = (buffer1[i].b[j] > THRESHOLD); \
            bool rhs = invert ? !gt1 : gt1; \
            expect[i].b[j] = (gt0 OP rhs) ? 0xff : 0x00; \
        } \
    } \
    check_output_b(__LINE__, BUFSIZE); \
}
418 
/* Element-wise add and subtract on word, halfword and byte elements */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
/* Bitwise operations: no element suffix, verified as 8-byte elements */
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

/*
 * Predicate operations; the _n variants negate the second predicate operand
 * and are meant to be called with invert == true
 */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
435 
436 static void test_vadduwsat(void)
437 {
438     /*
439      * Test for saturation by adding two numbers that add to more than UINT_MAX
440      * and make sure the result saturates to UINT_MAX
441      */
442     const uint32_t x = 0xffff0000;
443     const uint32_t y = 0x000fffff;
444 
445     memset(expect, 0x12, sizeof(MMVector));
446     memset(output, 0x34, sizeof(MMVector));
447 
448     asm volatile ("v10 = vsplat(%0)\n\t"
449                   "v11 = vsplat(%1)\n\t"
450                   "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
451                   "vmem(%2+#0) = v21\n\t"
452                   : /* no outputs */
453                   : "r"(x), "r"(y), "r"(output)
454                   : "v10", "v11", "v21", "memory");
455 
456     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
457         expect[0].uw[j] = UINT_MAX;
458     }
459 
460     check_output_w(__LINE__, 1);
461 }
462 
463 static void test_vsubuwsat_dv(void)
464 {
465     /*
466      * Test for saturation by subtracting two numbers where the result is
467      * negative and make sure the result saturates to zero
468      *
469      * vsubuwsat_dv operates on an HVX register pair, so we'll have a
470      * pair of subtractions
471      *     w - x < 0
472      *     y - z < 0
473      */
474     const uint32_t w = 0x000000b7;
475     const uint32_t x = 0xffffff4e;
476     const uint32_t y = 0x31fe88e7;
477     const uint32_t z = 0x7fffff79;
478 
479     memset(expect, 0x12, sizeof(MMVector) * 2);
480     memset(output, 0x34, sizeof(MMVector) * 2);
481 
482     asm volatile ("v16 = vsplat(%0)\n\t"
483                   "v17 = vsplat(%1)\n\t"
484                   "v26 = vsplat(%2)\n\t"
485                   "v27 = vsplat(%3)\n\t"
486                   "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
487                   "vmem(%4+#0) = v24\n\t"
488                   "vmem(%4+#1) = v25\n\t"
489                   : /* no outputs */
490                   : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
491                   : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
492 
493     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
494         expect[0].uw[j] = 0x00000000;
495         expect[1].uw[j] = 0x00000000;
496     }
497 
498     check_output_w(__LINE__, 2);
499 }
500 
501 static void test_vshuff(void)
502 {
503     /* Test that vshuff works when the two operands are the same register */
504     const uint32_t splat = 0x089be55c;
505     const uint32_t shuff = 0x454fa926;
506     MMVector v0, v1;
507 
508     memset(expect, 0x12, sizeof(MMVector));
509     memset(output, 0x34, sizeof(MMVector));
510 
511     asm volatile("v25 = vsplat(%0)\n\t"
512                  "vshuff(v25, v25, %1)\n\t"
513                  "vmem(%2 + #0) = v25\n\t"
514                  : /* no outputs */
515                  : "r"(splat), "r"(shuff), "r"(output)
516                  : "v25", "memory");
517 
518     /*
519      * The semantics of Hexagon are the operands are pass-by-value, so create
520      * two copies of the vsplat result.
521      */
522     for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
523         v0.uw[i] = splat;
524         v1.uw[i] = splat;
525     }
526     /* Do the vshuff operation */
527     for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
528         if (shuff & offset) {
529             for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
530                 if (!(k & offset)) {
531                     uint8_t tmp = v0.ub[k];
532                     v0.ub[k] = v1.ub[k + offset];
533                     v1.ub[k + offset] = tmp;
534                 }
535             }
536         }
537     }
538     /* Put the result in the expect buffer for verification */
539     expect[0] = v1;
540 
541     check_output_b(__LINE__, 1);
542 }
543 
544 static void test_load_tmp_predicated(void)
545 {
546     void *p0 = buffer0;
547     void *p1 = buffer1;
548     void *pout = output;
549     bool pred = true;
550 
551     for (int i = 0; i < BUFSIZE; i++) {
552         /*
553          * Load into v12 as .tmp with a predicate
554          * When the predicate is true, we get the vector from buffer1[i]
555          * When the predicate is false, we get a vector of all 1's
556          * Regardless of the predicate, the next packet should have
557          * a vector of all 1's
558          */
559         asm("v3 = vmem(%0 + #0)\n\t"
560             "r1 = #1\n\t"
561             "v12 = vsplat(r1)\n\t"
562             "p1 = !cmp.eq(%3, #0)\n\t"
563             "{\n\t"
564             "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
565             "    v4.w = vadd(v12.w, v3.w)\n\t"
566             "}\n\t"
567             "v4.w = vadd(v4.w, v12.w)\n\t"
568             "vmem(%2 + #0) = v4\n\t"
569             : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
570             : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
571         p0 += sizeof(MMVector);
572         p1 += sizeof(MMVector);
573         pout += sizeof(MMVector);
574 
575         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
576             expect[i].w[j] =
577                 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
578                      : buffer0[i].w[j] + 2;
579         }
580         pred = !pred;
581     }
582 
583     check_output_w(__LINE__, BUFSIZE);
584 }
585 
586 static void test_load_cur_predicated(void)
587 {
588     bool pred = true;
589     for (int i = 0; i < BUFSIZE; i++) {
590         asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
591                      "v3 = vmem(%0+#0)\n\t"
592                      /*
593                       * Preload v4 to make sure that the assignment from the
594                       * packet below is not being ignored when pred is false.
595                       */
596                      "r0 = #0x01237654\n\t"
597                      "v4 = vsplat(r0)\n\t"
598                      "{\n\t"
599                      "    if (p0) v3.cur = vmem(%1+#0)\n\t"
600                      "    v4 = v3\n\t"
601                      "}\n\t"
602                      "vmem(%2+#0) = v4\n\t"
603                      :
604                      : "r"(&buffer0[i]), "r"(&buffer1[i]),
605                        "r"(&output[i]), "r"(pred)
606                      : "r0", "p0", "v3", "v4", "memory");
607         expect[i] = pred ? buffer1[i] : buffer0[i];
608         pred = !pred;
609     }
610     check_output_w(__LINE__, BUFSIZE);
611 }
612 
613 int main()
614 {
615     init_buffers();
616 
617     test_load_tmp();
618     test_load_cur();
619     test_load_aligned();
620     test_load_unaligned();
621     test_store_aligned();
622     test_store_unaligned();
623     test_masked_store(false);
624     test_masked_store(true);
625     test_new_value_store();
626     test_max_temps();
627 
628     test_vadd_w();
629     test_vadd_h();
630     test_vadd_b();
631     test_vsub_w();
632     test_vsub_h();
633     test_vsub_b();
634     test_vxor();
635     test_vand();
636     test_vor();
637     test_vnot();
638 
639     test_pred_or(false);
640     test_pred_or_n(true);
641     test_pred_and(false);
642     test_pred_and_n(true);
643     test_pred_xor(false);
644 
645     test_vadduwsat();
646     test_vsubuwsat_dv();
647 
648     test_vshuff();
649 
650     test_load_tmp_predicated();
651     test_load_cur_predicated();
652 
653     puts(err ? "FAIL" : "PASS");
654     return err ? 1 : 0;
655 }
656