xref: /openbmc/qemu/tests/tcg/hexagon/hvx_misc.c (revision 51e47cf8)
1 /*
2  *  Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdint.h>
20 #include <stdbool.h>
21 #include <string.h>
22 #include <limits.h>
23 
/*
 * Global failure indicator: main() prints "FAIL" and returns non-zero when
 * this is non-zero.  It is defined ahead of the hvx_misc.h include below,
 * presumably because the check helpers in that header update it -- confirm
 * against hvx_misc.h.
 */
int err;
25 
26 #include "hvx_misc.h"
27 
28 static void test_load_tmp(void)
29 {
30     void *p0 = buffer0;
31     void *p1 = buffer1;
32     void *pout = output;
33 
34     for (int i = 0; i < BUFSIZE; i++) {
35         /*
36          * Load into v12 as .tmp, then use it in the next packet
37          * Should get the new value within the same packet and
38          * the old value in the next packet
39          */
40         asm("v3 = vmem(%0 + #0)\n\t"
41             "r1 = #1\n\t"
42             "v12 = vsplat(r1)\n\t"
43             "{\n\t"
44             "    v12.tmp = vmem(%1 + #0)\n\t"
45             "    v4.w = vadd(v12.w, v3.w)\n\t"
46             "}\n\t"
47             "v4.w = vadd(v4.w, v12.w)\n\t"
48             "vmem(%2 + #0) = v4\n\t"
49             : : "r"(p0), "r"(p1), "r"(pout)
50             : "r1", "v12", "v3", "v4", "v6", "memory");
51         p0 += sizeof(MMVector);
52         p1 += sizeof(MMVector);
53         pout += sizeof(MMVector);
54 
55         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
56             expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
57         }
58     }
59 
60     check_output_w(__LINE__, BUFSIZE);
61 }
62 
63 static void test_load_cur(void)
64 {
65     void *p0 = buffer0;
66     void *pout = output;
67 
68     for (int i = 0; i < BUFSIZE; i++) {
69         asm("{\n\t"
70             "    v2.cur = vmem(%0 + #0)\n\t"
71             "    vmem(%1 + #0) = v2\n\t"
72             "}\n\t"
73             : : "r"(p0), "r"(pout) : "v2", "memory");
74         p0 += sizeof(MMVector);
75         pout += sizeof(MMVector);
76 
77         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
78             expect[i].uw[j] = buffer0[i].uw[j];
79         }
80     }
81 
82     check_output_w(__LINE__, BUFSIZE);
83 }
84 
85 static void test_load_aligned(void)
86 {
87     /* Aligned loads ignore the low bits of the address */
88     void *p0 = buffer0;
89     void *pout = output;
90     const size_t offset = 13;
91 
92     p0 += offset;    /* Create an unaligned address */
93     asm("v2 = vmem(%0 + #0)\n\t"
94         "vmem(%1 + #0) = v2\n\t"
95         : : "r"(p0), "r"(pout) : "v2", "memory");
96 
97     expect[0] = buffer0[0];
98 
99     check_output_w(__LINE__, 1);
100 }
101 
102 static void test_load_unaligned(void)
103 {
104     void *p0 = buffer0;
105     void *pout = output;
106     const size_t offset = 12;
107 
108     p0 += offset;    /* Create an unaligned address */
109     asm("v2 = vmemu(%0 + #0)\n\t"
110         "vmem(%1 + #0) = v2\n\t"
111         : : "r"(p0), "r"(pout) : "v2", "memory");
112 
113     memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
114 
115     check_output_w(__LINE__, 1);
116 }
117 
118 static void test_store_aligned(void)
119 {
120     /* Aligned stores ignore the low bits of the address */
121     void *p0 = buffer0;
122     void *pout = output;
123     const size_t offset = 13;
124 
125     pout += offset;    /* Create an unaligned address */
126     asm("v2 = vmem(%0 + #0)\n\t"
127         "vmem(%1 + #0) = v2\n\t"
128         : : "r"(p0), "r"(pout) : "v2", "memory");
129 
130     expect[0] = buffer0[0];
131 
132     check_output_w(__LINE__, 1);
133 }
134 
135 static void test_store_unaligned(void)
136 {
137     void *p0 = buffer0;
138     void *pout = output;
139     const size_t offset = 12;
140 
141     pout += offset;    /* Create an unaligned address */
142     asm("v2 = vmem(%0 + #0)\n\t"
143         "vmemu(%1 + #0) = v2\n\t"
144         : : "r"(p0), "r"(pout) : "v2", "memory");
145 
146     memcpy(expect, buffer0, 2 * sizeof(MMVector));
147     memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
148 
149     check_output_w(__LINE__, 2);
150 }
151 
152 static void test_masked_store(bool invert)
153 {
154     void *p0 = buffer0;
155     void *pmask = mask;
156     void *pout = output;
157 
158     memset(expect, 0xff, sizeof(expect));
159     memset(output, 0xff, sizeof(expect));
160 
161     for (int i = 0; i < BUFSIZE; i++) {
162         if (invert) {
163             asm("r4 = #0\n\t"
164                 "v4 = vsplat(r4)\n\t"
165                 "v5 = vmem(%0 + #0)\n\t"
166                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
167                 "v5 = vmem(%1)\n\t"
168                 "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
169                 : : "r"(pmask), "r"(p0), "r"(pout)
170                 : "r4", "v4", "v5", "q0", "memory");
171         } else {
172             asm("r4 = #0\n\t"
173                 "v4 = vsplat(r4)\n\t"
174                 "v5 = vmem(%0 + #0)\n\t"
175                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
176                 "v5 = vmem(%1)\n\t"
177                 "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
178                 : : "r"(pmask), "r"(p0), "r"(pout)
179                 : "r4", "v4", "v5", "q0", "memory");
180         }
181         p0 += sizeof(MMVector);
182         pmask += sizeof(MMVector);
183         pout += sizeof(MMVector);
184 
185         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
186             if (invert) {
187                 if (i + j % MASKMOD != 0) {
188                     expect[i].w[j] = buffer0[i].w[j];
189                 }
190             } else {
191                 if (i + j % MASKMOD == 0) {
192                     expect[i].w[j] = buffer0[i].w[j];
193                 }
194             }
195         }
196     }
197 
198     check_output_w(__LINE__, BUFSIZE);
199 }
200 
201 static void test_new_value_store(void)
202 {
203     void *p0 = buffer0;
204     void *pout = output;
205 
206     asm("{\n\t"
207         "    v2 = vmem(%0 + #0)\n\t"
208         "    vmem(%1 + #0) = v2.new\n\t"
209         "}\n\t"
210         : : "r"(p0), "r"(pout) : "v2", "memory");
211 
212     expect[0] = buffer0[0];
213 
214     check_output_w(__LINE__, 1);
215 }
216 
217 static void test_max_temps()
218 {
219     void *p0 = buffer0;
220     void *pout = output;
221 
222     asm("v0 = vmem(%0 + #0)\n\t"
223         "v1 = vmem(%0 + #1)\n\t"
224         "v2 = vmem(%0 + #2)\n\t"
225         "v3 = vmem(%0 + #3)\n\t"
226         "v4 = vmem(%0 + #4)\n\t"
227         "{\n\t"
228         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
229         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
230         "    v3.w = vadd(v1.w, v4.w)\n\t"
231         "    v4.tmp = vmem(%0 + #5)\n\t"
232         "}\n\t"
233         "vmem(%1 + #0) = v0\n\t"
234         "vmem(%1 + #1) = v1\n\t"
235         "vmem(%1 + #2) = v2\n\t"
236         "vmem(%1 + #3) = v3\n\t"
237         "vmem(%1 + #4) = v4\n\t"
238         : : "r"(p0), "r"(pout) : "memory");
239 
240         /* The first two vectors come from the vadd-pair instruction */
241         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
242             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
243             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
244         }
245         /* The third vector comes from the vshuffe instruction */
246         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
247             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
248                               (buffer0[3].uh[i] & 0xff) << 8;
249         }
250         /* The fourth vector comes from the vadd-single instruction */
251         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
252             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
253         }
254         /*
255          * The fifth vector comes from the load to v4
256          * make sure the .tmp is dropped
257          */
258         expect[4] = buffer0[4];
259 
260         check_output_b(__LINE__, 5);
261 }
262 
/*
 * Element-wise vector operation tests.
 *
 * TEST_VEC_OP2/TEST_VEC_OP1 are not defined in this file (presumably they
 * come from hvx_misc.h).  main() calls test_<first argument>() with no
 * arguments for every line below, so each invocation is expected to define
 * such a function.
 * NOTE(review): the remaining arguments look like the instruction mnemonic,
 * the element-type suffix, the MMVector field, the element size in bytes
 * and the C operator used for the reference result -- confirm against the
 * macro definitions.
 */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
273 
/*
 * Vector predicate operation tests.
 *
 * TEST_PRED_OP2 is not defined in this file (presumably it comes from
 * hvx_misc.h).  main() calls test_<first argument>() with a bool argument:
 * false for the variants whose last macro argument is "" and true for the
 * variants whose last macro argument is "!".
 * NOTE(review): the "!" string presumably negates the second predicate
 * operand in the generated instruction -- confirm against the macro
 * definition.
 */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
279 
280 static void test_vadduwsat(void)
281 {
282     /*
283      * Test for saturation by adding two numbers that add to more than UINT_MAX
284      * and make sure the result saturates to UINT_MAX
285      */
286     const uint32_t x = 0xffff0000;
287     const uint32_t y = 0x000fffff;
288 
289     memset(expect, 0x12, sizeof(MMVector));
290     memset(output, 0x34, sizeof(MMVector));
291 
292     asm volatile ("v10 = vsplat(%0)\n\t"
293                   "v11 = vsplat(%1)\n\t"
294                   "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
295                   "vmem(%2+#0) = v21\n\t"
296                   : /* no outputs */
297                   : "r"(x), "r"(y), "r"(output)
298                   : "v10", "v11", "v21", "memory");
299 
300     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
301         expect[0].uw[j] = UINT_MAX;
302     }
303 
304     check_output_w(__LINE__, 1);
305 }
306 
307 static void test_vsubuwsat_dv(void)
308 {
309     /*
310      * Test for saturation by subtracting two numbers where the result is
311      * negative and make sure the result saturates to zero
312      *
313      * vsubuwsat_dv operates on an HVX register pair, so we'll have a
314      * pair of subtractions
315      *     w - x < 0
316      *     y - z < 0
317      */
318     const uint32_t w = 0x000000b7;
319     const uint32_t x = 0xffffff4e;
320     const uint32_t y = 0x31fe88e7;
321     const uint32_t z = 0x7fffff79;
322 
323     memset(expect, 0x12, sizeof(MMVector) * 2);
324     memset(output, 0x34, sizeof(MMVector) * 2);
325 
326     asm volatile ("v16 = vsplat(%0)\n\t"
327                   "v17 = vsplat(%1)\n\t"
328                   "v26 = vsplat(%2)\n\t"
329                   "v27 = vsplat(%3)\n\t"
330                   "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
331                   "vmem(%4+#0) = v24\n\t"
332                   "vmem(%4+#1) = v25\n\t"
333                   : /* no outputs */
334                   : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
335                   : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
336 
337     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
338         expect[0].uw[j] = 0x00000000;
339         expect[1].uw[j] = 0x00000000;
340     }
341 
342     check_output_w(__LINE__, 2);
343 }
344 
345 static void test_vshuff(void)
346 {
347     /* Test that vshuff works when the two operands are the same register */
348     const uint32_t splat = 0x089be55c;
349     const uint32_t shuff = 0x454fa926;
350     MMVector v0, v1;
351 
352     memset(expect, 0x12, sizeof(MMVector));
353     memset(output, 0x34, sizeof(MMVector));
354 
355     asm volatile("v25 = vsplat(%0)\n\t"
356                  "vshuff(v25, v25, %1)\n\t"
357                  "vmem(%2 + #0) = v25\n\t"
358                  : /* no outputs */
359                  : "r"(splat), "r"(shuff), "r"(output)
360                  : "v25", "memory");
361 
362     /*
363      * The semantics of Hexagon are the operands are pass-by-value, so create
364      * two copies of the vsplat result.
365      */
366     for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
367         v0.uw[i] = splat;
368         v1.uw[i] = splat;
369     }
370     /* Do the vshuff operation */
371     for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
372         if (shuff & offset) {
373             for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
374                 if (!(k & offset)) {
375                     uint8_t tmp = v0.ub[k];
376                     v0.ub[k] = v1.ub[k + offset];
377                     v1.ub[k + offset] = tmp;
378                 }
379             }
380         }
381     }
382     /* Put the result in the expect buffer for verification */
383     expect[0] = v1;
384 
385     check_output_b(__LINE__, 1);
386 }
387 
388 static void test_load_tmp_predicated(void)
389 {
390     void *p0 = buffer0;
391     void *p1 = buffer1;
392     void *pout = output;
393     bool pred = true;
394 
395     for (int i = 0; i < BUFSIZE; i++) {
396         /*
397          * Load into v12 as .tmp with a predicate
398          * When the predicate is true, we get the vector from buffer1[i]
399          * When the predicate is false, we get a vector of all 1's
400          * Regardless of the predicate, the next packet should have
401          * a vector of all 1's
402          */
403         asm("v3 = vmem(%0 + #0)\n\t"
404             "r1 = #1\n\t"
405             "v12 = vsplat(r1)\n\t"
406             "p1 = !cmp.eq(%3, #0)\n\t"
407             "{\n\t"
408             "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
409             "    v4.w = vadd(v12.w, v3.w)\n\t"
410             "}\n\t"
411             "v4.w = vadd(v4.w, v12.w)\n\t"
412             "vmem(%2 + #0) = v4\n\t"
413             : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
414             : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
415         p0 += sizeof(MMVector);
416         p1 += sizeof(MMVector);
417         pout += sizeof(MMVector);
418 
419         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
420             expect[i].w[j] =
421                 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
422                      : buffer0[i].w[j] + 2;
423         }
424         pred = !pred;
425     }
426 
427     check_output_w(__LINE__, BUFSIZE);
428 }
429 
430 static void test_load_cur_predicated(void)
431 {
432     bool pred = true;
433     for (int i = 0; i < BUFSIZE; i++) {
434         asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
435                      "v3 = vmem(%0+#0)\n\t"
436                      /*
437                       * Preload v4 to make sure that the assignment from the
438                       * packet below is not being ignored when pred is false.
439                       */
440                      "r0 = #0x01237654\n\t"
441                      "v4 = vsplat(r0)\n\t"
442                      "{\n\t"
443                      "    if (p0) v3.cur = vmem(%1+#0)\n\t"
444                      "    v4 = v3\n\t"
445                      "}\n\t"
446                      "vmem(%2+#0) = v4\n\t"
447                      :
448                      : "r"(&buffer0[i]), "r"(&buffer1[i]),
449                        "r"(&output[i]), "r"(pred)
450                      : "r0", "p0", "v3", "v4", "memory");
451         expect[i] = pred ? buffer1[i] : buffer0[i];
452         pred = !pred;
453     }
454     check_output_w(__LINE__, BUFSIZE);
455 }
456 
457 int main()
458 {
459     init_buffers();
460 
461     test_load_tmp();
462     test_load_cur();
463     test_load_aligned();
464     test_load_unaligned();
465     test_store_aligned();
466     test_store_unaligned();
467     test_masked_store(false);
468     test_masked_store(true);
469     test_new_value_store();
470     test_max_temps();
471 
472     test_vadd_w();
473     test_vadd_h();
474     test_vadd_b();
475     test_vsub_w();
476     test_vsub_h();
477     test_vsub_b();
478     test_vxor();
479     test_vand();
480     test_vor();
481     test_vnot();
482 
483     test_pred_or(false);
484     test_pred_or_n(true);
485     test_pred_and(false);
486     test_pred_and_n(true);
487     test_pred_xor(false);
488 
489     test_vadduwsat();
490     test_vsubuwsat_dv();
491 
492     test_vshuff();
493 
494     test_load_tmp_predicated();
495     test_load_cur_predicated();
496 
497     puts(err ? "FAIL" : "PASS");
498     return err ? 1 : 0;
499 }
500