xref: /openbmc/qemu/tests/tcg/hexagon/hvx_misc.c (revision b8116f4c)
1 /*
2  *  Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdint.h>
20 #include <stdbool.h>
21 #include <string.h>
22 #include <limits.h>
23 
/*
 * Global failure indicator: main() prints "FAIL" and returns 1 when this is
 * non-zero.  Nothing in this file writes it directly.
 * NOTE(review): it is defined ahead of the hvx_misc.h include, presumably
 * because the checking helpers in that header update it -- confirm.
 */
int err;
25 
26 #include "hvx_misc.h"
27 
28 static void test_load_tmp(void)
29 {
30     void *p0 = buffer0;
31     void *p1 = buffer1;
32     void *pout = output;
33 
34     for (int i = 0; i < BUFSIZE; i++) {
35         /*
36          * Load into v12 as .tmp, then use it in the next packet
37          * Should get the new value within the same packet and
38          * the old value in the next packet
39          */
40         asm("v3 = vmem(%0 + #0)\n\t"
41             "r1 = #1\n\t"
42             "v12 = vsplat(r1)\n\t"
43             "{\n\t"
44             "    v12.tmp = vmem(%1 + #0)\n\t"
45             "    v4.w = vadd(v12.w, v3.w)\n\t"
46             "}\n\t"
47             "v4.w = vadd(v4.w, v12.w)\n\t"
48             "vmem(%2 + #0) = v4\n\t"
49             : : "r"(p0), "r"(p1), "r"(pout)
50             : "r1", "v12", "v3", "v4", "v6", "memory");
51         p0 += sizeof(MMVector);
52         p1 += sizeof(MMVector);
53         pout += sizeof(MMVector);
54 
55         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
56             expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
57         }
58     }
59 
60     check_output_w(__LINE__, BUFSIZE);
61 }
62 
63 static void test_load_tmp2(void)
64 {
65     void *pout0 = &output[0];
66     void *pout1 = &output[1];
67 
68     asm volatile(
69         "r0 = #0x03030303\n\t"
70         "v16 = vsplat(r0)\n\t"
71         "r0 = #0x04040404\n\t"
72         "v18 = vsplat(r0)\n\t"
73         "r0 = #0x05050505\n\t"
74         "v21 = vsplat(r0)\n\t"
75         "{\n\t"
76         "   v25:24 += vmpyo(v18.w, v14.h)\n\t"
77         "   v15:14.tmp = vcombine(v21, v16)\n\t"
78         "}\n\t"
79         "vmem(%0 + #0) = v24\n\t"
80         "vmem(%1 + #0) = v25\n\t"
81         : : "r"(pout0), "r"(pout1)
82         : "r0", "v16", "v18", "v21", "v24", "v25", "memory"
83     );
84 
85     for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) {
86         expect[0].w[i] = 0x180c0000;
87         expect[1].w[i] = 0x000c1818;
88     }
89 
90     check_output_w(__LINE__, 2);
91 }
92 
93 static void test_load_cur(void)
94 {
95     void *p0 = buffer0;
96     void *pout = output;
97 
98     for (int i = 0; i < BUFSIZE; i++) {
99         asm("{\n\t"
100             "    v2.cur = vmem(%0 + #0)\n\t"
101             "    vmem(%1 + #0) = v2\n\t"
102             "}\n\t"
103             : : "r"(p0), "r"(pout) : "v2", "memory");
104         p0 += sizeof(MMVector);
105         pout += sizeof(MMVector);
106 
107         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
108             expect[i].uw[j] = buffer0[i].uw[j];
109         }
110     }
111 
112     check_output_w(__LINE__, BUFSIZE);
113 }
114 
115 static void test_load_aligned(void)
116 {
117     /* Aligned loads ignore the low bits of the address */
118     void *p0 = buffer0;
119     void *pout = output;
120     const size_t offset = 13;
121 
122     p0 += offset;    /* Create an unaligned address */
123     asm("v2 = vmem(%0 + #0)\n\t"
124         "vmem(%1 + #0) = v2\n\t"
125         : : "r"(p0), "r"(pout) : "v2", "memory");
126 
127     expect[0] = buffer0[0];
128 
129     check_output_w(__LINE__, 1);
130 }
131 
132 static void test_load_unaligned(void)
133 {
134     void *p0 = buffer0;
135     void *pout = output;
136     const size_t offset = 12;
137 
138     p0 += offset;    /* Create an unaligned address */
139     asm("v2 = vmemu(%0 + #0)\n\t"
140         "vmem(%1 + #0) = v2\n\t"
141         : : "r"(p0), "r"(pout) : "v2", "memory");
142 
143     memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
144 
145     check_output_w(__LINE__, 1);
146 }
147 
148 static void test_store_aligned(void)
149 {
150     /* Aligned stores ignore the low bits of the address */
151     void *p0 = buffer0;
152     void *pout = output;
153     const size_t offset = 13;
154 
155     pout += offset;    /* Create an unaligned address */
156     asm("v2 = vmem(%0 + #0)\n\t"
157         "vmem(%1 + #0) = v2\n\t"
158         : : "r"(p0), "r"(pout) : "v2", "memory");
159 
160     expect[0] = buffer0[0];
161 
162     check_output_w(__LINE__, 1);
163 }
164 
165 static void test_store_unaligned(void)
166 {
167     void *p0 = buffer0;
168     void *pout = output;
169     const size_t offset = 12;
170 
171     pout += offset;    /* Create an unaligned address */
172     asm("v2 = vmem(%0 + #0)\n\t"
173         "vmemu(%1 + #0) = v2\n\t"
174         : : "r"(p0), "r"(pout) : "v2", "memory");
175 
176     memcpy(expect, buffer0, 2 * sizeof(MMVector));
177     memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
178 
179     check_output_w(__LINE__, 2);
180 }
181 
182 static void test_masked_store(bool invert)
183 {
184     void *p0 = buffer0;
185     void *pmask = mask;
186     void *pout = output;
187 
188     memset(expect, 0xff, sizeof(expect));
189     memset(output, 0xff, sizeof(expect));
190 
191     for (int i = 0; i < BUFSIZE; i++) {
192         if (invert) {
193             asm("r4 = #0\n\t"
194                 "v4 = vsplat(r4)\n\t"
195                 "v5 = vmem(%0 + #0)\n\t"
196                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
197                 "v5 = vmem(%1)\n\t"
198                 "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
199                 : : "r"(pmask), "r"(p0), "r"(pout)
200                 : "r4", "v4", "v5", "q0", "memory");
201         } else {
202             asm("r4 = #0\n\t"
203                 "v4 = vsplat(r4)\n\t"
204                 "v5 = vmem(%0 + #0)\n\t"
205                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
206                 "v5 = vmem(%1)\n\t"
207                 "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
208                 : : "r"(pmask), "r"(p0), "r"(pout)
209                 : "r4", "v4", "v5", "q0", "memory");
210         }
211         p0 += sizeof(MMVector);
212         pmask += sizeof(MMVector);
213         pout += sizeof(MMVector);
214 
215         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
216             if (invert) {
217                 if (i + j % MASKMOD != 0) {
218                     expect[i].w[j] = buffer0[i].w[j];
219                 }
220             } else {
221                 if (i + j % MASKMOD == 0) {
222                     expect[i].w[j] = buffer0[i].w[j];
223                 }
224             }
225         }
226     }
227 
228     check_output_w(__LINE__, BUFSIZE);
229 }
230 
231 static void test_new_value_store(void)
232 {
233     void *p0 = buffer0;
234     void *p1 = buffer1;
235     void *pout = output;
236 
237     asm("{\n\t"
238         "    v2 = vmem(%0 + #0)\n\t"
239         "    vmem(%1 + #0) = v2.new\n\t"
240         "}\n\t"
241         : : "r"(p0), "r"(pout) : "v2", "memory");
242 
243     expect[0] = buffer0[0];
244 
245     check_output_w(__LINE__, 1);
246 
247     /* Test the .new read from the high half of a pair */
248     asm("v7 = vmem(%0 + #0)\n\t"
249         "v12 = vmem(%1 + #0)\n\t"
250         "{\n\t"
251         "    v5:4 = vcombine(v12, v7)\n\t"
252         "    vmem(%2 + #0) = v5.new\n\t"
253         "}\n\t"
254         : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");
255 
256     expect[0] = buffer1[0];
257 
258     check_output_w(__LINE__, 1);
259 }
260 
261 static void test_max_temps()
262 {
263     void *p0 = buffer0;
264     void *pout = output;
265 
266     asm("v0 = vmem(%0 + #0)\n\t"
267         "v1 = vmem(%0 + #1)\n\t"
268         "v2 = vmem(%0 + #2)\n\t"
269         "v3 = vmem(%0 + #3)\n\t"
270         "v4 = vmem(%0 + #4)\n\t"
271         "{\n\t"
272         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
273         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
274         "    v3.w = vadd(v1.w, v4.w)\n\t"
275         "    v4.tmp = vmem(%0 + #5)\n\t"
276         "}\n\t"
277         "vmem(%1 + #0) = v0\n\t"
278         "vmem(%1 + #1) = v1\n\t"
279         "vmem(%1 + #2) = v2\n\t"
280         "vmem(%1 + #3) = v3\n\t"
281         "vmem(%1 + #4) = v4\n\t"
282         : : "r"(p0), "r"(pout) : "memory");
283 
284         /* The first two vectors come from the vadd-pair instruction */
285         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
286             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
287             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
288         }
289         /* The third vector comes from the vshuffe instruction */
290         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
291             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
292                               (buffer0[3].uh[i] & 0xff) << 8;
293         }
294         /* The fourth vector comes from the vadd-single instruction */
295         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
296             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
297         }
298         /*
299          * The fifth vector comes from the load to v4
300          * make sure the .tmp is dropped
301          */
302         expect[4] = buffer0[4];
303 
304         check_output_b(__LINE__, 5);
305 }
306 
/*
 * Element-wise vector operation tests.  TEST_VEC_OP2 and TEST_VEC_OP1 are
 * not defined in this file (presumably they come from hvx_misc.h).  main()
 * calls test_vadd_w() ... test_vnot(), so each invocation is expected to
 * define the function test_<first argument>().
 */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

/*
 * Predicate operation tests.  TEST_PRED_OP2 is likewise defined elsewhere;
 * main() calls the resulting test_pred_*() functions with a bool argument.
 * NOTE(review): the last macro argument ("" or "!") presumably selects the
 * negated form of the instruction -- confirm against the macro definition.
 */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
323 
324 static void test_vadduwsat(void)
325 {
326     /*
327      * Test for saturation by adding two numbers that add to more than UINT_MAX
328      * and make sure the result saturates to UINT_MAX
329      */
330     const uint32_t x = 0xffff0000;
331     const uint32_t y = 0x000fffff;
332 
333     memset(expect, 0x12, sizeof(MMVector));
334     memset(output, 0x34, sizeof(MMVector));
335 
336     asm volatile ("v10 = vsplat(%0)\n\t"
337                   "v11 = vsplat(%1)\n\t"
338                   "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
339                   "vmem(%2+#0) = v21\n\t"
340                   : /* no outputs */
341                   : "r"(x), "r"(y), "r"(output)
342                   : "v10", "v11", "v21", "memory");
343 
344     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
345         expect[0].uw[j] = UINT_MAX;
346     }
347 
348     check_output_w(__LINE__, 1);
349 }
350 
351 static void test_vsubuwsat_dv(void)
352 {
353     /*
354      * Test for saturation by subtracting two numbers where the result is
355      * negative and make sure the result saturates to zero
356      *
357      * vsubuwsat_dv operates on an HVX register pair, so we'll have a
358      * pair of subtractions
359      *     w - x < 0
360      *     y - z < 0
361      */
362     const uint32_t w = 0x000000b7;
363     const uint32_t x = 0xffffff4e;
364     const uint32_t y = 0x31fe88e7;
365     const uint32_t z = 0x7fffff79;
366 
367     memset(expect, 0x12, sizeof(MMVector) * 2);
368     memset(output, 0x34, sizeof(MMVector) * 2);
369 
370     asm volatile ("v16 = vsplat(%0)\n\t"
371                   "v17 = vsplat(%1)\n\t"
372                   "v26 = vsplat(%2)\n\t"
373                   "v27 = vsplat(%3)\n\t"
374                   "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
375                   "vmem(%4+#0) = v24\n\t"
376                   "vmem(%4+#1) = v25\n\t"
377                   : /* no outputs */
378                   : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
379                   : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
380 
381     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
382         expect[0].uw[j] = 0x00000000;
383         expect[1].uw[j] = 0x00000000;
384     }
385 
386     check_output_w(__LINE__, 2);
387 }
388 
389 static void test_load_tmp_predicated(void)
390 {
391     void *p0 = buffer0;
392     void *p1 = buffer1;
393     void *pout = output;
394     bool pred = true;
395 
396     for (int i = 0; i < BUFSIZE; i++) {
397         /*
398          * Load into v12 as .tmp with a predicate
399          * When the predicate is true, we get the vector from buffer1[i]
400          * When the predicate is false, we get a vector of all 1's
401          * Regardless of the predicate, the next packet should have
402          * a vector of all 1's
403          */
404         asm("v3 = vmem(%0 + #0)\n\t"
405             "r1 = #1\n\t"
406             "v12 = vsplat(r1)\n\t"
407             "p1 = !cmp.eq(%3, #0)\n\t"
408             "{\n\t"
409             "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
410             "    v4.w = vadd(v12.w, v3.w)\n\t"
411             "}\n\t"
412             "v4.w = vadd(v4.w, v12.w)\n\t"
413             "vmem(%2 + #0) = v4\n\t"
414             : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
415             : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
416         p0 += sizeof(MMVector);
417         p1 += sizeof(MMVector);
418         pout += sizeof(MMVector);
419 
420         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
421             expect[i].w[j] =
422                 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
423                      : buffer0[i].w[j] + 2;
424         }
425         pred = !pred;
426     }
427 
428     check_output_w(__LINE__, BUFSIZE);
429 }
430 
431 static void test_load_cur_predicated(void)
432 {
433     bool pred = true;
434     for (int i = 0; i < BUFSIZE; i++) {
435         asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
436                      "v3 = vmem(%0+#0)\n\t"
437                      /*
438                       * Preload v4 to make sure that the assignment from the
439                       * packet below is not being ignored when pred is false.
440                       */
441                      "r0 = #0x01237654\n\t"
442                      "v4 = vsplat(r0)\n\t"
443                      "{\n\t"
444                      "    if (p0) v3.cur = vmem(%1+#0)\n\t"
445                      "    v4 = v3\n\t"
446                      "}\n\t"
447                      "vmem(%2+#0) = v4\n\t"
448                      :
449                      : "r"(&buffer0[i]), "r"(&buffer1[i]),
450                        "r"(&output[i]), "r"(pred)
451                      : "r0", "p0", "v3", "v4", "memory");
452         expect[i] = pred ? buffer1[i] : buffer0[i];
453         pred = !pred;
454     }
455     check_output_w(__LINE__, BUFSIZE);
456 }
457 
458 static void test_vcombine(void)
459 {
460     for (int i = 0; i < BUFSIZE / 2; i++) {
461         asm volatile("v2 = vsplat(%0)\n\t"
462                      "v3 = vsplat(%1)\n\t"
463                      "v3:2 = vcombine(v2, v3)\n\t"
464                      "vmem(%2+#0) = v2\n\t"
465                      "vmem(%2+#1) = v3\n\t"
466                      :
467                      : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
468                      : "v2", "v3", "memory");
469         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
470             expect[2 * i].w[j] = 2 * i + 1;
471             expect[2 * i + 1].w[j] = 2 * i;
472         }
473     }
474     check_output_w(__LINE__, BUFSIZE);
475 }
476 
477 int main()
478 {
479     init_buffers();
480 
481     test_load_tmp();
482     test_load_tmp2();
483     test_load_cur();
484     test_load_aligned();
485     test_load_unaligned();
486     test_store_aligned();
487     test_store_unaligned();
488     test_masked_store(false);
489     test_masked_store(true);
490     test_new_value_store();
491     test_max_temps();
492 
493     test_vadd_w();
494     test_vadd_h();
495     test_vadd_b();
496     test_vsub_w();
497     test_vsub_h();
498     test_vsub_b();
499     test_vxor();
500     test_vand();
501     test_vor();
502     test_vnot();
503 
504     test_pred_or(false);
505     test_pred_or_n(true);
506     test_pred_and(false);
507     test_pred_and_n(true);
508     test_pred_xor(false);
509 
510     test_vadduwsat();
511     test_vsubuwsat_dv();
512 
513     test_load_tmp_predicated();
514     test_load_cur_predicated();
515 
516     test_vcombine();
517 
518     puts(err ? "FAIL" : "PASS");
519     return err ? 1 : 0;
520 }
521