/*
 * Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>

int err;

#include "hvx_misc.h"

static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp, then use it in the next packet
         * Should get the new value within the same packet and
         * the old value in the next packet
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            "    v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

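/*
 * Use a .tmp destination on a register pair (vcombine) and consume the
 * low half (v14) in the same packet's vmpyo.  The .tmp value must not
 * persist beyond the packet.
 */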
static void test_load_tmp2(void)
{
    void *pout0 = &output[0];
    void *pout1 = &output[1];

    asm volatile(
        "r0 = #0x03030303\n\t"
        "v16 = vsplat(r0)\n\t"
        "r0 = #0x04040404\n\t"
        "v18 = vsplat(r0)\n\t"
        "r0 = #0x05050505\n\t"
        "v21 = vsplat(r0)\n\t"
        "{\n\t"
        "    v25:24 += vmpyo(v18.w, v14.h)\n\t"
        "    v15:14.tmp = vcombine(v21, v16)\n\t"
        "}\n\t"
        "vmem(%0 + #0) = v24\n\t"
        "vmem(%1 + #0) = v25\n\t"
        : : "r"(pout0), "r"(pout1)
        : "r0", "v16", "v18", "v21", "v24", "v25", "memory"
    );

    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) {
        expect[0].w[i] = 0x180c0000;
        expect[1].w[i] = 0x000c1818;
    }

    check_output_w(__LINE__, 2);
}

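/*
 * A .cur load makes the data available to the other instructions in the
 * same packet, and (unlike .tmp) the register keeps the value afterwards.
 */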
static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            "    v2.cur = vmem(%0 + #0)\n\t"
            "    vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_aligned(void)
{
    /* Aligned loads ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

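/* vmemu performs a true unaligned access, so the offset is honored */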
static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}

static void test_store_aligned(void)
{
    /* Aligned stores ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

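/* An unaligned store straddles two vectors, so check two of them */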
static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}

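/*
 * A store qualified by a vector predicate writes only the byte lanes
 * where the (possibly inverted) predicate is true.
 */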
static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(expect));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (!q0) vmem(%2) = v5\n\t"    /* Inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (q0) vmem(%2) = v5\n\t"    /* Non-inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

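/*
 * A .new store reads its data from the value that another instruction
 * in the same packet writes to the register.
 */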
static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    asm("{\n\t"
        "    v2 = vmem(%0 + #0)\n\t"
        "    vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);

    /* Test the .new read from the high half of a pair */
    asm("v7 = vmem(%0 + #0)\n\t"
        "v12 = vmem(%1 + #0)\n\t"
        "{\n\t"
        "    v5:4 = vcombine(v12, v7)\n\t"
        "    vmem(%2 + #0) = v5.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");

    expect[0] = buffer1[0];

    check_output_w(__LINE__, 1);
}

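/*
 * Pack many vector-register reads and writes, plus a .tmp load, into a
 * single packet (hence "max temps").
 */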
static void test_max_temps(void)
{
    void *p0 = buffer0;
    void *pout = output;

    asm("v0 = vmem(%0 + #0)\n\t"
        "v1 = vmem(%0 + #1)\n\t"
        "v2 = vmem(%0 + #2)\n\t"
        "v3 = vmem(%0 + #3)\n\t"
        "v4 = vmem(%0 + #4)\n\t"
        "{\n\t"
        "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
        "    v2.b = vshuffe(v3.b, v2.b)\n\t"
        "    v3.w = vadd(v1.w, v4.w)\n\t"
        "    v4.tmp = vmem(%0 + #5)\n\t"
        "}\n\t"
        "vmem(%1 + #0) = v0\n\t"
        "vmem(%1 + #1) = v1\n\t"
        "vmem(%1 + #2) = v2\n\t"
        "vmem(%1 + #3) = v3\n\t"
        "vmem(%1 + #4) = v4\n\t"
        : : "r"(p0), "r"(pout) : "memory");

    /* The first two vectors come from the vadd-pair instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
        expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
    }
    /* The third vector comes from the vshuffe instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
        expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
                          (buffer0[3].uh[i] & 0xff) << 8;
    }
    /* The fourth vector comes from the vadd-single instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
    }
    /*
     * The fifth vector comes from the load to v4;
     * make sure the .tmp is dropped
     */
    expect[4] = buffer0[4];

    check_output_b(__LINE__, 5);
}

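/*
 * Instantiate the element-wise ALU and predicate-op tests from the
 * macros in hvx_misc.h.
 */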
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")

static void test_vadduwsat(void)
{
    /*
     * Test for saturation by adding two numbers that add to more than
     * UINT_MAX and make sure the result saturates to UINT_MAX
     */
    const uint32_t x = 0xffff0000;
    const uint32_t y = 0x000fffff;

    memset(expect, 0x12, sizeof(MMVector));
    memset(output, 0x34, sizeof(MMVector));

    asm volatile ("v10 = vsplat(%0)\n\t"
                  "v11 = vsplat(%1)\n\t"
                  "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
                  "vmem(%2+#0) = v21\n\t"
                  : /* no outputs */
                  : "r"(x), "r"(y), "r"(output)
                  : "v10", "v11", "v21", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = UINT_MAX;
    }

    check_output_w(__LINE__, 1);
}

static void test_vsubuwsat_dv(void)
{
    /*
     * Test for saturation by subtracting two numbers where the result is
     * negative and make sure the result saturates to zero
     *
     * vsubuwsat_dv operates on an HVX register pair, so we'll have a
     * pair of subtractions
     *     w - x < 0
     *     y - z < 0
     */
    const uint32_t w = 0x000000b7;
    const uint32_t x = 0xffffff4e;
    const uint32_t y = 0x31fe88e7;
    const uint32_t z = 0x7fffff79;

    memset(expect, 0x12, sizeof(MMVector) * 2);
    memset(output, 0x34, sizeof(MMVector) * 2);

    asm volatile ("v16 = vsplat(%0)\n\t"
                  "v17 = vsplat(%1)\n\t"
                  "v26 = vsplat(%2)\n\t"
                  "v27 = vsplat(%3)\n\t"
                  "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
                  "vmem(%4+#0) = v24\n\t"
                  "vmem(%4+#1) = v25\n\t"
                  : /* no outputs */
                  : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
                  : "v16", "v17", "v24", "v25", "v26", "v27", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = 0x00000000;
        expect[1].uw[j] = 0x00000000;
    }

    check_output_w(__LINE__, 2);
}

static void test_load_tmp_predicated(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;
    bool pred = true;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp with a predicate
         * When the predicate is true, we get the vector from buffer1[i]
         * When the predicate is false, we get a vector of all 1's
         * Regardless of the predicate, the next packet should have
         * a vector of all 1's
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "p1 = !cmp.eq(%3, #0)\n\t"
            "{\n\t"
            "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
            : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] =
                pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
                     : buffer0[i].w[j] + 2;
        }
        pred = !pred;
    }

    check_output_w(__LINE__, BUFSIZE);
}

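/*
 * A predicated .cur load: when the predicate is false, the load does
 * nothing, but the "v4 = v3" in the same packet must still execute.
 */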
static void test_load_cur_predicated(void)
{
    bool pred = true;
    for (int i = 0; i < BUFSIZE; i++) {
        asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
                     "v3 = vmem(%0+#0)\n\t"
                     /*
                      * Preload v4 to make sure that the assignment from the
                      * packet below is not being ignored when pred is false.
                      */
                     "r0 = #0x01237654\n\t"
                     "v4 = vsplat(r0)\n\t"
                     "{\n\t"
                     "    if (p0) v3.cur = vmem(%1+#0)\n\t"
                     "    v4 = v3\n\t"
                     "}\n\t"
                     "vmem(%2+#0) = v4\n\t"
                     :
                     : "r"(&buffer0[i]), "r"(&buffer1[i]),
                       "r"(&output[i]), "r"(pred)
                     : "r0", "p0", "v3", "v4", "memory");
        expect[i] = pred ? buffer1[i] : buffer0[i];
        pred = !pred;
    }
    check_output_w(__LINE__, BUFSIZE);
}

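/*
 * vcombine writes its first source to the high (odd) register of the
 * pair and its second source to the low (even) register, so using the
 * sources as the destination swaps v2 and v3.
 */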
static void test_vcombine(void)
{
    for (int i = 0; i < BUFSIZE / 2; i++) {
        asm volatile("v2 = vsplat(%0)\n\t"
                     "v3 = vsplat(%1)\n\t"
                     "v3:2 = vcombine(v2, v3)\n\t"
                     "vmem(%2+#0) = v2\n\t"
                     "vmem(%2+#1) = v3\n\t"
                     :
                     : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
                     : "v2", "v3", "memory");
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[2 * i].w[j] = 2 * i + 1;
            expect[2 * i + 1].w[j] = 2 * i;
        }
    }
    check_output_w(__LINE__, BUFSIZE);
}

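/*
 * Check that a .new store can forward the result of vdeal, which
 * writes a pair of registers, within the same packet.
 */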
static void test_store_new(void)
{
    asm volatile(
        "r0 = #0x12345678\n"
        "v0 = vsplat(r0)\n"
        "r0 = #0xff00ff00\n"
        "v1 = vsplat(r0)\n"
        "{\n"
        "    vdeal(v1, v0, r0)\n"
        "    vmem(%0) = v0.new\n"
        "}\n"
        :
        : "r"(&output[0])
        : "r0", "v0", "v1", "memory"
    );
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = 0x12345678;
    }
    check_output_w(__LINE__, 1);
}

int main()
{
    init_buffers();

    test_load_tmp();
    test_load_tmp2();
    test_load_cur();
    test_load_aligned();
    test_load_unaligned();
    test_store_aligned();
    test_store_unaligned();
    test_masked_store(false);
    test_masked_store(true);
    test_new_value_store();
    test_max_temps();

    test_vadd_w();
    test_vadd_h();
    test_vadd_b();
    test_vsub_w();
    test_vsub_h();
    test_vsub_b();
    test_vxor();
    test_vand();
    test_vor();
    test_vnot();

    test_pred_or(false);
    test_pred_or_n(true);
    test_pred_and(false);
    test_pred_and_n(true);
    test_pred_xor(false);

    test_vadduwsat();
    test_vsubuwsat_dv();

    test_load_tmp_predicated();
    test_load_cur_predicated();

    test_vcombine();

    test_store_new();

    puts(err ? "FAIL" : "PASS");
    return err ? 1 : 0;
}