/*
 * Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
17*6c61d4e1STaylor Simpson
18*6c61d4e1STaylor Simpson #include <stdio.h>
19*6c61d4e1STaylor Simpson #include <stdint.h>
20*6c61d4e1STaylor Simpson #include <stdbool.h>
21*6c61d4e1STaylor Simpson #include <string.h>
22*6c61d4e1STaylor Simpson #include <limits.h>
23*6c61d4e1STaylor Simpson
/*
 * Global failure indicator: main() prints "FAIL" and exits with status 1
 * when this is non-zero.  It is defined before "hvx_misc.h" is included
 * (presumably because the checking helpers in that header update it).
 */
int err = 0;
25*6c61d4e1STaylor Simpson
26*6c61d4e1STaylor Simpson #include "hvx_misc.h"
27*6c61d4e1STaylor Simpson
/*
 * Add the rounding constant for a right shift by SHAMT to VAL, i.e.
 * 1 << (SHAMT - 1), or nothing when SHAMT is not positive.  The shift
 * itself is left to the caller.  The arithmetic is done in (at least)
 * long long.  SHAMT may be evaluated twice.
 */
#define fVROUND(VAL, SHAMT) \
    (((SHAMT) <= 0) ? ((VAL) + 0LL) : ((VAL) + (1LL << ((SHAMT) - 1))))
30*6c61d4e1STaylor Simpson
/*
 * Saturate VAL to an unsigned byte.  A value that already fits in
 * 0..0xff is returned unchanged; otherwise the result is 0 when VAL,
 * truncated to int32_t, is negative and 0xff when it is not.
 * VAL is evaluated more than once.
 */
#define fVSATUB(VAL) \
    ((((VAL) & 0xffLL) != (VAL)) ? \
        ((((int32_t)(VAL)) >= 0) ? 0xff : 0) : \
        (VAL))
35*6c61d4e1STaylor Simpson
/*
 * Saturate VAL to an unsigned halfword.  A value that already fits in
 * 0..0xffff is returned unchanged; otherwise the result is 0 when VAL,
 * truncated to int32_t, is negative and 0xffff when it is not.
 * VAL is evaluated more than once.
 */
#define fVSATUH(VAL) \
    ((((VAL) & 0xffffLL) != (VAL)) ? \
        ((((int32_t)(VAL)) >= 0) ? 0xffff : 0) : \
        (VAL))
40*6c61d4e1STaylor Simpson
test_vasrvuhubrndsat(void)41*6c61d4e1STaylor Simpson static void test_vasrvuhubrndsat(void)
42*6c61d4e1STaylor Simpson {
43*6c61d4e1STaylor Simpson void *p0 = buffer0;
44*6c61d4e1STaylor Simpson void *p1 = buffer1;
45*6c61d4e1STaylor Simpson void *pout = output;
46*6c61d4e1STaylor Simpson
47*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
48*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
49*6c61d4e1STaylor Simpson
50*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE / 2; i++) {
51*6c61d4e1STaylor Simpson asm("v4 = vmem(%0 + #0)\n\t"
52*6c61d4e1STaylor Simpson "v5 = vmem(%0 + #1)\n\t"
53*6c61d4e1STaylor Simpson "v6 = vmem(%1 + #0)\n\t"
54*6c61d4e1STaylor Simpson "v5.ub = vasr(v5:4.uh, v6.ub):rnd:sat\n\t"
55*6c61d4e1STaylor Simpson "vmem(%2) = v5\n\t"
56*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
57*6c61d4e1STaylor Simpson : "v4", "v5", "v6", "memory");
58*6c61d4e1STaylor Simpson p0 += sizeof(MMVector) * 2;
59*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
60*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
61*6c61d4e1STaylor Simpson
62*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
63*6c61d4e1STaylor Simpson int shamt;
64*6c61d4e1STaylor Simpson uint8_t byte0;
65*6c61d4e1STaylor Simpson uint8_t byte1;
66*6c61d4e1STaylor Simpson
67*6c61d4e1STaylor Simpson shamt = buffer1[i].ub[2 * j + 0] & 0x7;
68*6c61d4e1STaylor Simpson byte0 = fVSATUB(fVROUND(buffer0[2 * i + 0].uh[j], shamt) >> shamt);
69*6c61d4e1STaylor Simpson shamt = buffer1[i].ub[2 * j + 1] & 0x7;
70*6c61d4e1STaylor Simpson byte1 = fVSATUB(fVROUND(buffer0[2 * i + 1].uh[j], shamt) >> shamt);
71*6c61d4e1STaylor Simpson expect[i].uh[j] = (byte1 << 8) | (byte0 & 0xff);
72*6c61d4e1STaylor Simpson }
73*6c61d4e1STaylor Simpson }
74*6c61d4e1STaylor Simpson
75*6c61d4e1STaylor Simpson check_output_h(__LINE__, BUFSIZE / 2);
76*6c61d4e1STaylor Simpson }
77*6c61d4e1STaylor Simpson
test_vasrvuhubsat(void)78*6c61d4e1STaylor Simpson static void test_vasrvuhubsat(void)
79*6c61d4e1STaylor Simpson {
80*6c61d4e1STaylor Simpson void *p0 = buffer0;
81*6c61d4e1STaylor Simpson void *p1 = buffer1;
82*6c61d4e1STaylor Simpson void *pout = output;
83*6c61d4e1STaylor Simpson
84*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
85*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
86*6c61d4e1STaylor Simpson
87*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE / 2; i++) {
88*6c61d4e1STaylor Simpson asm("v4 = vmem(%0 + #0)\n\t"
89*6c61d4e1STaylor Simpson "v5 = vmem(%0 + #1)\n\t"
90*6c61d4e1STaylor Simpson "v6 = vmem(%1 + #0)\n\t"
91*6c61d4e1STaylor Simpson "v5.ub = vasr(v5:4.uh, v6.ub):sat\n\t"
92*6c61d4e1STaylor Simpson "vmem(%2) = v5\n\t"
93*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
94*6c61d4e1STaylor Simpson : "v4", "v5", "v6", "memory");
95*6c61d4e1STaylor Simpson p0 += sizeof(MMVector) * 2;
96*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
97*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
98*6c61d4e1STaylor Simpson
99*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
100*6c61d4e1STaylor Simpson int shamt;
101*6c61d4e1STaylor Simpson uint8_t byte0;
102*6c61d4e1STaylor Simpson uint8_t byte1;
103*6c61d4e1STaylor Simpson
104*6c61d4e1STaylor Simpson shamt = buffer1[i].ub[2 * j + 0] & 0x7;
105*6c61d4e1STaylor Simpson byte0 = fVSATUB(buffer0[2 * i + 0].uh[j] >> shamt);
106*6c61d4e1STaylor Simpson shamt = buffer1[i].ub[2 * j + 1] & 0x7;
107*6c61d4e1STaylor Simpson byte1 = fVSATUB(buffer0[2 * i + 1].uh[j] >> shamt);
108*6c61d4e1STaylor Simpson expect[i].uh[j] = (byte1 << 8) | (byte0 & 0xff);
109*6c61d4e1STaylor Simpson }
110*6c61d4e1STaylor Simpson }
111*6c61d4e1STaylor Simpson
112*6c61d4e1STaylor Simpson check_output_h(__LINE__, BUFSIZE / 2);
113*6c61d4e1STaylor Simpson }
114*6c61d4e1STaylor Simpson
test_vasrvwuhrndsat(void)115*6c61d4e1STaylor Simpson static void test_vasrvwuhrndsat(void)
116*6c61d4e1STaylor Simpson {
117*6c61d4e1STaylor Simpson void *p0 = buffer0;
118*6c61d4e1STaylor Simpson void *p1 = buffer1;
119*6c61d4e1STaylor Simpson void *pout = output;
120*6c61d4e1STaylor Simpson
121*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
122*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
123*6c61d4e1STaylor Simpson
124*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE / 2; i++) {
125*6c61d4e1STaylor Simpson asm("v4 = vmem(%0 + #0)\n\t"
126*6c61d4e1STaylor Simpson "v5 = vmem(%0 + #1)\n\t"
127*6c61d4e1STaylor Simpson "v6 = vmem(%1 + #0)\n\t"
128*6c61d4e1STaylor Simpson "v5.uh = vasr(v5:4.w, v6.uh):rnd:sat\n\t"
129*6c61d4e1STaylor Simpson "vmem(%2) = v5\n\t"
130*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
131*6c61d4e1STaylor Simpson : "v4", "v5", "v6", "memory");
132*6c61d4e1STaylor Simpson p0 += sizeof(MMVector) * 2;
133*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
134*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
135*6c61d4e1STaylor Simpson
136*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
137*6c61d4e1STaylor Simpson int shamt;
138*6c61d4e1STaylor Simpson uint16_t half0;
139*6c61d4e1STaylor Simpson uint16_t half1;
140*6c61d4e1STaylor Simpson
141*6c61d4e1STaylor Simpson shamt = buffer1[i].uh[2 * j + 0] & 0xf;
142*6c61d4e1STaylor Simpson half0 = fVSATUH(fVROUND(buffer0[2 * i + 0].w[j], shamt) >> shamt);
143*6c61d4e1STaylor Simpson shamt = buffer1[i].uh[2 * j + 1] & 0xf;
144*6c61d4e1STaylor Simpson half1 = fVSATUH(fVROUND(buffer0[2 * i + 1].w[j], shamt) >> shamt);
145*6c61d4e1STaylor Simpson expect[i].w[j] = (half1 << 16) | (half0 & 0xffff);
146*6c61d4e1STaylor Simpson }
147*6c61d4e1STaylor Simpson }
148*6c61d4e1STaylor Simpson
149*6c61d4e1STaylor Simpson check_output_w(__LINE__, BUFSIZE / 2);
150*6c61d4e1STaylor Simpson }
151*6c61d4e1STaylor Simpson
test_vasrvwuhsat(void)152*6c61d4e1STaylor Simpson static void test_vasrvwuhsat(void)
153*6c61d4e1STaylor Simpson {
154*6c61d4e1STaylor Simpson void *p0 = buffer0;
155*6c61d4e1STaylor Simpson void *p1 = buffer1;
156*6c61d4e1STaylor Simpson void *pout = output;
157*6c61d4e1STaylor Simpson
158*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
159*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
160*6c61d4e1STaylor Simpson
161*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE / 2; i++) {
162*6c61d4e1STaylor Simpson asm("v4 = vmem(%0 + #0)\n\t"
163*6c61d4e1STaylor Simpson "v5 = vmem(%0 + #1)\n\t"
164*6c61d4e1STaylor Simpson "v6 = vmem(%1 + #0)\n\t"
165*6c61d4e1STaylor Simpson "v5.uh = vasr(v5:4.w, v6.uh):sat\n\t"
166*6c61d4e1STaylor Simpson "vmem(%2) = v5\n\t"
167*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
168*6c61d4e1STaylor Simpson : "v4", "v5", "v6", "memory");
169*6c61d4e1STaylor Simpson p0 += sizeof(MMVector) * 2;
170*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
171*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
172*6c61d4e1STaylor Simpson
173*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
174*6c61d4e1STaylor Simpson int shamt;
175*6c61d4e1STaylor Simpson uint16_t half0;
176*6c61d4e1STaylor Simpson uint16_t half1;
177*6c61d4e1STaylor Simpson
178*6c61d4e1STaylor Simpson shamt = buffer1[i].uh[2 * j + 0] & 0xf;
179*6c61d4e1STaylor Simpson half0 = fVSATUH(buffer0[2 * i + 0].w[j] >> shamt);
180*6c61d4e1STaylor Simpson shamt = buffer1[i].uh[2 * j + 1] & 0xf;
181*6c61d4e1STaylor Simpson half1 = fVSATUH(buffer0[2 * i + 1].w[j] >> shamt);
182*6c61d4e1STaylor Simpson expect[i].w[j] = (half1 << 16) | (half0 & 0xffff);
183*6c61d4e1STaylor Simpson }
184*6c61d4e1STaylor Simpson }
185*6c61d4e1STaylor Simpson
186*6c61d4e1STaylor Simpson check_output_w(__LINE__, BUFSIZE / 2);
187*6c61d4e1STaylor Simpson }
188*6c61d4e1STaylor Simpson
test_vassign_tmp(void)189*6c61d4e1STaylor Simpson static void test_vassign_tmp(void)
190*6c61d4e1STaylor Simpson {
191*6c61d4e1STaylor Simpson void *p0 = buffer0;
192*6c61d4e1STaylor Simpson void *pout = output;
193*6c61d4e1STaylor Simpson
194*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
195*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
196*6c61d4e1STaylor Simpson
197*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
198*6c61d4e1STaylor Simpson /*
199*6c61d4e1STaylor Simpson * Assign into v12 as .tmp, then use it in the next packet
200*6c61d4e1STaylor Simpson * Should get the new value within the same packet and
201*6c61d4e1STaylor Simpson * the old value in the next packet
202*6c61d4e1STaylor Simpson */
203*6c61d4e1STaylor Simpson asm("v3 = vmem(%0 + #0)\n\t"
204*6c61d4e1STaylor Simpson "r1 = #1\n\t"
205*6c61d4e1STaylor Simpson "v12 = vsplat(r1)\n\t"
206*6c61d4e1STaylor Simpson "r1 = #2\n\t"
207*6c61d4e1STaylor Simpson "v13 = vsplat(r1)\n\t"
208*6c61d4e1STaylor Simpson "{\n\t"
209*6c61d4e1STaylor Simpson " v12.tmp = v13\n\t"
210*6c61d4e1STaylor Simpson " v4.w = vadd(v12.w, v3.w)\n\t"
211*6c61d4e1STaylor Simpson "}\n\t"
212*6c61d4e1STaylor Simpson "v4.w = vadd(v4.w, v12.w)\n\t"
213*6c61d4e1STaylor Simpson "vmem(%1 + #0) = v4\n\t"
214*6c61d4e1STaylor Simpson : : "r"(p0), "r"(pout)
215*6c61d4e1STaylor Simpson : "r1", "v3", "v4", "v12", "v13", "memory");
216*6c61d4e1STaylor Simpson p0 += sizeof(MMVector);
217*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
218*6c61d4e1STaylor Simpson
219*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
220*6c61d4e1STaylor Simpson expect[i].w[j] = buffer0[i].w[j] + 3;
221*6c61d4e1STaylor Simpson }
222*6c61d4e1STaylor Simpson }
223*6c61d4e1STaylor Simpson
224*6c61d4e1STaylor Simpson check_output_w(__LINE__, BUFSIZE);
225*6c61d4e1STaylor Simpson }
226*6c61d4e1STaylor Simpson
test_vcombine_tmp(void)227*6c61d4e1STaylor Simpson static void test_vcombine_tmp(void)
228*6c61d4e1STaylor Simpson {
229*6c61d4e1STaylor Simpson void *p0 = buffer0;
230*6c61d4e1STaylor Simpson void *p1 = buffer1;
231*6c61d4e1STaylor Simpson void *pout = output;
232*6c61d4e1STaylor Simpson
233*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
234*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
235*6c61d4e1STaylor Simpson
236*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
237*6c61d4e1STaylor Simpson /*
238*6c61d4e1STaylor Simpson * Combine into v13:12 as .tmp, then use it in the next packet
239*6c61d4e1STaylor Simpson * Should get the new value within the same packet and
240*6c61d4e1STaylor Simpson * the old value in the next packet
241*6c61d4e1STaylor Simpson */
242*6c61d4e1STaylor Simpson asm("v3 = vmem(%0 + #0)\n\t"
243*6c61d4e1STaylor Simpson "r1 = #1\n\t"
244*6c61d4e1STaylor Simpson "v12 = vsplat(r1)\n\t"
245*6c61d4e1STaylor Simpson "r1 = #2\n\t"
246*6c61d4e1STaylor Simpson "v13 = vsplat(r1)\n\t"
247*6c61d4e1STaylor Simpson "r1 = #3\n\t"
248*6c61d4e1STaylor Simpson "v14 = vsplat(r1)\n\t"
249*6c61d4e1STaylor Simpson "r1 = #4\n\t"
250*6c61d4e1STaylor Simpson "v15 = vsplat(r1)\n\t"
251*6c61d4e1STaylor Simpson "{\n\t"
252*6c61d4e1STaylor Simpson " v13:12.tmp = vcombine(v15, v14)\n\t"
253*6c61d4e1STaylor Simpson " v4.w = vadd(v12.w, v3.w)\n\t"
254*6c61d4e1STaylor Simpson " v16 = v13\n\t"
255*6c61d4e1STaylor Simpson "}\n\t"
256*6c61d4e1STaylor Simpson "v4.w = vadd(v4.w, v12.w)\n\t"
257*6c61d4e1STaylor Simpson "v4.w = vadd(v4.w, v13.w)\n\t"
258*6c61d4e1STaylor Simpson "v4.w = vadd(v4.w, v16.w)\n\t"
259*6c61d4e1STaylor Simpson "vmem(%2 + #0) = v4\n\t"
260*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
261*6c61d4e1STaylor Simpson : "r1", "v3", "v4", "v12", "v13", "v14", "v15", "v16", "memory");
262*6c61d4e1STaylor Simpson p0 += sizeof(MMVector);
263*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
264*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
265*6c61d4e1STaylor Simpson
266*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
267*6c61d4e1STaylor Simpson expect[i].w[j] = buffer0[i].w[j] + 10;
268*6c61d4e1STaylor Simpson }
269*6c61d4e1STaylor Simpson }
270*6c61d4e1STaylor Simpson
271*6c61d4e1STaylor Simpson check_output_w(__LINE__, BUFSIZE);
272*6c61d4e1STaylor Simpson }
273*6c61d4e1STaylor Simpson
test_vmpyuhvs(void)274*6c61d4e1STaylor Simpson static void test_vmpyuhvs(void)
275*6c61d4e1STaylor Simpson {
276*6c61d4e1STaylor Simpson void *p0 = buffer0;
277*6c61d4e1STaylor Simpson void *p1 = buffer1;
278*6c61d4e1STaylor Simpson void *pout = output;
279*6c61d4e1STaylor Simpson
280*6c61d4e1STaylor Simpson memset(expect, 0xaa, sizeof(expect));
281*6c61d4e1STaylor Simpson memset(output, 0xbb, sizeof(output));
282*6c61d4e1STaylor Simpson
283*6c61d4e1STaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
284*6c61d4e1STaylor Simpson asm("v4 = vmem(%0 + #0)\n\t"
285*6c61d4e1STaylor Simpson "v5 = vmem(%1 + #0)\n\t"
286*6c61d4e1STaylor Simpson "v4.uh = vmpy(V4.uh, v5.uh):>>16\n\t"
287*6c61d4e1STaylor Simpson "vmem(%2) = v4\n\t"
288*6c61d4e1STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
289*6c61d4e1STaylor Simpson : "v4", "v5", "memory");
290*6c61d4e1STaylor Simpson p0 += sizeof(MMVector);
291*6c61d4e1STaylor Simpson p1 += sizeof(MMVector);
292*6c61d4e1STaylor Simpson pout += sizeof(MMVector);
293*6c61d4e1STaylor Simpson
294*6c61d4e1STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 2; j++) {
295*6c61d4e1STaylor Simpson expect[i].uh[j] = (buffer0[i].uh[j] * buffer1[i].uh[j]) >> 16;
296*6c61d4e1STaylor Simpson }
297*6c61d4e1STaylor Simpson }
298*6c61d4e1STaylor Simpson
299*6c61d4e1STaylor Simpson check_output_h(__LINE__, BUFSIZE);
300*6c61d4e1STaylor Simpson }
301*6c61d4e1STaylor Simpson
main()302*6c61d4e1STaylor Simpson int main()
303*6c61d4e1STaylor Simpson {
304*6c61d4e1STaylor Simpson init_buffers();
305*6c61d4e1STaylor Simpson
306*6c61d4e1STaylor Simpson test_vasrvuhubrndsat();
307*6c61d4e1STaylor Simpson test_vasrvuhubsat();
308*6c61d4e1STaylor Simpson test_vasrvwuhrndsat();
309*6c61d4e1STaylor Simpson test_vasrvwuhsat();
310*6c61d4e1STaylor Simpson
311*6c61d4e1STaylor Simpson test_vassign_tmp();
312*6c61d4e1STaylor Simpson test_vcombine_tmp();
313*6c61d4e1STaylor Simpson
314*6c61d4e1STaylor Simpson test_vmpyuhvs();
315*6c61d4e1STaylor Simpson
316*6c61d4e1STaylor Simpson puts(err ? "FAIL" : "PASS");
317*6c61d4e1STaylor Simpson return err ? 1 : 0;
318*6c61d4e1STaylor Simpson }
319