/*
 * Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

int err;

static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
    if (result != expect) {
        printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
               line, i, j, result, expect);
        err++;
    }
}

/* __check also takes the i/j indices, so pass 0/0 for scalar checks */
#define check(RES, EXP) __check(__LINE__, 0, 0, (RES), (EXP))

#define MAX_VEC_SIZE_BYTES 128

typedef union {
    uint64_t ud[MAX_VEC_SIZE_BYTES / 8];
    int64_t   d[MAX_VEC_SIZE_BYTES / 8];
    uint32_t uw[MAX_VEC_SIZE_BYTES / 4];
    int32_t   w[MAX_VEC_SIZE_BYTES / 4];
    uint16_t uh[MAX_VEC_SIZE_BYTES / 2];
    int16_t   h[MAX_VEC_SIZE_BYTES / 2];
    uint8_t  ub[MAX_VEC_SIZE_BYTES / 1];
    int8_t    b[MAX_VEC_SIZE_BYTES / 1];
} MMVector;

#define BUFSIZE 16
#define OUTSIZE 16
#define MASKMOD 3

MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));

#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
static void check_output_##FIELD(int line, size_t num_vectors) \
{ \
    for (int i = 0; i < num_vectors; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
        } \
    } \
}

CHECK_OUTPUT_FUNC(d, 8)
CHECK_OUTPUT_FUNC(w, 4)
CHECK_OUTPUT_FUNC(h, 2)
CHECK_OUTPUT_FUNC(b, 1)

static void init_buffers(void)
{
    int counter0 = 0;
    int counter1 = 17;
    for (int i = 0; i < BUFSIZE; i++) {
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
            buffer0[i].b[j] = counter0++;
            buffer1[i].b[j] = counter1++;
        }
        /*
         * Note: i + j % MASKMOD parses as i + (j % MASKMOD), so only
         * vector 0 gets any zero mask words
         */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
        }
    }
}
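/*
 * The first two tests exercise the HVX load temporaries: a .tmp load
 * is visible only to the other instructions in its packet (afterwards
 * the register keeps its prior value, which test_load_tmp checks),
 * while a .cur load is visible within its packet and the register
 * keeps the loaded value.
 */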
static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp, then use it in the next packet
         * Should get the new value within the same packet and
         * the old value in the next packet
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            "    v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        /*
         * In the packet, v4 = buffer1 + buffer0; the next packet
         * adds back the splat of 1 still in v12
         */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            "    v2.cur = vmem(%0 + #0)\n\t"
            "    vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_aligned(void)
{
    /* Aligned loads ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}

static void test_store_aligned(void)
{
    /* Aligned stores ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /* The unaligned store straddles output[0] and output[1] */
    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}
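/*
 * HVX predicate registers hold one bit per byte; a word compare such
 * as vcmp.eq(Vu.w, Vv.w) sets or clears all four bits of each word
 * lane together.  A conditional vmem store writes only the bytes whose
 * predicate bits are set, so the 0xff fill in test_masked_store()
 * must survive in the disabled lanes.
 */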
static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(output));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1 + #0)\n\t"
                "if (!q0) vmem(%2 + #0) = v5\n\t"    /* Inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1 + #0)\n\t"
                "if (q0) vmem(%2 + #0) = v5\n\t"    /* Non-inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* Mirror the mask expression used in init_buffers() */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *pout = output;

    /* A .new store forwards the value loaded earlier in the same packet */
    asm("{\n\t"
        "    v2 = vmem(%0 + #0)\n\t"
        "    vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_max_temps(void)
{
    void *p0 = buffer0;
    void *pout = output;

    asm("v0 = vmem(%0 + #0)\n\t"
        "v1 = vmem(%0 + #1)\n\t"
        "v2 = vmem(%0 + #2)\n\t"
        "v3 = vmem(%0 + #3)\n\t"
        "v4 = vmem(%0 + #4)\n\t"
        "{\n\t"
        "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
        "    v2.b = vshuffe(v3.b, v2.b)\n\t"
        "    v3.w = vadd(v1.w, v4.w)\n\t"
        "    v4.tmp = vmem(%0 + #5)\n\t"
        "}\n\t"
        "vmem(%1 + #0) = v0\n\t"
        "vmem(%1 + #1) = v1\n\t"
        "vmem(%1 + #2) = v2\n\t"
        "vmem(%1 + #3) = v3\n\t"
        "vmem(%1 + #4) = v4\n\t"
        : : "r"(p0), "r"(pout) : "memory");

    /* The first two vectors come from the vadd-pair instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
        expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
    }
    /* The third vector comes from the vshuffe instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
        expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
                          (buffer0[3].uh[i] & 0xff) << 8;
    }
    /* The fourth vector comes from the vadd-single instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
    }
    /*
     * The fifth vector comes from the load to v4
     * make sure the .tmp is dropped
     */
    expect[4] = buffer0[4];

    check_output_b(__LINE__, 5);
}
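/*
 * The remaining tests are stamped out by macros: VEC_OP1 and VEC_OP2
 * emit the asm for a unary or binary element-wise instruction, the
 * TEST_VEC_OP1 and TEST_VEC_OP2 wrappers loop it over the buffers and
 * compute the expected values with the matching C operator, and
 * TEST_PRED_OP2 does the same for the vector predicate ops, optionally
 * complementing the second predicate via INV.
 */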
#define VEC_OP1(ASM, EL, IN, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
        "vmem(%1 + #0) = v2\n\t" \
        : : "r"(IN), "r"(OUT) : "v2", "memory")

#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
        "vmem(%2 + #0) = v2\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")

#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *pin = buffer0; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP1(ASM, EL, pin, pout); \
        pin += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}

#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP2(ASM, EL, p0, p1, pout); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}

#define THRESHOLD 31

#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
    asm("r4 = #%3\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "v2 = vmem(%0 + #0)\n\t" \
        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
        "r4 = #0xff\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "if (q2) vmem(%2 + #0) = v1\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")

#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    memset(output, 0, sizeof(output)); \
    for (int i = 0; i < BUFSIZE; i++) { \
        PRED_OP2(ASM, p0, p1, pout, INV); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
            bool p0 = (buffer0[i].b[j] > THRESHOLD); \
            bool p1 = (buffer1[i].b[j] > THRESHOLD); \
            if (invert) { \
                expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \
            } else { \
                expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \
            } \
        } \
    } \
    check_output_b(__LINE__, BUFSIZE); \
}

TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")

int main()
{
    init_buffers();

    test_load_tmp();
    test_load_cur();
    test_load_aligned();
    test_load_unaligned();
    test_store_aligned();
    test_store_unaligned();
    test_masked_store(false);
    test_masked_store(true);
    test_new_value_store();
    test_max_temps();

    test_vadd_w();
    test_vadd_h();
    test_vadd_b();
    test_vsub_w();
    test_vsub_h();
    test_vsub_b();
    test_vxor();
    test_vand();
    test_vor();
    test_vnot();

    test_pred_or(false);
    test_pred_or_n(true);
    test_pred_and(false);
    test_pred_and_n(true);
    test_pred_xor(false);

    puts(err ? "FAIL" : "PASS");
    return err ? 1 : 0;
}