1 /* 2 * Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 #include <stdio.h> 19 #include <stdint.h> 20 #include <stdbool.h> 21 #include <string.h> 22 #include <limits.h> 23 24 int err; 25 26 static void __check(int line, int i, int j, uint64_t result, uint64_t expect) 27 { 28 if (result != expect) { 29 printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n", 30 line, i, j, result, expect); 31 err++; 32 } 33 } 34 35 #define check(RES, EXP) __check(__LINE__, RES, EXP) 36 37 #define MAX_VEC_SIZE_BYTES 128 38 39 typedef union { 40 uint64_t ud[MAX_VEC_SIZE_BYTES / 8]; 41 int64_t d[MAX_VEC_SIZE_BYTES / 8]; 42 uint32_t uw[MAX_VEC_SIZE_BYTES / 4]; 43 int32_t w[MAX_VEC_SIZE_BYTES / 4]; 44 uint16_t uh[MAX_VEC_SIZE_BYTES / 2]; 45 int16_t h[MAX_VEC_SIZE_BYTES / 2]; 46 uint8_t ub[MAX_VEC_SIZE_BYTES / 1]; 47 int8_t b[MAX_VEC_SIZE_BYTES / 1]; 48 } MMVector; 49 50 #define BUFSIZE 16 51 #define OUTSIZE 16 52 #define MASKMOD 3 53 54 MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); 55 MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); 56 MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); 57 MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); 58 MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES))); 59 60 #define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \ 61 static void check_output_##FIELD(int line, size_t num_vectors) \ 62 { \ 63 for (int i = 0; i < num_vectors; i++) { \ 64 for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ 65 __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \ 66 } \ 67 } \ 68 } 69 70 CHECK_OUTPUT_FUNC(d, 8) 71 CHECK_OUTPUT_FUNC(w, 4) 72 CHECK_OUTPUT_FUNC(h, 2) 73 CHECK_OUTPUT_FUNC(b, 1) 74 75 static void init_buffers(void) 76 { 77 int counter0 = 0; 78 int counter1 = 17; 79 for (int i = 0; i < BUFSIZE; i++) { 80 for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { 81 buffer0[i].b[j] = counter0++; 82 buffer1[i].b[j] = counter1++; 83 } 84 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 85 mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1; 86 } 87 } 88 } 89 90 static void test_load_tmp(void) 91 { 92 void *p0 = buffer0; 93 void *p1 = buffer1; 94 void *pout = output; 95 96 for (int i = 0; i < BUFSIZE; i++) { 97 /* 98 * Load into v12 as .tmp, then use it in the next packet 99 * Should get the new value within the same packet and 100 * the old value in the next packet 101 */ 102 asm("v3 = vmem(%0 + #0)\n\t" 103 "r1 = #1\n\t" 104 "v12 = vsplat(r1)\n\t" 105 "{\n\t" 106 " v12.tmp = vmem(%1 + #0)\n\t" 107 " v4.w = vadd(v12.w, v3.w)\n\t" 108 "}\n\t" 109 "v4.w = vadd(v4.w, v12.w)\n\t" 110 "vmem(%2 + #0) = v4\n\t" 111 : : "r"(p0), "r"(p1), "r"(pout) 112 : "r1", "v12", "v3", "v4", "v6", "memory"); 113 p0 += sizeof(MMVector); 114 p1 += sizeof(MMVector); 115 pout += sizeof(MMVector); 116 117 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 118 expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; 119 } 120 } 121 122 check_output_w(__LINE__, BUFSIZE); 123 } 124 125 static void test_load_cur(void) 126 { 127 void *p0 = buffer0; 128 void *pout = output; 129 130 for (int i = 0; i < BUFSIZE; i++) { 131 asm("{\n\t" 132 " v2.cur = vmem(%0 + #0)\n\t" 133 " vmem(%1 + #0) = v2\n\t" 134 "}\n\t" 135 : : "r"(p0), "r"(pout) : "v2", "memory"); 136 p0 += sizeof(MMVector); 137 pout += sizeof(MMVector); 138 139 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 140 expect[i].uw[j] = buffer0[i].uw[j]; 141 } 142 } 143 144 check_output_w(__LINE__, BUFSIZE); 145 } 146 147 static void test_load_aligned(void) 148 { 149 /* Aligned loads ignore the low bits of the address */ 150 void *p0 = buffer0; 151 void *pout = output; 152 const size_t offset = 13; 153 154 p0 += offset; /* Create an unaligned address */ 155 asm("v2 = vmem(%0 + #0)\n\t" 156 "vmem(%1 + #0) = v2\n\t" 157 : : "r"(p0), "r"(pout) : "v2", "memory"); 158 159 expect[0] = buffer0[0]; 160 161 check_output_w(__LINE__, 1); 162 } 163 164 static void test_load_unaligned(void) 165 { 166 void *p0 = buffer0; 167 void *pout = output; 168 const size_t offset = 12; 169 170 p0 += offset; /* Create an unaligned address */ 171 asm("v2 = vmemu(%0 + #0)\n\t" 172 "vmem(%1 + #0) = v2\n\t" 173 : : "r"(p0), "r"(pout) : "v2", "memory"); 174 175 memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector)); 176 177 check_output_w(__LINE__, 1); 178 } 179 180 static void test_store_aligned(void) 181 { 182 /* Aligned stores ignore the low bits of the address */ 183 void *p0 = buffer0; 184 void *pout = output; 185 const size_t offset = 13; 186 187 pout += offset; /* Create an unaligned address */ 188 asm("v2 = vmem(%0 + #0)\n\t" 189 "vmem(%1 + #0) = v2\n\t" 190 : : "r"(p0), "r"(pout) : "v2", "memory"); 191 192 expect[0] = buffer0[0]; 193 194 check_output_w(__LINE__, 1); 195 } 196 197 static void test_store_unaligned(void) 198 { 199 void *p0 = buffer0; 200 void *pout = output; 201 const size_t offset = 12; 202 203 pout += offset; /* Create an unaligned address */ 204 asm("v2 = vmem(%0 + #0)\n\t" 205 "vmemu(%1 + #0) = v2\n\t" 206 : : "r"(p0), "r"(pout) : "v2", "memory"); 207 208 memcpy(expect, buffer0, 2 * sizeof(MMVector)); 209 memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector)); 210 211 check_output_w(__LINE__, 2); 212 } 213 214 static void test_masked_store(bool invert) 215 { 216 void *p0 = buffer0; 217 void *pmask = mask; 218 void *pout = output; 219 220 memset(expect, 0xff, sizeof(expect)); 221 memset(output, 0xff, sizeof(expect)); 222 223 for (int i = 0; i < BUFSIZE; i++) { 224 if (invert) { 225 asm("r4 = #0\n\t" 226 "v4 = vsplat(r4)\n\t" 227 "v5 = vmem(%0 + #0)\n\t" 228 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 229 "v5 = vmem(%1)\n\t" 230 "if (!q0) vmem(%2) = v5\n\t" /* Inverted test */ 231 : : "r"(pmask), "r"(p0), "r"(pout) 232 : "r4", "v4", "v5", "q0", "memory"); 233 } else { 234 asm("r4 = #0\n\t" 235 "v4 = vsplat(r4)\n\t" 236 "v5 = vmem(%0 + #0)\n\t" 237 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 238 "v5 = vmem(%1)\n\t" 239 "if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */ 240 : : "r"(pmask), "r"(p0), "r"(pout) 241 : "r4", "v4", "v5", "q0", "memory"); 242 } 243 p0 += sizeof(MMVector); 244 pmask += sizeof(MMVector); 245 pout += sizeof(MMVector); 246 247 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 248 if (invert) { 249 if (i + j % MASKMOD != 0) { 250 expect[i].w[j] = buffer0[i].w[j]; 251 } 252 } else { 253 if (i + j % MASKMOD == 0) { 254 expect[i].w[j] = buffer0[i].w[j]; 255 } 256 } 257 } 258 } 259 260 check_output_w(__LINE__, BUFSIZE); 261 } 262 263 static void test_new_value_store(void) 264 { 265 void *p0 = buffer0; 266 void *pout = output; 267 268 asm("{\n\t" 269 " v2 = vmem(%0 + #0)\n\t" 270 " vmem(%1 + #0) = v2.new\n\t" 271 "}\n\t" 272 : : "r"(p0), "r"(pout) : "v2", "memory"); 273 274 expect[0] = buffer0[0]; 275 276 check_output_w(__LINE__, 1); 277 } 278 279 static void test_max_temps() 280 { 281 void *p0 = buffer0; 282 void *pout = output; 283 284 asm("v0 = vmem(%0 + #0)\n\t" 285 "v1 = vmem(%0 + #1)\n\t" 286 "v2 = vmem(%0 + #2)\n\t" 287 "v3 = vmem(%0 + #3)\n\t" 288 "v4 = vmem(%0 + #4)\n\t" 289 "{\n\t" 290 " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t" 291 " v2.b = vshuffe(v3.b, v2.b)\n\t" 292 " v3.w = vadd(v1.w, v4.w)\n\t" 293 " v4.tmp = vmem(%0 + #5)\n\t" 294 "}\n\t" 295 "vmem(%1 + #0) = v0\n\t" 296 "vmem(%1 + #1) = v1\n\t" 297 "vmem(%1 + #2) = v2\n\t" 298 "vmem(%1 + #3) = v3\n\t" 299 "vmem(%1 + #4) = v4\n\t" 300 : : "r"(p0), "r"(pout) : "memory"); 301 302 /* The first two vectors come from the vadd-pair instruction */ 303 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 304 expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i]; 305 expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i]; 306 } 307 /* The third vector comes from the vshuffe instruction */ 308 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) { 309 expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) | 310 (buffer0[3].uh[i] & 0xff) << 8; 311 } 312 /* The fourth vector comes from the vadd-single instruction */ 313 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 314 expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i]; 315 } 316 /* 317 * The fifth vector comes from the load to v4 318 * make sure the .tmp is dropped 319 */ 320 expect[4] = buffer0[4]; 321 322 check_output_b(__LINE__, 5); 323 } 324 325 #define VEC_OP1(ASM, EL, IN, OUT) \ 326 asm("v2 = vmem(%0 + #0)\n\t" \ 327 "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \ 328 "vmem(%1 + #0) = v2\n\t" \ 329 : : "r"(IN), "r"(OUT) : "v2", "memory") 330 331 #define VEC_OP2(ASM, EL, IN0, IN1, OUT) \ 332 asm("v2 = vmem(%0 + #0)\n\t" \ 333 "v3 = vmem(%1 + #0)\n\t" \ 334 "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \ 335 "vmem(%2 + #0) = v2\n\t" \ 336 : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory") 337 338 #define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \ 339 static void test_##NAME(void) \ 340 { \ 341 void *pin = buffer0; \ 342 void *pout = output; \ 343 for (int i = 0; i < BUFSIZE; i++) { \ 344 VEC_OP1(ASM, EL, pin, pout); \ 345 pin += sizeof(MMVector); \ 346 pout += sizeof(MMVector); \ 347 } \ 348 for (int i = 0; i < BUFSIZE; i++) { \ 349 for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ 350 expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \ 351 } \ 352 } \ 353 check_output_##FIELD(__LINE__, BUFSIZE); \ 354 } 355 356 #define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \ 357 static void test_##NAME(void) \ 358 { \ 359 void *p0 = buffer0; \ 360 void *p1 = buffer1; \ 361 void *pout = output; \ 362 for (int i = 0; i < BUFSIZE; i++) { \ 363 VEC_OP2(ASM, EL, p0, p1, pout); \ 364 p0 += sizeof(MMVector); \ 365 p1 += sizeof(MMVector); \ 366 pout += sizeof(MMVector); \ 367 } \ 368 for (int i = 0; i < BUFSIZE; i++) { \ 369 for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \ 370 expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \ 371 } \ 372 } \ 373 check_output_##FIELD(__LINE__, BUFSIZE); \ 374 } 375 376 #define THRESHOLD 31 377 378 #define PRED_OP2(ASM, IN0, IN1, OUT, INV) \ 379 asm("r4 = #%3\n\t" \ 380 "v1.b = vsplat(r4)\n\t" \ 381 "v2 = vmem(%0 + #0)\n\t" \ 382 "q0 = vcmp.gt(v2.b, v1.b)\n\t" \ 383 "v3 = vmem(%1 + #0)\n\t" \ 384 "q1 = vcmp.gt(v3.b, v1.b)\n\t" \ 385 "q2 = " #ASM "(q0, " INV "q1)\n\t" \ 386 "r4 = #0xff\n\t" \ 387 "v1.b = vsplat(r4)\n\t" \ 388 "if (q2) vmem(%2 + #0) = v1\n\t" \ 389 : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \ 390 : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory") 391 392 #define TEST_PRED_OP2(NAME, ASM, OP, INV) \ 393 static void test_##NAME(bool invert) \ 394 { \ 395 void *p0 = buffer0; \ 396 void *p1 = buffer1; \ 397 void *pout = output; \ 398 memset(output, 0, sizeof(expect)); \ 399 for (int i = 0; i < BUFSIZE; i++) { \ 400 PRED_OP2(ASM, p0, p1, pout, INV); \ 401 p0 += sizeof(MMVector); \ 402 p1 += sizeof(MMVector); \ 403 pout += sizeof(MMVector); \ 404 } \ 405 for (int i = 0; i < BUFSIZE; i++) { \ 406 for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \ 407 bool p0 = (buffer0[i].b[j] > THRESHOLD); \ 408 bool p1 = (buffer1[i].b[j] > THRESHOLD); \ 409 if (invert) { \ 410 expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \ 411 } else { \ 412 expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \ 413 } \ 414 } \ 415 } \ 416 check_output_b(__LINE__, BUFSIZE); \ 417 } 418 419 TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +) 420 TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +) 421 TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +) 422 TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -) 423 TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -) 424 TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -) 425 TEST_VEC_OP2(vxor, vxor, , d, 8, ^) 426 TEST_VEC_OP2(vand, vand, , d, 8, &) 427 TEST_VEC_OP2(vor, vor, , d, 8, |) 428 TEST_VEC_OP1(vnot, vnot, , d, 8, ~) 429 430 TEST_PRED_OP2(pred_or, or, |, "") 431 TEST_PRED_OP2(pred_or_n, or, |, "!") 432 TEST_PRED_OP2(pred_and, and, &, "") 433 TEST_PRED_OP2(pred_and_n, and, &, "!") 434 TEST_PRED_OP2(pred_xor, xor, ^, "") 435 436 static void test_vadduwsat(void) 437 { 438 /* 439 * Test for saturation by adding two numbers that add to more than UINT_MAX 440 * and make sure the result saturates to UINT_MAX 441 */ 442 const uint32_t x = 0xffff0000; 443 const uint32_t y = 0x000fffff; 444 445 memset(expect, 0x12, sizeof(MMVector)); 446 memset(output, 0x34, sizeof(MMVector)); 447 448 asm volatile ("v10 = vsplat(%0)\n\t" 449 "v11 = vsplat(%1)\n\t" 450 "v21.uw = vadd(v11.uw, v10.uw):sat\n\t" 451 "vmem(%2+#0) = v21\n\t" 452 : /* no outputs */ 453 : "r"(x), "r"(y), "r"(output) 454 : "v10", "v11", "v21", "memory"); 455 456 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 457 expect[0].uw[j] = UINT_MAX; 458 } 459 460 check_output_w(__LINE__, 1); 461 } 462 463 static void test_vsubuwsat_dv(void) 464 { 465 /* 466 * Test for saturation by subtracting two numbers where the result is 467 * negative and make sure the result saturates to zero 468 * 469 * vsubuwsat_dv operates on an HVX register pair, so we'll have a 470 * pair of subtractions 471 * w - x < 0 472 * y - z < 0 473 */ 474 const uint32_t w = 0x000000b7; 475 const uint32_t x = 0xffffff4e; 476 const uint32_t y = 0x31fe88e7; 477 const uint32_t z = 0x7fffff79; 478 479 memset(expect, 0x12, sizeof(MMVector) * 2); 480 memset(output, 0x34, sizeof(MMVector) * 2); 481 482 asm volatile ("v16 = vsplat(%0)\n\t" 483 "v17 = vsplat(%1)\n\t" 484 "v26 = vsplat(%2)\n\t" 485 "v27 = vsplat(%3)\n\t" 486 "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t" 487 "vmem(%4+#0) = v24\n\t" 488 "vmem(%4+#1) = v25\n\t" 489 : /* no outputs */ 490 : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output) 491 : "v16", "v17", "v24", "v25", "v26", "v27", "memory"); 492 493 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 494 expect[0].uw[j] = 0x00000000; 495 expect[1].uw[j] = 0x00000000; 496 } 497 498 check_output_w(__LINE__, 2); 499 } 500 501 static void test_vshuff(void) 502 { 503 /* Test that vshuff works when the two operands are the same register */ 504 const uint32_t splat = 0x089be55c; 505 const uint32_t shuff = 0x454fa926; 506 MMVector v0, v1; 507 508 memset(expect, 0x12, sizeof(MMVector)); 509 memset(output, 0x34, sizeof(MMVector)); 510 511 asm volatile("v25 = vsplat(%0)\n\t" 512 "vshuff(v25, v25, %1)\n\t" 513 "vmem(%2 + #0) = v25\n\t" 514 : /* no outputs */ 515 : "r"(splat), "r"(shuff), "r"(output) 516 : "v25", "memory"); 517 518 /* 519 * The semantics of Hexagon are the operands are pass-by-value, so create 520 * two copies of the vsplat result. 521 */ 522 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 523 v0.uw[i] = splat; 524 v1.uw[i] = splat; 525 } 526 /* Do the vshuff operation */ 527 for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) { 528 if (shuff & offset) { 529 for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) { 530 if (!(k & offset)) { 531 uint8_t tmp = v0.ub[k]; 532 v0.ub[k] = v1.ub[k + offset]; 533 v1.ub[k + offset] = tmp; 534 } 535 } 536 } 537 } 538 /* Put the result in the expect buffer for verification */ 539 expect[0] = v1; 540 541 check_output_b(__LINE__, 1); 542 } 543 544 int main() 545 { 546 init_buffers(); 547 548 test_load_tmp(); 549 test_load_cur(); 550 test_load_aligned(); 551 test_load_unaligned(); 552 test_store_aligned(); 553 test_store_unaligned(); 554 test_masked_store(false); 555 test_masked_store(true); 556 test_new_value_store(); 557 test_max_temps(); 558 559 test_vadd_w(); 560 test_vadd_h(); 561 test_vadd_b(); 562 test_vsub_w(); 563 test_vsub_h(); 564 test_vsub_b(); 565 test_vxor(); 566 test_vand(); 567 test_vor(); 568 test_vnot(); 569 570 test_pred_or(false); 571 test_pred_or_n(true); 572 test_pred_and(false); 573 test_pred_and_n(true); 574 test_pred_xor(false); 575 576 test_vadduwsat(); 577 test_vsubuwsat_dv(); 578 579 test_vshuff(); 580 581 puts(err ? "FAIL" : "PASS"); 582 return err ? 1 : 0; 583 } 584