/*
 * Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>

int err;

#include "hvx_misc.h"

static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp, then use it in the next packet
         * Should get the new value within the same packet and
         * the old value in the next packet
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            " v12.tmp = vmem(%1 + #0)\n\t"
            " v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

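/*
 * Write a vector register pair with .tmp (via vcombine), then read one
 * half of the pair in another instruction in the same packet
 * The vmpyo should see the values produced by the vcombine
 */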
static void test_load_tmp2(void)
{
    void *pout0 = &output[0];
    void *pout1 = &output[1];

    asm volatile(
        "r0 = #0x03030303\n\t"
        "v16 = vsplat(r0)\n\t"
        "r0 = #0x04040404\n\t"
        "v18 = vsplat(r0)\n\t"
        "r0 = #0x05050505\n\t"
        "v21 = vsplat(r0)\n\t"
        "{\n\t"
        " v25:24 += vmpyo(v18.w, v14.h)\n\t"
        " v15:14.tmp = vcombine(v21, v16)\n\t"
        "}\n\t"
        "vmem(%0 + #0) = v24\n\t"
        "vmem(%1 + #0) = v25\n\t"
        : : "r"(pout0), "r"(pout1)
        : "r0", "v16", "v18", "v21", "v24", "v25", "memory"
    );

    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) {
        expect[0].w[i] = 0x180c0000;
        expect[1].w[i] = 0x000c1818;
    }

    check_output_w(__LINE__, 2);
}

static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            " v2.cur = vmem(%0 + #0)\n\t"
            " vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_aligned(void)
{
    /* Aligned loads ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}

static void test_store_aligned(void)
{
    /* Aligned stores ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}

static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}

static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(expect));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (!q0) vmem(%2) = v5\n\t"    /* Inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (q0) vmem(%2) = v5\n\t"    /* Non-inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    asm("{\n\t"
        " v2 = vmem(%0 + #0)\n\t"
        " vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);

    /* Test the .new read from the high half of a pair */
    asm("v7 = vmem(%0 + #0)\n\t"
        "v12 = vmem(%1 + #0)\n\t"
        "{\n\t"
        " v5:4 = vcombine(v12, v7)\n\t"
        " vmem(%2 + #0) = v5.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");

    expect[0] = buffer1[0];

    check_output_w(__LINE__, 1);
}

static void test_max_temps()
{
    void *p0 = buffer0;
    void *pout = output;

    asm("v0 = vmem(%0 + #0)\n\t"
        "v1 = vmem(%0 + #1)\n\t"
        "v2 = vmem(%0 + #2)\n\t"
        "v3 = vmem(%0 + #3)\n\t"
        "v4 = vmem(%0 + #4)\n\t"
        "{\n\t"
        " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
        " v2.b = vshuffe(v3.b, v2.b)\n\t"
        " v3.w = vadd(v1.w, v4.w)\n\t"
        " v4.tmp = vmem(%0 + #5)\n\t"
        "}\n\t"
        "vmem(%1 + #0) = v0\n\t"
        "vmem(%1 + #1) = v1\n\t"
        "vmem(%1 + #2) = v2\n\t"
        "vmem(%1 + #3) = v3\n\t"
        "vmem(%1 + #4) = v4\n\t"
        : : "r"(p0), "r"(pout) : "memory");

    /* The first two vectors come from the vadd-pair instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
        expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
    }
    /* The third vector comes from the vshuffe instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
        expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
                          (buffer0[3].uh[i] & 0xff) << 8;
    }
    /* The fourth vector comes from the vadd-single instruction */
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
    }
    /*
     * The fifth vector comes from the load to v4
     * make sure the .tmp is dropped
     */
    expect[4] = buffer0[4];

    check_output_b(__LINE__, 5);
}

TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")

static void test_vadduwsat(void)
{
    /*
     * Test for saturation by adding two numbers that add to more than UINT_MAX
     * and make sure the result saturates to UINT_MAX
     */
    const uint32_t x = 0xffff0000;
    const uint32_t y = 0x000fffff;

    memset(expect, 0x12, sizeof(MMVector));
    memset(output, 0x34, sizeof(MMVector));

    asm volatile ("v10 = vsplat(%0)\n\t"
                  "v11 = vsplat(%1)\n\t"
                  "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
                  "vmem(%2+#0) = v21\n\t"
                  : /* no outputs */
                  : "r"(x), "r"(y), "r"(output)
                  : "v10", "v11", "v21", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = UINT_MAX;
    }

    check_output_w(__LINE__, 1);
}

static void test_vsubuwsat_dv(void)
{
    /*
     * Test for saturation by subtracting two numbers where the result is
     * negative and make sure the result saturates to zero
     *
     * vsubuwsat_dv operates on an HVX register pair, so we'll have a
     * pair of subtractions
     *     w - x < 0
     *     y - z < 0
     */
    const uint32_t w = 0x000000b7;
    const uint32_t x = 0xffffff4e;
    const uint32_t y = 0x31fe88e7;
    const uint32_t z = 0x7fffff79;

    memset(expect, 0x12, sizeof(MMVector) * 2);
    memset(output, 0x34, sizeof(MMVector) * 2);

    asm volatile ("v16 = vsplat(%0)\n\t"
                  "v17 = vsplat(%1)\n\t"
                  "v26 = vsplat(%2)\n\t"
                  "v27 = vsplat(%3)\n\t"
                  "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
                  "vmem(%4+#0) = v24\n\t"
                  "vmem(%4+#1) = v25\n\t"
                  : /* no outputs */
                  : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
                  : "v16", "v17", "v24", "v25", "v26", "v27", "memory");

    for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
        expect[0].uw[j] = 0x00000000;
        expect[1].uw[j] = 0x00000000;
    }

    check_output_w(__LINE__, 2);
}

static void test_load_tmp_predicated(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;
    bool pred = true;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp with a predicate
         * When the predicate is true, we get the vector from buffer1[i]
         * When the predicate is false, we get a vector of all 1's
         * Regardless of the predicate, the next packet should have
         * a vector of all 1's
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "p1 = !cmp.eq(%3, #0)\n\t"
            "{\n\t"
            " if (p1) v12.tmp = vmem(%1 + #0)\n\t"
            " v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
            : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] =
                pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
                     : buffer0[i].w[j] + 2;
        }
        pred = !pred;
    }

    check_output_w(__LINE__, BUFSIZE);
}

static void test_load_cur_predicated(void)
{
    bool pred = true;
    for (int i = 0; i < BUFSIZE; i++) {
        asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
                     "v3 = vmem(%0+#0)\n\t"
                     /*
                      * Preload v4 to make sure that the assignment from the
                      * packet below is not being ignored when pred is false.
                      */
                     "r0 = #0x01237654\n\t"
                     "v4 = vsplat(r0)\n\t"
                     "{\n\t"
                     " if (p0) v3.cur = vmem(%1+#0)\n\t"
                     " v4 = v3\n\t"
                     "}\n\t"
                     "vmem(%2+#0) = v4\n\t"
                     :
                     : "r"(&buffer0[i]), "r"(&buffer1[i]),
                       "r"(&output[i]), "r"(pred)
                     : "r0", "p0", "v3", "v4", "memory");
        expect[i] = pred ? buffer1[i] : buffer0[i];
        pred = !pred;
    }
    check_output_w(__LINE__, BUFSIZE);
}

static void test_vcombine(void)
{
    for (int i = 0; i < BUFSIZE / 2; i++) {
        asm volatile("v2 = vsplat(%0)\n\t"
                     "v3 = vsplat(%1)\n\t"
                     "v3:2 = vcombine(v2, v3)\n\t"
                     "vmem(%2+#0) = v2\n\t"
                     "vmem(%2+#1) = v3\n\t"
                     :
                     : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
                     : "v2", "v3", "memory");
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[2 * i].w[j] = 2 * i + 1;
            expect[2 * i + 1].w[j] = 2 * i;
        }
    }
    check_output_w(__LINE__, BUFSIZE);
}

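/* Store a value produced by vdeal using a .new store in the same packet */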
void test_store_new()
{
    asm volatile(
        "r0 = #0x12345678\n"
        "v0 = vsplat(r0)\n"
        "r0 = #0xff00ff00\n"
        "v1 = vsplat(r0)\n"
        "{\n"
        " vdeal(v1,v0,r0)\n"
        " vmem(%0) = v0.new\n"
        "}\n"
        :
        : "r"(&output[0])
        : "r0", "v0", "v1", "memory"
    );
    for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
        expect[0].w[i] = 0x12345678;
    }
    check_output_w(__LINE__, 1);
}

int main()
{
    init_buffers();

    test_load_tmp();
    test_load_tmp2();
    test_load_cur();
    test_load_aligned();
    test_load_unaligned();
    test_store_aligned();
    test_store_unaligned();
    test_masked_store(false);
    test_masked_store(true);
    test_new_value_store();
    test_max_temps();

    test_vadd_w();
    test_vadd_h();
    test_vadd_b();
    test_vsub_w();
    test_vsub_h();
    test_vsub_b();
    test_vxor();
    test_vand();
    test_vor();
    test_vnot();

    test_pred_or(false);
    test_pred_or_n(true);
    test_pred_and(false);
    test_pred_and_n(true);
    test_pred_xor(false);

    test_vadduwsat();
    test_vsubuwsat_dv();

    test_load_tmp_predicated();
    test_load_cur_predicated();

    test_vcombine();

    test_store_new();

    puts(err ? "FAIL" : "PASS");
    return err ? 1 : 0;
}