1 /* 2 * Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 #include <stdio.h> 19 #include <stdint.h> 20 #include <stdbool.h> 21 #include <string.h> 22 #include <limits.h> 23 24 int err; 25 26 #include "hvx_misc.h" 27 28 static void test_load_tmp(void) 29 { 30 void *p0 = buffer0; 31 void *p1 = buffer1; 32 void *pout = output; 33 34 for (int i = 0; i < BUFSIZE; i++) { 35 /* 36 * Load into v12 as .tmp, then use it in the next packet 37 * Should get the new value within the same packet and 38 * the old value in the next packet 39 */ 40 asm("v3 = vmem(%0 + #0)\n\t" 41 "r1 = #1\n\t" 42 "v12 = vsplat(r1)\n\t" 43 "{\n\t" 44 " v12.tmp = vmem(%1 + #0)\n\t" 45 " v4.w = vadd(v12.w, v3.w)\n\t" 46 "}\n\t" 47 "v4.w = vadd(v4.w, v12.w)\n\t" 48 "vmem(%2 + #0) = v4\n\t" 49 : : "r"(p0), "r"(p1), "r"(pout) 50 : "r1", "v12", "v3", "v4", "v6", "memory"); 51 p0 += sizeof(MMVector); 52 p1 += sizeof(MMVector); 53 pout += sizeof(MMVector); 54 55 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 56 expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; 57 } 58 } 59 60 check_output_w(__LINE__, BUFSIZE); 61 } 62 63 static void test_load_tmp2(void) 64 { 65 void *pout0 = &output[0]; 66 void *pout1 = &output[1]; 67 68 asm volatile( 69 "r0 = #0x03030303\n\t" 70 "v16 = vsplat(r0)\n\t" 71 "r0 = #0x04040404\n\t" 72 "v18 = vsplat(r0)\n\t" 73 "r0 = #0x05050505\n\t" 74 "v21 = vsplat(r0)\n\t" 75 "{\n\t" 76 " v25:24 += vmpyo(v18.w, v14.h)\n\t" 77 " v15:14.tmp = vcombine(v21, v16)\n\t" 78 "}\n\t" 79 "vmem(%0 + #0) = v24\n\t" 80 "vmem(%1 + #0) = v25\n\t" 81 : : "r"(pout0), "r"(pout1) 82 : "r0", "v16", "v18", "v21", "v24", "v25", "memory" 83 ); 84 85 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) { 86 expect[0].w[i] = 0x180c0000; 87 expect[1].w[i] = 0x000c1818; 88 } 89 90 check_output_w(__LINE__, 2); 91 } 92 93 static void test_load_cur(void) 94 { 95 void *p0 = buffer0; 96 void *pout = output; 97 98 for (int i = 0; i < BUFSIZE; i++) { 99 asm("{\n\t" 100 " v2.cur = vmem(%0 + #0)\n\t" 101 " vmem(%1 + #0) = v2\n\t" 102 "}\n\t" 103 : : "r"(p0), "r"(pout) : "v2", "memory"); 104 p0 += sizeof(MMVector); 105 pout += sizeof(MMVector); 106 107 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 108 expect[i].uw[j] = buffer0[i].uw[j]; 109 } 110 } 111 112 check_output_w(__LINE__, BUFSIZE); 113 } 114 115 static void test_load_aligned(void) 116 { 117 /* Aligned loads ignore the low bits of the address */ 118 void *p0 = buffer0; 119 void *pout = output; 120 const size_t offset = 13; 121 122 p0 += offset; /* Create an unaligned address */ 123 asm("v2 = vmem(%0 + #0)\n\t" 124 "vmem(%1 + #0) = v2\n\t" 125 : : "r"(p0), "r"(pout) : "v2", "memory"); 126 127 expect[0] = buffer0[0]; 128 129 check_output_w(__LINE__, 1); 130 } 131 132 static void test_load_unaligned(void) 133 { 134 void *p0 = buffer0; 135 void *pout = output; 136 const size_t offset = 12; 137 138 p0 += offset; /* Create an unaligned address */ 139 asm("v2 = vmemu(%0 + #0)\n\t" 140 "vmem(%1 + #0) = v2\n\t" 141 : : "r"(p0), "r"(pout) : "v2", "memory"); 142 143 memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector)); 144 145 check_output_w(__LINE__, 1); 146 } 147 148 static void test_store_aligned(void) 149 { 150 /* Aligned stores ignore the low bits of the address */ 151 void *p0 = buffer0; 152 void *pout = output; 153 const size_t offset = 13; 154 155 pout += offset; /* Create an unaligned address */ 156 asm("v2 = vmem(%0 + #0)\n\t" 157 "vmem(%1 + #0) = v2\n\t" 158 : : "r"(p0), "r"(pout) : "v2", "memory"); 159 160 expect[0] = buffer0[0]; 161 162 check_output_w(__LINE__, 1); 163 } 164 165 static void test_store_unaligned(void) 166 { 167 void *p0 = buffer0; 168 void *pout = output; 169 const size_t offset = 12; 170 171 pout += offset; /* Create an unaligned address */ 172 asm("v2 = vmem(%0 + #0)\n\t" 173 "vmemu(%1 + #0) = v2\n\t" 174 : : "r"(p0), "r"(pout) : "v2", "memory"); 175 176 memcpy(expect, buffer0, 2 * sizeof(MMVector)); 177 memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector)); 178 179 check_output_w(__LINE__, 2); 180 } 181 182 static void test_masked_store(bool invert) 183 { 184 void *p0 = buffer0; 185 void *pmask = mask; 186 void *pout = output; 187 188 memset(expect, 0xff, sizeof(expect)); 189 memset(output, 0xff, sizeof(expect)); 190 191 for (int i = 0; i < BUFSIZE; i++) { 192 if (invert) { 193 asm("r4 = #0\n\t" 194 "v4 = vsplat(r4)\n\t" 195 "v5 = vmem(%0 + #0)\n\t" 196 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 197 "v5 = vmem(%1)\n\t" 198 "if (!q0) vmem(%2) = v5\n\t" /* Inverted test */ 199 : : "r"(pmask), "r"(p0), "r"(pout) 200 : "r4", "v4", "v5", "q0", "memory"); 201 } else { 202 asm("r4 = #0\n\t" 203 "v4 = vsplat(r4)\n\t" 204 "v5 = vmem(%0 + #0)\n\t" 205 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 206 "v5 = vmem(%1)\n\t" 207 "if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */ 208 : : "r"(pmask), "r"(p0), "r"(pout) 209 : "r4", "v4", "v5", "q0", "memory"); 210 } 211 p0 += sizeof(MMVector); 212 pmask += sizeof(MMVector); 213 pout += sizeof(MMVector); 214 215 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 216 if (invert) { 217 if (i + j % MASKMOD != 0) { 218 expect[i].w[j] = buffer0[i].w[j]; 219 } 220 } else { 221 if (i + j % MASKMOD == 0) { 222 expect[i].w[j] = buffer0[i].w[j]; 223 } 224 } 225 } 226 } 227 228 check_output_w(__LINE__, BUFSIZE); 229 } 230 231 static void test_new_value_store(void) 232 { 233 void *p0 = buffer0; 234 void *pout = output; 235 236 asm("{\n\t" 237 " v2 = vmem(%0 + #0)\n\t" 238 " vmem(%1 + #0) = v2.new\n\t" 239 "}\n\t" 240 : : "r"(p0), "r"(pout) : "v2", "memory"); 241 242 expect[0] = buffer0[0]; 243 244 check_output_w(__LINE__, 1); 245 } 246 247 static void test_max_temps() 248 { 249 void *p0 = buffer0; 250 void *pout = output; 251 252 asm("v0 = vmem(%0 + #0)\n\t" 253 "v1 = vmem(%0 + #1)\n\t" 254 "v2 = vmem(%0 + #2)\n\t" 255 "v3 = vmem(%0 + #3)\n\t" 256 "v4 = vmem(%0 + #4)\n\t" 257 "{\n\t" 258 " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t" 259 " v2.b = vshuffe(v3.b, v2.b)\n\t" 260 " v3.w = vadd(v1.w, v4.w)\n\t" 261 " v4.tmp = vmem(%0 + #5)\n\t" 262 "}\n\t" 263 "vmem(%1 + #0) = v0\n\t" 264 "vmem(%1 + #1) = v1\n\t" 265 "vmem(%1 + #2) = v2\n\t" 266 "vmem(%1 + #3) = v3\n\t" 267 "vmem(%1 + #4) = v4\n\t" 268 : : "r"(p0), "r"(pout) : "memory"); 269 270 /* The first two vectors come from the vadd-pair instruction */ 271 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 272 expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i]; 273 expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i]; 274 } 275 /* The third vector comes from the vshuffe instruction */ 276 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) { 277 expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) | 278 (buffer0[3].uh[i] & 0xff) << 8; 279 } 280 /* The fourth vector comes from the vadd-single instruction */ 281 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 282 expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i]; 283 } 284 /* 285 * The fifth vector comes from the load to v4 286 * make sure the .tmp is dropped 287 */ 288 expect[4] = buffer0[4]; 289 290 check_output_b(__LINE__, 5); 291 } 292 293 TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +) 294 TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +) 295 TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +) 296 TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -) 297 TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -) 298 TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -) 299 TEST_VEC_OP2(vxor, vxor, , d, 8, ^) 300 TEST_VEC_OP2(vand, vand, , d, 8, &) 301 TEST_VEC_OP2(vor, vor, , d, 8, |) 302 TEST_VEC_OP1(vnot, vnot, , d, 8, ~) 303 304 TEST_PRED_OP2(pred_or, or, |, "") 305 TEST_PRED_OP2(pred_or_n, or, |, "!") 306 TEST_PRED_OP2(pred_and, and, &, "") 307 TEST_PRED_OP2(pred_and_n, and, &, "!") 308 TEST_PRED_OP2(pred_xor, xor, ^, "") 309 310 static void test_vadduwsat(void) 311 { 312 /* 313 * Test for saturation by adding two numbers that add to more than UINT_MAX 314 * and make sure the result saturates to UINT_MAX 315 */ 316 const uint32_t x = 0xffff0000; 317 const uint32_t y = 0x000fffff; 318 319 memset(expect, 0x12, sizeof(MMVector)); 320 memset(output, 0x34, sizeof(MMVector)); 321 322 asm volatile ("v10 = vsplat(%0)\n\t" 323 "v11 = vsplat(%1)\n\t" 324 "v21.uw = vadd(v11.uw, v10.uw):sat\n\t" 325 "vmem(%2+#0) = v21\n\t" 326 : /* no outputs */ 327 : "r"(x), "r"(y), "r"(output) 328 : "v10", "v11", "v21", "memory"); 329 330 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 331 expect[0].uw[j] = UINT_MAX; 332 } 333 334 check_output_w(__LINE__, 1); 335 } 336 337 static void test_vsubuwsat_dv(void) 338 { 339 /* 340 * Test for saturation by subtracting two numbers where the result is 341 * negative and make sure the result saturates to zero 342 * 343 * vsubuwsat_dv operates on an HVX register pair, so we'll have a 344 * pair of subtractions 345 * w - x < 0 346 * y - z < 0 347 */ 348 const uint32_t w = 0x000000b7; 349 const uint32_t x = 0xffffff4e; 350 const uint32_t y = 0x31fe88e7; 351 const uint32_t z = 0x7fffff79; 352 353 memset(expect, 0x12, sizeof(MMVector) * 2); 354 memset(output, 0x34, sizeof(MMVector) * 2); 355 356 asm volatile ("v16 = vsplat(%0)\n\t" 357 "v17 = vsplat(%1)\n\t" 358 "v26 = vsplat(%2)\n\t" 359 "v27 = vsplat(%3)\n\t" 360 "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t" 361 "vmem(%4+#0) = v24\n\t" 362 "vmem(%4+#1) = v25\n\t" 363 : /* no outputs */ 364 : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output) 365 : "v16", "v17", "v24", "v25", "v26", "v27", "memory"); 366 367 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 368 expect[0].uw[j] = 0x00000000; 369 expect[1].uw[j] = 0x00000000; 370 } 371 372 check_output_w(__LINE__, 2); 373 } 374 375 static void test_load_tmp_predicated(void) 376 { 377 void *p0 = buffer0; 378 void *p1 = buffer1; 379 void *pout = output; 380 bool pred = true; 381 382 for (int i = 0; i < BUFSIZE; i++) { 383 /* 384 * Load into v12 as .tmp with a predicate 385 * When the predicate is true, we get the vector from buffer1[i] 386 * When the predicate is false, we get a vector of all 1's 387 * Regardless of the predicate, the next packet should have 388 * a vector of all 1's 389 */ 390 asm("v3 = vmem(%0 + #0)\n\t" 391 "r1 = #1\n\t" 392 "v12 = vsplat(r1)\n\t" 393 "p1 = !cmp.eq(%3, #0)\n\t" 394 "{\n\t" 395 " if (p1) v12.tmp = vmem(%1 + #0)\n\t" 396 " v4.w = vadd(v12.w, v3.w)\n\t" 397 "}\n\t" 398 "v4.w = vadd(v4.w, v12.w)\n\t" 399 "vmem(%2 + #0) = v4\n\t" 400 : : "r"(p0), "r"(p1), "r"(pout), "r"(pred) 401 : "r1", "p1", "v12", "v3", "v4", "v6", "memory"); 402 p0 += sizeof(MMVector); 403 p1 += sizeof(MMVector); 404 pout += sizeof(MMVector); 405 406 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 407 expect[i].w[j] = 408 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1 409 : buffer0[i].w[j] + 2; 410 } 411 pred = !pred; 412 } 413 414 check_output_w(__LINE__, BUFSIZE); 415 } 416 417 static void test_load_cur_predicated(void) 418 { 419 bool pred = true; 420 for (int i = 0; i < BUFSIZE; i++) { 421 asm volatile("p0 = !cmp.eq(%3, #0)\n\t" 422 "v3 = vmem(%0+#0)\n\t" 423 /* 424 * Preload v4 to make sure that the assignment from the 425 * packet below is not being ignored when pred is false. 426 */ 427 "r0 = #0x01237654\n\t" 428 "v4 = vsplat(r0)\n\t" 429 "{\n\t" 430 " if (p0) v3.cur = vmem(%1+#0)\n\t" 431 " v4 = v3\n\t" 432 "}\n\t" 433 "vmem(%2+#0) = v4\n\t" 434 : 435 : "r"(&buffer0[i]), "r"(&buffer1[i]), 436 "r"(&output[i]), "r"(pred) 437 : "r0", "p0", "v3", "v4", "memory"); 438 expect[i] = pred ? buffer1[i] : buffer0[i]; 439 pred = !pred; 440 } 441 check_output_w(__LINE__, BUFSIZE); 442 } 443 444 static void test_vcombine(void) 445 { 446 for (int i = 0; i < BUFSIZE / 2; i++) { 447 asm volatile("v2 = vsplat(%0)\n\t" 448 "v3 = vsplat(%1)\n\t" 449 "v3:2 = vcombine(v2, v3)\n\t" 450 "vmem(%2+#0) = v2\n\t" 451 "vmem(%2+#1) = v3\n\t" 452 : 453 : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i]) 454 : "v2", "v3", "memory"); 455 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 456 expect[2 * i].w[j] = 2 * i + 1; 457 expect[2 * i + 1].w[j] = 2 * i; 458 } 459 } 460 check_output_w(__LINE__, BUFSIZE); 461 } 462 463 int main() 464 { 465 init_buffers(); 466 467 test_load_tmp(); 468 test_load_tmp2(); 469 test_load_cur(); 470 test_load_aligned(); 471 test_load_unaligned(); 472 test_store_aligned(); 473 test_store_unaligned(); 474 test_masked_store(false); 475 test_masked_store(true); 476 test_new_value_store(); 477 test_max_temps(); 478 479 test_vadd_w(); 480 test_vadd_h(); 481 test_vadd_b(); 482 test_vsub_w(); 483 test_vsub_h(); 484 test_vsub_b(); 485 test_vxor(); 486 test_vand(); 487 test_vor(); 488 test_vnot(); 489 490 test_pred_or(false); 491 test_pred_or_n(true); 492 test_pred_and(false); 493 test_pred_and_n(true); 494 test_pred_xor(false); 495 496 test_vadduwsat(); 497 test_vsubuwsat_dv(); 498 499 test_load_tmp_predicated(); 500 test_load_cur_predicated(); 501 502 test_vcombine(); 503 504 puts(err ? "FAIL" : "PASS"); 505 return err ? 1 : 0; 506 } 507