1 /* 2 * Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18 #include <stdio.h> 19 #include <stdint.h> 20 #include <stdbool.h> 21 #include <string.h> 22 #include <limits.h> 23 24 int err; 25 26 #include "hvx_misc.h" 27 28 static void test_load_tmp(void) 29 { 30 void *p0 = buffer0; 31 void *p1 = buffer1; 32 void *pout = output; 33 34 for (int i = 0; i < BUFSIZE; i++) { 35 /* 36 * Load into v12 as .tmp, then use it in the next packet 37 * Should get the new value within the same packet and 38 * the old value in the next packet 39 */ 40 asm("v3 = vmem(%0 + #0)\n\t" 41 "r1 = #1\n\t" 42 "v12 = vsplat(r1)\n\t" 43 "{\n\t" 44 " v12.tmp = vmem(%1 + #0)\n\t" 45 " v4.w = vadd(v12.w, v3.w)\n\t" 46 "}\n\t" 47 "v4.w = vadd(v4.w, v12.w)\n\t" 48 "vmem(%2 + #0) = v4\n\t" 49 : : "r"(p0), "r"(p1), "r"(pout) 50 : "r1", "v12", "v3", "v4", "v6", "memory"); 51 p0 += sizeof(MMVector); 52 p1 += sizeof(MMVector); 53 pout += sizeof(MMVector); 54 55 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 56 expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1; 57 } 58 } 59 60 check_output_w(__LINE__, BUFSIZE); 61 } 62 63 static void test_load_cur(void) 64 { 65 void *p0 = buffer0; 66 void *pout = output; 67 68 for (int i = 0; i < BUFSIZE; i++) { 69 asm("{\n\t" 70 " v2.cur = vmem(%0 + #0)\n\t" 71 " vmem(%1 + #0) = v2\n\t" 72 "}\n\t" 73 : : "r"(p0), "r"(pout) : "v2", "memory"); 74 p0 += sizeof(MMVector); 75 pout += sizeof(MMVector); 76 77 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 78 expect[i].uw[j] = buffer0[i].uw[j]; 79 } 80 } 81 82 check_output_w(__LINE__, BUFSIZE); 83 } 84 85 static void test_load_aligned(void) 86 { 87 /* Aligned loads ignore the low bits of the address */ 88 void *p0 = buffer0; 89 void *pout = output; 90 const size_t offset = 13; 91 92 p0 += offset; /* Create an unaligned address */ 93 asm("v2 = vmem(%0 + #0)\n\t" 94 "vmem(%1 + #0) = v2\n\t" 95 : : "r"(p0), "r"(pout) : "v2", "memory"); 96 97 expect[0] = buffer0[0]; 98 99 check_output_w(__LINE__, 1); 100 } 101 102 static void test_load_unaligned(void) 103 { 104 void *p0 = buffer0; 105 void *pout = output; 106 const size_t offset = 12; 107 108 p0 += offset; /* Create an unaligned address */ 109 asm("v2 = vmemu(%0 + #0)\n\t" 110 "vmem(%1 + #0) = v2\n\t" 111 : : "r"(p0), "r"(pout) : "v2", "memory"); 112 113 memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector)); 114 115 check_output_w(__LINE__, 1); 116 } 117 118 static void test_store_aligned(void) 119 { 120 /* Aligned stores ignore the low bits of the address */ 121 void *p0 = buffer0; 122 void *pout = output; 123 const size_t offset = 13; 124 125 pout += offset; /* Create an unaligned address */ 126 asm("v2 = vmem(%0 + #0)\n\t" 127 "vmem(%1 + #0) = v2\n\t" 128 : : "r"(p0), "r"(pout) : "v2", "memory"); 129 130 expect[0] = buffer0[0]; 131 132 check_output_w(__LINE__, 1); 133 } 134 135 static void test_store_unaligned(void) 136 { 137 void *p0 = buffer0; 138 void *pout = output; 139 const size_t offset = 12; 140 141 pout += offset; /* Create an unaligned address */ 142 asm("v2 = vmem(%0 + #0)\n\t" 143 "vmemu(%1 + #0) = v2\n\t" 144 : : "r"(p0), "r"(pout) : "v2", "memory"); 145 146 memcpy(expect, buffer0, 2 * sizeof(MMVector)); 147 memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector)); 148 149 check_output_w(__LINE__, 2); 150 } 151 152 static void test_masked_store(bool invert) 153 { 154 void *p0 = buffer0; 155 void *pmask = mask; 156 void *pout = output; 157 158 memset(expect, 0xff, sizeof(expect)); 159 memset(output, 0xff, sizeof(expect)); 160 161 for (int i = 0; i < BUFSIZE; i++) { 162 if (invert) { 163 asm("r4 = #0\n\t" 164 "v4 = vsplat(r4)\n\t" 165 "v5 = vmem(%0 + #0)\n\t" 166 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 167 "v5 = vmem(%1)\n\t" 168 "if (!q0) vmem(%2) = v5\n\t" /* Inverted test */ 169 : : "r"(pmask), "r"(p0), "r"(pout) 170 : "r4", "v4", "v5", "q0", "memory"); 171 } else { 172 asm("r4 = #0\n\t" 173 "v4 = vsplat(r4)\n\t" 174 "v5 = vmem(%0 + #0)\n\t" 175 "q0 = vcmp.eq(v4.w, v5.w)\n\t" 176 "v5 = vmem(%1)\n\t" 177 "if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */ 178 : : "r"(pmask), "r"(p0), "r"(pout) 179 : "r4", "v4", "v5", "q0", "memory"); 180 } 181 p0 += sizeof(MMVector); 182 pmask += sizeof(MMVector); 183 pout += sizeof(MMVector); 184 185 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 186 if (invert) { 187 if (i + j % MASKMOD != 0) { 188 expect[i].w[j] = buffer0[i].w[j]; 189 } 190 } else { 191 if (i + j % MASKMOD == 0) { 192 expect[i].w[j] = buffer0[i].w[j]; 193 } 194 } 195 } 196 } 197 198 check_output_w(__LINE__, BUFSIZE); 199 } 200 201 static void test_new_value_store(void) 202 { 203 void *p0 = buffer0; 204 void *pout = output; 205 206 asm("{\n\t" 207 " v2 = vmem(%0 + #0)\n\t" 208 " vmem(%1 + #0) = v2.new\n\t" 209 "}\n\t" 210 : : "r"(p0), "r"(pout) : "v2", "memory"); 211 212 expect[0] = buffer0[0]; 213 214 check_output_w(__LINE__, 1); 215 } 216 217 static void test_max_temps() 218 { 219 void *p0 = buffer0; 220 void *pout = output; 221 222 asm("v0 = vmem(%0 + #0)\n\t" 223 "v1 = vmem(%0 + #1)\n\t" 224 "v2 = vmem(%0 + #2)\n\t" 225 "v3 = vmem(%0 + #3)\n\t" 226 "v4 = vmem(%0 + #4)\n\t" 227 "{\n\t" 228 " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t" 229 " v2.b = vshuffe(v3.b, v2.b)\n\t" 230 " v3.w = vadd(v1.w, v4.w)\n\t" 231 " v4.tmp = vmem(%0 + #5)\n\t" 232 "}\n\t" 233 "vmem(%1 + #0) = v0\n\t" 234 "vmem(%1 + #1) = v1\n\t" 235 "vmem(%1 + #2) = v2\n\t" 236 "vmem(%1 + #3) = v3\n\t" 237 "vmem(%1 + #4) = v4\n\t" 238 : : "r"(p0), "r"(pout) : "memory"); 239 240 /* The first two vectors come from the vadd-pair instruction */ 241 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 242 expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i]; 243 expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i]; 244 } 245 /* The third vector comes from the vshuffe instruction */ 246 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) { 247 expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) | 248 (buffer0[3].uh[i] & 0xff) << 8; 249 } 250 /* The fourth vector comes from the vadd-single instruction */ 251 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 252 expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i]; 253 } 254 /* 255 * The fifth vector comes from the load to v4 256 * make sure the .tmp is dropped 257 */ 258 expect[4] = buffer0[4]; 259 260 check_output_b(__LINE__, 5); 261 } 262 263 TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +) 264 TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +) 265 TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +) 266 TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -) 267 TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -) 268 TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -) 269 TEST_VEC_OP2(vxor, vxor, , d, 8, ^) 270 TEST_VEC_OP2(vand, vand, , d, 8, &) 271 TEST_VEC_OP2(vor, vor, , d, 8, |) 272 TEST_VEC_OP1(vnot, vnot, , d, 8, ~) 273 274 TEST_PRED_OP2(pred_or, or, |, "") 275 TEST_PRED_OP2(pred_or_n, or, |, "!") 276 TEST_PRED_OP2(pred_and, and, &, "") 277 TEST_PRED_OP2(pred_and_n, and, &, "!") 278 TEST_PRED_OP2(pred_xor, xor, ^, "") 279 280 static void test_vadduwsat(void) 281 { 282 /* 283 * Test for saturation by adding two numbers that add to more than UINT_MAX 284 * and make sure the result saturates to UINT_MAX 285 */ 286 const uint32_t x = 0xffff0000; 287 const uint32_t y = 0x000fffff; 288 289 memset(expect, 0x12, sizeof(MMVector)); 290 memset(output, 0x34, sizeof(MMVector)); 291 292 asm volatile ("v10 = vsplat(%0)\n\t" 293 "v11 = vsplat(%1)\n\t" 294 "v21.uw = vadd(v11.uw, v10.uw):sat\n\t" 295 "vmem(%2+#0) = v21\n\t" 296 : /* no outputs */ 297 : "r"(x), "r"(y), "r"(output) 298 : "v10", "v11", "v21", "memory"); 299 300 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 301 expect[0].uw[j] = UINT_MAX; 302 } 303 304 check_output_w(__LINE__, 1); 305 } 306 307 static void test_vsubuwsat_dv(void) 308 { 309 /* 310 * Test for saturation by subtracting two numbers where the result is 311 * negative and make sure the result saturates to zero 312 * 313 * vsubuwsat_dv operates on an HVX register pair, so we'll have a 314 * pair of subtractions 315 * w - x < 0 316 * y - z < 0 317 */ 318 const uint32_t w = 0x000000b7; 319 const uint32_t x = 0xffffff4e; 320 const uint32_t y = 0x31fe88e7; 321 const uint32_t z = 0x7fffff79; 322 323 memset(expect, 0x12, sizeof(MMVector) * 2); 324 memset(output, 0x34, sizeof(MMVector) * 2); 325 326 asm volatile ("v16 = vsplat(%0)\n\t" 327 "v17 = vsplat(%1)\n\t" 328 "v26 = vsplat(%2)\n\t" 329 "v27 = vsplat(%3)\n\t" 330 "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t" 331 "vmem(%4+#0) = v24\n\t" 332 "vmem(%4+#1) = v25\n\t" 333 : /* no outputs */ 334 : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output) 335 : "v16", "v17", "v24", "v25", "v26", "v27", "memory"); 336 337 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 338 expect[0].uw[j] = 0x00000000; 339 expect[1].uw[j] = 0x00000000; 340 } 341 342 check_output_w(__LINE__, 2); 343 } 344 345 static void test_vshuff(void) 346 { 347 /* Test that vshuff works when the two operands are the same register */ 348 const uint32_t splat = 0x089be55c; 349 const uint32_t shuff = 0x454fa926; 350 MMVector v0, v1; 351 352 memset(expect, 0x12, sizeof(MMVector)); 353 memset(output, 0x34, sizeof(MMVector)); 354 355 asm volatile("v25 = vsplat(%0)\n\t" 356 "vshuff(v25, v25, %1)\n\t" 357 "vmem(%2 + #0) = v25\n\t" 358 : /* no outputs */ 359 : "r"(splat), "r"(shuff), "r"(output) 360 : "v25", "memory"); 361 362 /* 363 * The semantics of Hexagon are the operands are pass-by-value, so create 364 * two copies of the vsplat result. 365 */ 366 for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) { 367 v0.uw[i] = splat; 368 v1.uw[i] = splat; 369 } 370 /* Do the vshuff operation */ 371 for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) { 372 if (shuff & offset) { 373 for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) { 374 if (!(k & offset)) { 375 uint8_t tmp = v0.ub[k]; 376 v0.ub[k] = v1.ub[k + offset]; 377 v1.ub[k + offset] = tmp; 378 } 379 } 380 } 381 } 382 /* Put the result in the expect buffer for verification */ 383 expect[0] = v1; 384 385 check_output_b(__LINE__, 1); 386 } 387 388 static void test_load_tmp_predicated(void) 389 { 390 void *p0 = buffer0; 391 void *p1 = buffer1; 392 void *pout = output; 393 bool pred = true; 394 395 for (int i = 0; i < BUFSIZE; i++) { 396 /* 397 * Load into v12 as .tmp with a predicate 398 * When the predicate is true, we get the vector from buffer1[i] 399 * When the predicate is false, we get a vector of all 1's 400 * Regardless of the predicate, the next packet should have 401 * a vector of all 1's 402 */ 403 asm("v3 = vmem(%0 + #0)\n\t" 404 "r1 = #1\n\t" 405 "v12 = vsplat(r1)\n\t" 406 "p1 = !cmp.eq(%3, #0)\n\t" 407 "{\n\t" 408 " if (p1) v12.tmp = vmem(%1 + #0)\n\t" 409 " v4.w = vadd(v12.w, v3.w)\n\t" 410 "}\n\t" 411 "v4.w = vadd(v4.w, v12.w)\n\t" 412 "vmem(%2 + #0) = v4\n\t" 413 : : "r"(p0), "r"(p1), "r"(pout), "r"(pred) 414 : "r1", "p1", "v12", "v3", "v4", "v6", "memory"); 415 p0 += sizeof(MMVector); 416 p1 += sizeof(MMVector); 417 pout += sizeof(MMVector); 418 419 for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) { 420 expect[i].w[j] = 421 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1 422 : buffer0[i].w[j] + 2; 423 } 424 pred = !pred; 425 } 426 427 check_output_w(__LINE__, BUFSIZE); 428 } 429 430 static void test_load_cur_predicated(void) 431 { 432 bool pred = true; 433 for (int i = 0; i < BUFSIZE; i++) { 434 asm volatile("p0 = !cmp.eq(%3, #0)\n\t" 435 "v3 = vmem(%0+#0)\n\t" 436 /* 437 * Preload v4 to make sure that the assignment from the 438 * packet below is not being ignored when pred is false. 439 */ 440 "r0 = #0x01237654\n\t" 441 "v4 = vsplat(r0)\n\t" 442 "{\n\t" 443 " if (p0) v3.cur = vmem(%1+#0)\n\t" 444 " v4 = v3\n\t" 445 "}\n\t" 446 "vmem(%2+#0) = v4\n\t" 447 : 448 : "r"(&buffer0[i]), "r"(&buffer1[i]), 449 "r"(&output[i]), "r"(pred) 450 : "r0", "p0", "v3", "v4", "memory"); 451 expect[i] = pred ? buffer1[i] : buffer0[i]; 452 pred = !pred; 453 } 454 check_output_w(__LINE__, BUFSIZE); 455 } 456 457 int main() 458 { 459 init_buffers(); 460 461 test_load_tmp(); 462 test_load_cur(); 463 test_load_aligned(); 464 test_load_unaligned(); 465 test_store_aligned(); 466 test_store_unaligned(); 467 test_masked_store(false); 468 test_masked_store(true); 469 test_new_value_store(); 470 test_max_temps(); 471 472 test_vadd_w(); 473 test_vadd_h(); 474 test_vadd_b(); 475 test_vsub_w(); 476 test_vsub_h(); 477 test_vsub_b(); 478 test_vxor(); 479 test_vand(); 480 test_vor(); 481 test_vnot(); 482 483 test_pred_or(false); 484 test_pred_or_n(true); 485 test_pred_and(false); 486 test_pred_and_n(true); 487 test_pred_xor(false); 488 489 test_vadduwsat(); 490 test_vsubuwsat_dv(); 491 492 test_vshuff(); 493 494 test_load_tmp_predicated(); 495 test_load_cur_predicated(); 496 497 puts(err ? "FAIL" : "PASS"); 498 return err ? 1 : 0; 499 } 500