Lines Matching +full:1 +full:- +full:v0

2  *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
24 * _16 16-bit elements and 16-bit offsets
25 * _32 32-bit elements and 32-bit offsets
26 * _16_32 16-bit elements and 32-bit offsets
51 /* fake vtcm - put buffers together and force alignment */
89 #define SYNC_VECTOR 1
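
The fake-VTCM region referenced above is only hinted at by these matching lines. Below is a minimal sketch of how such a region can be laid out, assuming the buffer names, sizes, and alignment shown (they are illustrative, not taken from the file):

/* sketch: pack every scatter/gather buffer into one block and force a
 * large alignment so the test can treat it as a stand-in for VTCM */
#define MATRIX_SIZE          64                       /* assumed element count */
#define SCATTER_BUFFER_SIZE  (MATRIX_SIZE * MATRIX_SIZE)

static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int   vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int   vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
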
226 asm ("m0 = %1\n\t" in vector_scatter_16()
227 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_16()
229 "vscatter(%0, m0, v0.h).h = v1\n\t" in vector_scatter_16()
232 : "m0", "v0", "v1", "memory"); in vector_scatter_16()
237 /* scatter-accumulate the 16-bit elements using HVX */
240 asm ("m0 = %1\n\t" in vector_scatter_16_acc()
241 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_16_acc()
243 "vscatter(%0, m0, v0.h).h += v1\n\t" in vector_scatter_16_acc()
246 : "m0", "v0", "v1", "memory"); in vector_scatter_16_acc()
254 asm ("r1 = #-1\n\t" in vector_scatter_16_masked()
255 "v0 = vmem(%0 + #0)\n\t" in vector_scatter_16_masked()
256 "q0 = vand(v0, r1)\n\t" in vector_scatter_16_masked()
258 "v0 = vmem(%3 + #0)\n\t" in vector_scatter_16_masked()
260 "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t" in vector_scatter_16_masked()
263 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); in vector_scatter_16_masked()
276 asm ("m0 = %1\n\t" in vector_scatter_32()
277 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_32()
279 "vscatter(%0, m0, v0.w).w = v1\n\t" in vector_scatter_32()
282 : "m0", "v0", "v1", "memory"); in vector_scatter_32()
283 asm ("m0 = %1\n\t" in vector_scatter_32()
284 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_32()
286 "vscatter(%0, m0, v0.w).w = v1\n\t" in vector_scatter_32()
289 : "m0", "v0", "v1", "memory"); in vector_scatter_32()
294 /* scatter-accumulate the 32-bit elements using HVX */
302 asm ("m0 = %1\n\t" in vector_scatter_32_acc()
303 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_32_acc()
305 "vscatter(%0, m0, v0.w).w += v1\n\t" in vector_scatter_32_acc()
308 : "m0", "v0", "v1", "memory"); in vector_scatter_32_acc()
309 asm ("m0 = %1\n\t" in vector_scatter_32_acc()
310 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_32_acc()
312 "vscatter(%0, m0, v0.w).w += v1\n\t" in vector_scatter_32_acc()
315 : "m0", "v0", "v1", "memory"); in vector_scatter_32_acc()
330 asm ("r1 = #-1\n\t" in vector_scatter_32_masked()
331 "v0 = vmem(%0 + #0)\n\t" in vector_scatter_32_masked()
332 "q0 = vand(v0, r1)\n\t" in vector_scatter_32_masked()
334 "v0 = vmem(%3 + #0)\n\t" in vector_scatter_32_masked()
336 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" in vector_scatter_32_masked()
339 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); in vector_scatter_32_masked()
340 asm ("r1 = #-1\n\t" in vector_scatter_32_masked()
341 "v0 = vmem(%0 + #0)\n\t" in vector_scatter_32_masked()
342 "q0 = vand(v0, r1)\n\t" in vector_scatter_32_masked()
344 "v0 = vmem(%3 + #0)\n\t" in vector_scatter_32_masked()
346 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" in vector_scatter_32_masked()
349 : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); in vector_scatter_32_masked()
357 asm ("m0 = %1\n\t" in vector_scatter_16_32()
358 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_16_32()
359 "v1 = vmem(%2 + #1)\n\t" in vector_scatter_16_32()
365 : "m0", "v0", "v1", "v2", "memory"); in vector_scatter_16_32()
370 /* scatter-accumulate the 16-bit elements with 32-bit offsets using HVX */
373 asm ("m0 = %1\n\t" in vector_scatter_16_32_acc()
374 "v0 = vmem(%2 + #0)\n\t" in vector_scatter_16_32_acc()
375 "v1 = vmem(%2 + #1)\n\t" in vector_scatter_16_32_acc()
381 : "m0", "v0", "v1", "v2", "memory"); in vector_scatter_16_32_acc()
389 asm ("r1 = #-1\n\t" in vector_scatter_16_32_masked()
390 "v0 = vmem(%0 + #0)\n\t" in vector_scatter_16_32_masked()
391 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */ in vector_scatter_16_32_masked()
392 "q0 = vand(v0, r1)\n\t" in vector_scatter_16_32_masked()
394 "v0 = vmem(%3 + #0)\n\t" in vector_scatter_16_32_masked()
395 "v1 = vmem(%3 + #1)\n\t" in vector_scatter_16_32_masked()
398 "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t" in vector_scatter_16_32_masked()
401 : "r1", "q0", "m0", "v0", "v1", "v2", "memory"); in vector_scatter_16_32_masked()
409 asm ("m0 = %1\n\t" in vector_gather_16()
410 "v0 = vmem(%2 + #0)\n\t" in vector_gather_16()
411 "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t" in vector_gather_16()
415 : "m0", "v0", "memory"); in vector_gather_16()
431 asm ("v0.h = vsplat(%5)\n\t" in vector_gather_16_masked()
432 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ in vector_gather_16_masked()
433 "r1 = #-1\n\t" in vector_gather_16_masked()
434 "v0 = vmem(%0 + #0)\n\t" in vector_gather_16_masked()
435 "q0 = vand(v0, r1)\n\t" in vector_gather_16_masked()
437 "v0 = vmem(%3 + #0)\n\t" in vector_gather_16_masked()
438 "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t" in vector_gather_16_masked()
442 : "r1", "q0", "m0", "v0", "memory"); in vector_gather_16_masked()
455 asm ("m0 = %1\n\t" in vector_gather_32()
456 "v0 = vmem(%2 + #0)\n\t" in vector_gather_32()
457 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t" in vector_gather_32()
461 : "m0", "v0", "memory"); in vector_gather_32()
462 asm ("m0 = %1\n\t" in vector_gather_32()
463 "v0 = vmem(%2 + #0)\n\t" in vector_gather_32()
464 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t" in vector_gather_32()
468 : "m0", "v0", "memory"); in vector_gather_32()
491 asm ("v0.h = vsplat(%5)\n\t" in vector_gather_32_masked()
492 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ in vector_gather_32_masked()
493 "r1 = #-1\n\t" in vector_gather_32_masked()
494 "v0 = vmem(%0 + #0)\n\t" in vector_gather_32_masked()
495 "q0 = vand(v0, r1)\n\t" in vector_gather_32_masked()
497 "v0 = vmem(%3 + #0)\n\t" in vector_gather_32_masked()
498 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t" in vector_gather_32_masked()
502 : "r1", "q0", "m0", "v0", "memory"); in vector_gather_32_masked()
503 asm ("v0.h = vsplat(%5)\n\t" in vector_gather_32_masked()
504 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ in vector_gather_32_masked()
505 "r1 = #-1\n\t" in vector_gather_32_masked()
506 "v0 = vmem(%0 + #0)\n\t" in vector_gather_32_masked()
507 "q0 = vand(v0, r1)\n\t" in vector_gather_32_masked()
509 "v0 = vmem(%3 + #0)\n\t" in vector_gather_32_masked()
510 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t" in vector_gather_32_masked()
514 : "r1", "q0", "m0", "v0", "memory"); in vector_gather_32_masked()
523 asm ("m0 = %1\n\t" in vector_gather_16_32()
524 "v0 = vmem(%2 + #0)\n\t" in vector_gather_16_32()
525 "v1 = vmem(%2 + #1)\n\t" in vector_gather_16_32()
528 "v0 = vmem(%3 + #0)\n\t" in vector_gather_16_32()
529 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */ in vector_gather_16_32()
530 "vmem(%3 + #0) = v0\n\t" in vector_gather_16_32()
533 : "m0", "v0", "v1", "memory"); in vector_gather_16_32()
543 asm ("v0.h = vsplat(%5)\n\t" in vector_gather_16_32_masked()
544 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */ in vector_gather_16_32_masked()
545 "r1 = #-1\n\t" in vector_gather_16_32_masked()
546 "v0 = vmem(%0 + #0)\n\t" in vector_gather_16_32_masked()
547 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */ in vector_gather_16_32_masked()
548 "q0 = vand(v0, r1)\n\t" in vector_gather_16_32_masked()
550 "v0 = vmem(%3 + #0)\n\t" in vector_gather_16_32_masked()
551 "v1 = vmem(%3 + #1)\n\t" in vector_gather_16_32_masked()
552 "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t" in vector_gather_16_32_masked()
554 "v0 = vmem(%4 + #0)\n\t" in vector_gather_16_32_masked()
555 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */ in vector_gather_16_32_masked()
556 "vmem(%4 + #0) = v0\n\t" in vector_gather_16_32_masked()
559 : "r1", "q0", "m0", "v0", "v1", "memory"); in vector_gather_16_32_masked()
607 /* scatter-accumulate the 16-bit elements using C */
657 /* scatter-accumulate the 32-bit elements using C */
713 /* scatter-accumulate the 16-bit elements with 32-bit offsets using C */
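
Only the comments of the scalar reference implementations match the search. As one example of what they describe, here is a sketch of the 16-bit-element / 32-bit-offset scatter-accumulate, using the same assumed buffers as the earlier sketches:

/* scalar sketch of the scatter-accumulate with 32-bit offsets, the kind
 * of reference the HVX results are checked against */
void scalar_scatter_acc_16_32(unsigned short *vscatter16_32)
{
    for (int i = 0; i < MATRIX_SIZE; i++) {
        vscatter16_32[word_offsets[i] / 2] += half_acc_values[i];
    }
}
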