1/* 2 * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, see <http://www.gnu.org/licenses/>. 16 */ 17 18/****************************************************************************** 19 * 20 * HOYA: MULTI MEDIA INSTRUCITONS 21 * 22 ******************************************************************************/ 23 24#ifndef EXTINSN 25#define EXTINSN Q6INSN 26#define __SELF_DEF_EXTINSN 1 27#endif 28 29#ifndef NO_MMVEC 30 31#define DO_FOR_EACH_CODE(WIDTH, CODE) \ 32{ \ 33 fHIDE(int i;) \ 34 fVFOREACH(WIDTH, i) {\ 35 CODE ;\ 36 } \ 37} 38 39 40 41 42#define ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 43EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \ 44DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 45 46 47 48#define ITERATOR_INSN2_ANY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 49ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 50 51#define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 52EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), \ 53DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 54 55 56#define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 57ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 58 59 60#define ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 61EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 
62DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 63 64 65#define ITERATOR_INSN_SHIFT3_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 66EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS,A_CVI_VS_3SRC,A_NOTE_SHIFT_RESOURCE,A_NOTE_NOVP,A_NOTE_VA_UNARY), \ 67DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 68 69#define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 70EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 71DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 72 73#define ITERATOR_INSN2_SHIFT_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 74ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 75 76#define ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 77EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 78DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 79 80#define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 81ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 82 83#define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 84EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 85 86 87#define ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 88ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE) 89 90#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 91EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 92DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 93 94#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 95EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 96DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 97 98#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 99ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 100 101#define ITERATOR_INSN_MPY_SLOT(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 102EXTINSN(V6_##TAG, SYNTAX, \ 103ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 104DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 105 106#define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, 
SYNTAX,DESCR,CODE) \ 107EXTINSN(V6_##TAG, SYNTAX, \ 108ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 109DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 110 111#define ITERATOR_INSN2_MPY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 112ITERATOR_INSN_MPY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 113 114#define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,SYNTAX2,DESCR,CODE) \ 115ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX2,DESCR,CODE) 116 117 118#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 119EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \ 120DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 121 122#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 123EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \ 124DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 125 126#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 127ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 128 129 130 131 132#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 133EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 134 135#define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 136EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 137 138#define ITERATOR_INSN_VHISTLIKE(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 139EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), \ 140DESCR, fHIDE(mmvector_t input;) input = fTMPVDATA(); DO_FOR_EACH_CODE(WIDTH, CODE)) 141 142 143 144 145 146/****************************************************************************************** 147* 148* MMVECTOR MEMORY OPERATIONS - NO NAPALI V1 149* 150*******************************************************************************************/ 151 152 153 154#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 
155EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \ 156DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 157 158#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 159ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 160 161 162 163#define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 164EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 165DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 166 167#define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 168ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 169 170 171#define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 172EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \ 173DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 174 175#define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 176ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 177 178 179#define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 180EXTINSN(V6_##TAG, SYNTAX, \ 181ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 182DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 183 184#define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 185EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 186DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 187 188#define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 189ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 190 191#define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 192EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 193 194 195#define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 196ITERATOR_INSN_PERMUTE_SLOT_DEP_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 197 198#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 199EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 200DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 201 
202#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 203EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 204DESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 205 206#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 207ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 208 209#define NARROWING_SHIFT_NOV1(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 210ITERATOR_INSN_SHIFT_SLOT_NOV1(ITERSIZE,TAG, \ 211"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \ 212"Vector shift right and shuffle", \ 213 fHIDE(int )shamt = RtV & SHAMTMASK; \ 214 DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 215 DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 216 217#define MMVEC_AVGS_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 218ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 219ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 220ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 221 222 #define MMVEC_AVGU_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 223ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 224ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector 
Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 225 226 227 228/****************************************************************************************** 229* 230* MMVECTOR MEMORY OPERATIONS 231* 232*******************************************************************************************/ 233 234#define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \ 235EXTINSN(V6_##TAG##_pi, SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \ 236EXTINSN(V6_##TAG##_ai, SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \ 237EXTINSN(V6_##TAG##_ppu, SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); }) \ 238 239 240#define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 241EXTINSN(V6_##TAG##_pred_pi, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 242EXTINSN(V6_##TAG##_pred_ai, "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 243EXTINSN(V6_##TAG##_pred_ppu, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) \ 244 245#define MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 246EXTINSN(V6_##TAG##_npred_pi, "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 247EXTINSN(V6_##TAG##_npred_ai, "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 248EXTINSN(V6_##TAG##_npred_ppu, "if (!" 
#SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) 249 250#define MMVEC_COND_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 251MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 252MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) 253 254 255#define VEC_SCALE(X) X*fVECSIZE() 256 257 258#define MMVEC_LD(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmem","",fLOADMMV(EA,VdV)) 259#define MMVEC_LDC(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_cur,DESCR,ATTRIB,NT,"Vd32.cur=vmem","",fLOADMMV(EA,VdV)) 260#define MMVEC_LDT(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_tmp,DESCR,ATTRIB,NT,"Vd32.tmp=vmem","",fLOADMMV(EA,VdV)) 261#define MMVEC_LDU(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmemu","",fLOADMMVU(EA,VdV)) 262 263 264#define MMVEC_STQ(TAG,DESCR,ATTRIB,NT) \ 265MMVEC_EACH_EA(TAG##_qpred,DESCR,ATTRIB,NT,"if (Qv4) vmem","=Vs32",fSTOREMMVQ(EA,VsV,QvV)) \ 266MMVEC_EACH_EA(TAG##_nqpred,DESCR,ATTRIB,NT,"if (!Qv4) vmem","=Vs32",fSTOREMMVNQ(EA,VsV,QvV)) 267 268/**************************************************************** 269* MAPPING FOR VMEMs 270****************************************************************/ 271 272#define ATTR_VMEM A_EXTENSION,A_CVI,A_CVI_VM 273#define ATTR_VMEMU A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP 274 275 276MMVEC_LD(vL32b, "Aligned Vector Load", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),) 277MMVEC_LDC(vL32b, "Aligned Vector Load Cur", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),) 278MMVEC_LDT(vL32b, "Aligned Vector Load Tmp", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),) 279 280MMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);) 281MMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);) 
MMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);)

/* Aligned vector stores: unconditional, Pv-predicated and Qv-qualified. */
MMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV))
MMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))


MMVEC_STQ(vS32b, "Aligned Vector Store", ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),)

/* Unaligned (vmemu) load and stores. */
MMVEC_LDU(vL32Ub, "Unaligned Vector Load", ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),)

MMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV))

MMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV))

/* Store of a .new vector value: the stored data is fNEWVREG(OsN). */
MMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))

// V65 store release, zero byte store
MMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0))



MMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))


/******************************************************************************************
*
* MMVECTOR MEMORY OPERATIONS - NON TEMPORAL
*
*******************************************************************************************/

/* Same attribute list as ATTR_VMEM; the ":nt" marker is added via the syntax. */
#define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM

/* Non-temporal stores: ":nt" is passed as the NT argument. */
MMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV))
MMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))

MMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
MMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))


MMVEC_STQ(vS32b_nt, "Aligned Vector Store - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt")

/* Non-temporal loads. */
MMVEC_LD(vL32b_nt, "Aligned Vector Load - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt")
MMVEC_LDC(vL32b_nt, "Aligned Vector Load Cur - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt")
MMVEC_LDT(vL32b_nt, "Aligned Vector Load Tmp - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt")

/*
 * Conditional non-temporal loads.  Here NT is empty and ":nt" is passed as
 * SYNTAXB; the two are adjacent in the syntax string, so the text is the same.
 * NOTE(review): unlike the other load mappings in this file, these three do
 * not list A_LOAD in ATTRIBS - confirm whether that is intentional.
 */
MMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
MMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
MMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);)


#undef VEC_SCALE


/***************************************************
 * Vector Alignment
 ************************************************/

/*
 * VALIGNB(SHIFT): for every byte index i below fVBYTES(), VdV.ub[i] is taken
 * from VvV.ub[i+SHIFT] while i+SHIFT is below fVBYTES(), and from
 * VuV.ub[i+SHIFT-fVBYTES()] otherwise.
 * SHIFT is pasted unparenthesized; every use below passes a plain identifier.
 */
#define VALIGNB(SHIFT) \
    fHIDE(int i;) \
    for(i = 0; i < fVBYTES(); i++) {\
        VdV.ub[i] = (i+SHIFT>=fVBYTES()) ? VuV.ub[i+SHIFT-fVBYTES()] : VvV.ub[i+SHIFT];\
    }

/* valign: shift is RtV masked with fVBYTES()-1. */
EXTINSN(V6_valignb, "Vd32=valign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
{
    unsigned shift = RtV & (fVBYTES()-1);
    VALIGNB(shift)
})
/* vlalign: shift is fVBYTES() minus the masked RtV (fVBYTES() when that is 0). */
EXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
{
    unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1));
    VALIGNB(shift)
})
/* Immediate forms: the shift comes from uiV instead of RtV. */
EXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
{
    VALIGNB(uiV)
})
EXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
{
    unsigned shift = fVBYTES() - uiV;
    VALIGNB(shift)
})

/*
 * vror: destination byte k is read from source byte (k+RtV) masked with
 * fVBYTES()-1.
 * NOTE(review): the description string mentions two vectors although the
 * instruction reads only Vu32; it looks copied from valign.
 */
EXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
"Align Two vectors by Rt32 as control",
{
    fHIDE(int k;)
    for (k=0;k<fVBYTES();k++) {
        VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)];
    }
 })







/**************************************************************
* Unpack elements with zero/sign extend and cross lane permute
***************************************************************/

/* Each source element i is extended and written to element i of the pair Vdd. */
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub, "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uh, i) = fZE8_16( VuV.ub[i]))
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb, "Vdd32=vunpackb(Vu32)", "Vdd32.h=vunpack(Vu32.b)", "Unpack bytes with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, h, i) = fSE8_16( VuV.b[i] ))
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uw, i) = fZE16_32(VuV.uh[i]))
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh, "Vdd32=vunpackh(Vu32)", "Vdd32.w=vunpack(Vu32.h)", "Unpack halves with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, w, i) = fSE16_32(VuV.h[i] ))

/* "Odd" forms OR the zero-extended source, shifted into the upper half, into Vxx. */
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte to odd bytes ", fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8)
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves", fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16)


/**************************************************************
* Pack elements and cross lane permute
***************************************************************/

/*
 * In every pack instruction below, result element i comes from Vv and result
 * element i+fVELEM(<width>) comes from Vu.
 * vpacke* select sub-element 0 of each source element, vpacko* sub-element 1.
 */
 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb, "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)",
 "Pack bytes",
    VdV.ub[i] = fGETUBYTE(0, VvV.uh[i]);
    VdV.ub[i+fVELEM(16)] = fGETUBYTE(0, VuV.uh[i]))

 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh, "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)",
 "Pack halfwords",
    VdV.uh[i] = fGETUHALF(0, VvV.uw[i]);
    VdV.uh[i+fVELEM(32)] = fGETUHALF(0, VuV.uw[i]))

 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob, "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)",
 "Pack bytes",
    VdV.ub[i] = fGETUBYTE(1, VvV.uh[i]);
    VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i]))

 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh, "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)",
 "Pack halfwords",
    VdV.uh[i] = fGETUHALF(1, VvV.uw[i]);
    VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i]))



/* Saturating packs: the whole source element goes through fVSAT*. */
ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat, "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat",
 "Pack ubytes with saturation",
    VdV.ub[i] = fVSATUB(VvV.h[i]);
    VdV.ub[i+fVELEM(16)] = fVSATUB(VuV.h[i]))


ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat, "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat",
 "Pack bytes with saturation",
    VdV.b[i] = fVSATB(VvV.h[i]);
    VdV.b[i+fVELEM(16)] = fVSATB(VuV.h[i]))


/* NOTE(review): the two descriptions below say "bytes" although the results are halfwords. */
ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat, "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat",
 "Pack ubytes with saturation",
    VdV.uh[i] = fVSATUH(VvV.w[i]);
    VdV.uh[i+fVELEM(32)] = fVSATUH(VuV.w[i]))

ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat, "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat",
 "Pack bytes with saturation",
    VdV.h[i] = fVSATH(VvV.w[i]);
    VdV.h[i+fVELEM(32)] = fVSATH(VuV.w[i]))





/**************************************************************
* Zero/Sign Extend with in-lane permute
***************************************************************/

/*
 * For each source element i, sub-element 0 is extended into VddV.v[0] and
 * sub-element 1 into VddV.v[1], both at index i.
 */
ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)",
"Vector Zero Extend Bytes",
    VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i]));
    VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])))

ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)",
"Vector Sign Extend Bytes",
    VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i]));
    VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i])))

ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)",
"Vector Zero Extend halfwords",
    VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i]));
    VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])))

ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)",
"Vector Sign Extend halfwords",
    VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i]));
    VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i])))


/**********************************************************************
*
*
*
* MMVECTOR REDUCTION
*
*
*
**********************************************************************/

/********************************************
* 2-WAY REDUCTION - UNSIGNED BYTE BY BYTE
********************************************/

/*
 * Per halfword lane i: the two bytes of the Vu element are multiplied (fMPY8US)
 * by the RtV bytes selected with (2*i)%4 and (2*i+1)%4 and the products summed.
 * The _acc forms add into Vx instead of overwriting Vd.
 */
ITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)",
"Vector Dual Multiply-Accumulates unsigned bytes by bytes",
    VdV.h[i] = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
    VdV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))

ITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)",
"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate",
    VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
    VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))



/*
 * Double-vector forms: v[0] of the result uses bytes 0 and 1 of VuuV.v[0];
 * v[1] uses byte 1 of VuuV.v[0] and byte 0 of VuuV.v[1].
 */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)",
"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction",
    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));

    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))

ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)",
"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction",
    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));

    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))



/********************************************
* 2-WAY REDUCTION - HALF BY BYTE
********************************************/
/* Same structure as the byte forms above, on word lanes with fMPY16SS. */
ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)",
"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
    VdV.w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
    VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))

ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)",
"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
    VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
    VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))



ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)",
"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
    VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));

    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
    VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))


ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)",
"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));

    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))





/********************************************
* 2-WAY REDUCTION - HALF BY HALF
********************************************/

/*
 * Vector-by-vector form: per word lane, the two halfword products of Vu and Vv
 * are summed in a size8s_t accumulator and the result is passed through
 * fVSATW.
 */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat",
"Vector halfword multiply, accumulate pairs, sat to word",
    fHIDE(size8s_t accum;)
    accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
    accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
    VdV.w[i] = fVSATW(accum))

/* Accumulating form: the previous VxV.w[i] is added before fVSATW is applied. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat",
"Vector halfword multiply, accumulate pairs, sat to word",
    fHIDE(size8s_t accum;)
    accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
    accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
    VxV.w[i] = fVSATW(VxV.w[i]+accum))


/* VDMPYH */

/* Vector-by-scalar form: the halfwords of Vu are multiplied by halfwords 0 and 1 of RtV. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat",
"Vector halfword multiply, accumulate pairs, saturate to word",
    fHIDE(size8s_t accum;)
    accum = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
    accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
    VdV.w[i] = fVSATW(accum))

/* Accumulating form: the accumulator starts from VxV.w[i]. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat",
"Vector halfword multiply, accumulate pairs, saturate to word",
    fHIDE(size8s_t) accum = VxV.w[i];
    accum += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
    accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
    VxV.w[i] = fVSATW(accum))




/*
 * Vector-pair source: uses halfword 1 of VuuV.v[0] and halfword 0 of
 * VuuV.v[1].  The result is stored to a word lane (VdV.w[i]) although the
 * description string says "Halfword".
 */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat",
"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
    fHIDE(size8s_t accum;)
    accum = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
    accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
    VdV.w[i] = fVSATW(accum))

/* Accumulating vector-pair form: the accumulator starts from VxV.w[i]. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat",
"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
    fHIDE(size8s_t) accum = VxV.w[i];
    accum += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
    accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
    VxV.w[i] = fVSATW(accum))







/* VDMPYHSU */
/* Same shapes as the vdmpyh forms, using fMPY16SU with fGETUHALF on RtV. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat",
"Vector halfword multiply, accumulate pairs, saturate to word",
    fHIDE(size8s_t accum;)
    accum = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
    accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
    VdV.w[i] = fVSATW(accum))

ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat",
"Vector halfword multiply, accumulate pairs, saturate to word",
    fHIDE(size8s_t) accum=VxV.w[i];
    accum += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
    accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
    VxV.w[i] = fVSATW(accum))



/*
 * Vector-pair source forms.
 * NOTE(review): the description strings say "Signed Halfword by Signed
 * Halfword" although the RtV operand is read with fGETUHALF.
 */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
    fHIDE(size8s_t accum;)
    accum = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV));
    accum += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV));
    VdV.w[i] = fVSATW(accum))

ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
    fHIDE(size8s_t) accum=VxV.w[i];
    accum += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV));
    accum += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV));
    VxV.w[i] = fVSATW(accum))



/********************************************
* 3-WAY REDUCTION - UNSIGNED BYTE BY BYTE
********************************************/

/*
 * vtmpy: each result lane is the sum of two products plus one unscaled source
 * element.  v[0] uses sub-elements 0 and 1 of VuuV.v[0] and adds sub-element
 * 0 of VuuV.v[1]; v[1] uses sub-element 1 of VuuV.v[0] and sub-element 0 of
 * VuuV.v[1] and adds sub-element 1 of VuuV.v[1].
 */
 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.b,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV));
    VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
    VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);

    VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV));
    VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
    VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))


ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV));
    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
    VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);

    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV));
    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
    VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))



/* Unsigned-byte source forms: fMPY8US with fGETUBYTE on Vuu. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV));
    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
    VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);

    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV));
    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
    VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))

ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV));
    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
    VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);

    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV));
    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
    VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))


/* Halfword source forms: fMPY16SS with the RtV byte widened through fSE8_16. */
ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
    VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
    VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);

    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
    VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
    VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))

ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)",
"Dual Vector 3x1 Reduction",
    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
    VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);

    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
    VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))


711/******************************************** 712* 4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE 713********************************************/ 714 715 716 717ITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)", 718"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 719 VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 720 VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 721 VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 722 VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 723 724ITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)", 725"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 726 VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 727 VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 728 VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 729 VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 730 731 732ITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)", 733"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 734 VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 735 VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 736 VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 737 VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i]))) 738 739ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)", 740"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 741 VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 742 VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 743 VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 744 VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), 
fGETUBYTE(3,VvV.uw[i]))) 745 746ITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)", 747"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 748 VdV.w[i] = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 749 VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 750 VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 751 VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 752 753 754ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)", 755"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 756 VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 757 VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 758 VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 759 VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 760 761 762ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 763"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 764 VddV.v[0].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 765 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 766 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 767 VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 768 769 VddV.v[1].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 770 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 771 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 
1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 772 VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 773 774 775ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 776"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 777 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 778 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 779 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 780 VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 781 782 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 783 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 784 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 785 VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 786 787 788 789 790/******************************************** 791* 4-WAY REDUCTION - UNSIGNED BYTE BY BYTE 792********************************************/ 793 794ITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)", 795"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 796 VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 797 VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 798 VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 799 VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 800 801 802ITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)", 803"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 804 VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 805 VxV.w[i] += 
fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 806 VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 807 VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 808 809 810ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)", 811"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 812 VddV.v[0].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 813 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 814 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 815 VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 816 817 VddV.v[1].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 818 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 819 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 820 VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 821 822 823ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)", 824"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 825 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 826 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 827 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 828 VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 829 830 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 831 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 832 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 
1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 833 VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 834 835 836 837 838ITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)", 839"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 840 VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 841 VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 842 VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 843 VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 844 845 846ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)", 847"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 848 VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 849 VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 850 VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 851 VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 852 853 854 855 856 857 858 859 860 861 862 863/******************************************** 864* 2-WAY REDUCTION - SAD 865********************************************/ 866 867ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)", 868"Dual Vector Halfword by Byte 4-Way Reduction to Word", 869 VddV.v[0].uw[i] = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 870 VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 871 VddV.v[1].uw[i] = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 872 VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 873 874ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)", 875"Dual Vector Halfword by Byte 4-Way Reduction to Word", 876 VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 877 
VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 878 VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 879 VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 880 881 882 883 884/******************************************** 885* 4-WAY REDUCTION - SAD 886********************************************/ 887 888 889 890ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)", 891"Dual Vector Halfword by Byte 4-Way Reduction to Word", 892 VddV.v[0].uw[i] = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 893 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 894 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 895 VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 896 897 VddV.v[1].uw[i] = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 898 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 899 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 900 VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 901 902ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)", 903"Dual Vector Halfword by Byte 4-Way Reduction to Word", 904 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 905 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 906 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 907 VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 
].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 908 909 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 910 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 911 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 912 VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 913 914 915 916 917 918 919 920 921 922 923/********************************************************************* 924 * MMVECTOR SHIFTING 925 * ******************************************************************/ 926// Macro to shift arithmetically left/right and by either RT or Vv 927 928#define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE) \ 929ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE, "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = (VuV.TYPE[i] >> (RtV & (SIZE-1)))) \ 930ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE, "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = (VuV.TYPE[i] << (RtV & (SIZE-1)))) \ 931ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE, "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)", "Vector logical shift right " DESC, VdV.u##TYPE[i] = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \ 932ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTR(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 933ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTL(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 
934ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right " DESC, VdV.u##TYPE[i] = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 935 936V_SHIFT(w, "word", 32,5,4_4) 937V_SHIFT(h, "halfword", 16,4,2_2) 938 939ITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7)) 940 941ITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << (32 - (VvV.uw[i] & 0x1f))))) 942 943/********************************************************************* 944 * MMVECTOR SHIFT AND PERMUTE 945 * ******************************************************************/ 946 947ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements", 948 fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32); 949 fHIDE(int64_t ) mask = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i])); 950 fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1); 951 fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f); 952 fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count))))); 953 VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff); 954 VxxV.v[0].w[i] = (result & 0xffffffff)) 955 956#define NEW_NARROWING_SHIFT 1 957 958#if NEW_NARROWING_SHIFT 959#define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 960ITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \ 961"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." 
#SRCTYPE ",Rt8)" #SYNOPTS, \ 962"Vector shift right and shuffle", \ 963 fHIDE(int )shamt = RtV & SHAMTMASK; \ 964 DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 965 DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 966 967 968 969 970 971/* WORD TO HALF*/ 972 973NARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF) 974NARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF) 975NARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF) 976NARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF) 977NARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF) 978NARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF) 979 980NARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF) 981NARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7) 982NARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7) 983NARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7) 984NARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7) 985 986#define NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 987ITERATOR_INSN_SHIFT3_SLOT(ITERSIZE,TAG, \ 988"Vd32." #DSTTYPE "=vasr(Vuu32." #SRCTYPE ",Vv32." 
#SRCTYPE2 ")" #SYNOPTS, \ 989"Vector shift by vector right and shuffle", \ 990 fHIDE(int )shamt = VvV.SRCTYPE2[2*i+0] & SHAMTMASK; \ 991 DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[0].SRCTYPE[i],shamt) >> shamt)); \ 992 shamt = VvV.SRCTYPE2[2*i+1] & SHAMTMASK; \ 993 DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[1].SRCTYPE[i],shamt) >> shamt))) 994 995/* WORD TO HALF*/ 996NARROWING_VECTOR_SHIFT(32,vasrvwuhsat,fSETHALF,uh,w,uh,:sat,fVSATUH,fVNOROUND,0xF) 997NARROWING_VECTOR_SHIFT(32,vasrvwuhrndsat,fSETHALF,uh,w,uh,:rnd:sat,fVSATUH,fVROUND,0xF) 998/* HALF TO BYTE*/ 999NARROWING_VECTOR_SHIFT(16,vasrvuhubsat,fSETBYTE,ub,uh,ub,:sat,fVSATUB,fVNOROUND,0x7) 1000NARROWING_VECTOR_SHIFT(16,vasrvuhubrndsat,fSETBYTE,ub,uh,ub,:rnd:sat,fVSATUB,fVROUND,0x7) 1001 1002NARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7) 1003NARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7) 1004 1005#else 1006ITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)", 1007"Vector arithmetic shift right words, shuffle even halfwords", 1008 fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF))); 1009 fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF)))) 1010 1011 1012ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat", 1013"Vector arithmetic shift right words, shuffle even halfwords", 1014 fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF))); 1015 fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF)))) 1016 1017ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 1018"Vector arithmetic shift right words, shuffle even halfwords", 1019 fHIDE(int ) shamt = RtV & 0xF; 1020 fSETHALF(0,VdV.w[i], fVSATH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1021 fSETHALF(1,VdV.w[i], fVSATH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1022 
1023ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 1024"Vector arithmetic shift right words, shuffle even halfwords", 1025 fHIDE(int ) shamt = RtV & 0xF; 1026 fSETHALF(0,VdV.w[i], fVSATUH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1027 fSETHALF(1,VdV.w[i], fVSATUH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1028 1029ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat", 1030"Vector arithmetic shift right words, shuffle even halfwords", 1031 fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF))); 1032 fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF)))) 1033 1034ITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat", 1035"Vector arithmetic shift right words, shuffle even halfwords", 1036 fHIDE(int ) shamt = RtV & 0xF; 1037 fSETHALF(0,VdV.w[i], fVSATUH( (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1038 fSETHALF(1,VdV.w[i], fVSATUH( (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1039#endif 1040 1041 1042 1043ITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat", 1044"Vector round words to halves, shuffle resultant halfwords", 1045 fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 1046 fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1047 1048ITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat", 1049"Vector round words to halves, shuffle resultant halfwords", 1050 fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 1051 fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1052 1053ITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat", 1054"Vector round words to halves, 
shuffle resultant halfwords", 1055 fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16)); 1056 fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16))) 1057 1058 1059 1060 1061 1062/* HALF TO BYTE*/ 1063 1064ITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat", 1065"Vector round words to halves, shuffle resultant halfwords", 1066 fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8)); 1067 fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8))) 1068 1069ITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat", 1070"Vector round words to halves, shuffle resultant halfwords", 1071 fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8)); 1072 fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8))) 1073 1074ITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat", 1075"Vector round words to halves, shuffle resultant halfwords", 1076 fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8)); 1077 fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8))) 1078 1079 1080ITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)", 1081"Vector shift add word", 1082 VxV.w[i] += (VuV.w[i] << (RtV & (32-1)))) 1083 1084ITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)", 1085"Vector shift add word", 1086 VxV.w[i] += (VuV.w[i] >> (RtV & (32-1)))) 1087 1088ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)", 1089"Vector shift add halfword", 1090 VxV.h[i] += (VuV.h[i] << (RtV & (16-1)))) 1091 1092ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)", 1093"Vector shift add halfword", 1094 VxV.h[i] += (VuV.h[i] >> (RtV & (16-1)))) 1095 1096/************************************************************************** 1097* 1098* MMVECTOR 
ELEMENT-WISE ARITHMETIC 1099* 1100**************************************************************************/ 1101 1102/************************************************************************** 1103* MACROS GO IN MACROS.DEF NOT HERE!!! 1104**************************************************************************/ 1105 1106 1107#define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1108ITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE, "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" , "Vector Absolute of Difference "DESCR, VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i])) 1109 1110#define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1111ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1112ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1113ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1114ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1115 1116#define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\ 1117ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, 
"Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1118ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1119ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1120ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1121 1122#define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1123ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1124ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 1125 1126 1127 1128#define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1129ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1130ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector 
Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1131ITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 1132 1133 1134 1135 1136 1137 1138 1139#define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\ 1140ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE, "Vd32=vadd"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")", "Vector Add "DESCR, VdV.DEST[i] = VuV.SRC[i] + VvV.SRC[i])\ 1141ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE, "Vd32=vsub"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")", "Vector Sub "DESCR, VdV.DEST[i] = VuV.SRC[i] - VvV.SRC[i])\ 1142ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Add "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i])\ 1143ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Sub "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - VvvV.v[1].SRC[i]) \ 1144 1145 1146 1147 1148 1149/* Wrapping Adds */ 1150MMVEC_ADDWRAP(b, "b", "Byte", 8, b, b) 1151MMVEC_ADDWRAP(h, "h", "Halfword", 16, h, h) 1152MMVEC_ADDWRAP(w, "w", "Word", 32, w, w) 1153 1154/* Saturating Adds */ 1155MMVEC_ADDU_SAT(ub, "ub", "Unsigned Byte", 8, ub, ub) 1156MMVEC_ADDU_SAT(uh, "uh", "Unsigned Halfword", 16, uh, uh) 1157MMVEC_ADDU_SAT(uw, "uw", "Unsigned word", 32, uw, uw) 1158MMVEC_ADDS_SAT(b, "b", "byte", 8, b, b) 1159MMVEC_ADDS_SAT(h, "h", "Halfword", 16, h, h) 1160MMVEC_ADDS_SAT(w, "w", "Word", 32, w, w) 1161 1162 1163/* Averaging Instructions */ 1164MMVEC_AVGU(ub,"ub", "Unsigned Byte", 8, ub, ub) 1165MMVEC_AVGU(uh,"uh", 
"Unsigned Halfword", 16, uh, uh) 1166MMVEC_AVGU_NOV1(uw,"uw", "Unsigned Word", 32, uw, uw) 1167MMVEC_AVGS_NOV1(b, "b", "Byte", 8, b, b) 1168MMVEC_AVGS(h, "h", "Halfword", 16, h, h) 1169MMVEC_AVGS(w, "w", "Word", 32, w, w) 1170 1171 1172/* Absolute Difference */ 1173MMVEC_ABSDIFF(ub,"ub", "Unsigned Byte", 8, ub, ub) 1174MMVEC_ABSDIFF(uh,"uh", "Unsigned Halfword", 16, uh, uh) 1175MMVEC_ABSDIFF(h,"h", "Halfword", 16, uh, h) 1176MMVEC_ABSDIFF(w,"w", "Word", 32, uw, w) 1177 1178ITERATOR_INSN2_ANY_SLOT(8,vnavgub, "Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)", 1179"Vector Negative Average Unsigned Byte", VdV.b[i] = fVNAVGU(8, VuV.ub[i], VvV.ub[i])) 1180 1181ITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate", 1182VdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4))) 1183 1184ITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1185VdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4); 1186fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4)))) 1187 1188ITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1189VdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4); 1190fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4)))) 1191 1192ITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only", 1193VdV.w[i] = VuV.w[i]+VvV.w[i]; 1194fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0))) 1195 1196ITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only", 1197VdV.w[i] = VuV.w[i]+~VvV.w[i]+1; 1198fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1))) 1199 1200 1201ITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i])) 1202 1203 1204#define 
MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\ 1205ITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Add mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] + VvV.SRC2[i]))\ 1206ITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Sub mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] - VvV.SRC2[i]))\ 1207 1208MMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b) 1209 1210/**************************** 1211* WIDENING 1212****************************/ 1213 1214 1215 1216 1217ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)", 1218"Vector addition with widen into two vectors", 1219 VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1220 VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1221 1222ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)", 1223"Vector subtraction with widen into two vectors", 1224 VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1225 VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1226 1227 1228 1229ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)", 1230"Vector addition with widen into two vectors", 1231 VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]); 1232 VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1233 1234ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)", 1235"Vector subtraction with widen into two vectors", 1236 VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]); 1237 VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i])) 1238 1239 1240ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)", 1241"Vector addition with 
widen into two vectors", 1242 VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i])); 1243 VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1244 1245ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)", 1246"Vector subtraction with widen into two vectors", 1247 VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i])); 1248 VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1249 1250 1251 1252ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)", 1253"Vector addition with widen into two vectors", 1254 VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]); 1255 VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1256 1257ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)", 1258"Vector addition with widen into two vectors", 1259 VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]); 1260 VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i])) 1261 1262ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)", 1263"Vector addition with widen into two vectors", 1264 VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]); 1265 VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i])) 1266 1267 1268/**************************** 1269* Conditional 1270****************************/ 1271 1272#define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \ 1273ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \ 1274ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) 
"LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \ 1275ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \ 1276ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \ 1277 1278CONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i]) 1279CONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i]) 1280CONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i]) 1281 1282/***************************************************** 1283 ABSOLUTE VALUES 1284*****************************************************/ 1285// V65 1286ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb, "Vd32=vabsb(Vu32)", "Vd32.b=vabs(Vu32.b)", "Vector absolute value of bytes", VdV.b[i] = fABS(VuV.b[i])) 1287ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat, "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes", VdV.b[i] = fVSATB(fABS(fSE8_16(VuV.b[i])))) 1288 1289 1290ITERATOR_INSN2_ANY_SLOT(16,vabsh, "Vd32=vabsh(Vu32)", "Vd32.h=vabs(Vu32.h)", "Vector absolute value of halfwords", VdV.h[i] = fABS(VuV.h[i])) 1291ITERATOR_INSN2_ANY_SLOT(16,vabsh_sat, "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords", VdV.h[i] = fVSATH(fABS(fSE16_32(VuV.h[i])))) 1292ITERATOR_INSN2_ANY_SLOT(32,vabsw, "Vd32=vabsw(Vu32)", "Vd32.w=vabs(Vu32.w)", "Vector absolute value of words", VdV.w[i] = fABS(VuV.w[i])) 1293ITERATOR_INSN2_ANY_SLOT(32,vabsw_sat, "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words", VdV.w[i] = fVSATW(fABS(fSE32_64(VuV.w[i])))) 1294 1295 1296/************************************************************************** 1297 * MMVECTOR MULTIPLICATIONS 1298 * ************************************************************************/ 
1299 1300 1301/* Byte by Byte */ 1302ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)", 1303"Vector absolute value of words", 1304 VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1305 VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1306 1307ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)", 1308"Vector absolute value of words", 1309 VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1310 VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1311 1312 1313ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)", 1314"Vector absolute value of words", 1315 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1316 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1317 1318ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)", 1319"Vector absolute value of words", 1320 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1321 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1322 1323 1324 1325 1326 1327 1328ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)", 1329"Vector absolute value of words", 1330 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1331 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1332 1333ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)", 1334"Vector absolute value of words", 1335 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1336 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1337 1338 1339 1340 
1341ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)", 1342"Vertical Byte Multiply", 1343 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i])); 1344 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i]))) 1345 1346ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)", 1347"Vertical Byte Multiply", 1348 VddV.v[0].h[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i])); 1349 VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i]))) 1350 1351 1352 1353 1354 1355 1356 1357ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)", 1358"Vector by Vector Halfword Multiply", 1359 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1360 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1361 1362ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)", 1363"Vector by Vector Halfword Multiply", 1364 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1365 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1366 1367ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)", 1368"Vector by Vector Unsigned Halfword Multiply", 1369 VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1370 VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1371 
1372ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)", 1373"Vector by Vector Unsigned Halfword Multiply", 1374 VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1375 VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1376 1377 1378 1379/* Vector by Vector */ 1380ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat", 1381"Vector halfword multiply with round, shift, and sat16", 1382 VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i] )<<1)))))) 1383 1384 1385 1386ITERATOR_INSN_MPY_SLOT(16,vmpyuhvs, "Vd32.uh=vmpy(Vu32.uh,Vv32.uh):>>16", 1387"Vector by Vector Unsigned Halfword Multiply with 16 bit rightshift", 1388 VdV.uh[i] = fGETUHALF(1,fMPY16UU(VuV.uh[i],VvV.uh[i]))) 1389 1390 1391ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)", 1392"Vector by Vector Halfword Multiply", 1393 VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1394 VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1395 1396 1397ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)", 1398"Vector by Vector Halfword Multiply", 1399 VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1400 VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1401 1402 1403 1404 1405ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)", 1406"Vector by Vector Halfword Multiply", 1407 VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i])) 1408 1409ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)", 1410"Vector by Vector Halfword Multiply", 1411 VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i])) 1412 1413 1414 1415/* 32x32 high half / frac */ 
1416 1417 1418ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)", 1419"Vector by Vector Halfword Multiply", 1420VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16) 1421 1422ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat", 1423"Vector by Vector Halfword Multiply", 1424VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1))) 1425 1426ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat", 1427"Vector by Vector Halfword Multiply", 1428VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1))) 1429 1430ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)", 1431"Word times Halfword Multiply, 64-bit result", 1432 fHIDE(size8s_t prod;) 1433 prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i])); 1434 VddV.v[1].w[i] = prod >> 16; 1435 VddV.v[0].w[i] = prod << 16) 1436 1437ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)", 1438"Word times Halfword Multiply, 64-bit result", 1439 fHIDE(size8s_t prod;) 1440 prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i])) + fSE32_64(VxxV.v[1].w[i]); 1441 VxxV.v[1].w[i] = prod >> 16; 1442 fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16); 1443 fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff)) 1444 1445ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift", 1446"Vector by Vector Halfword Multiply", 1447IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1))) 1448 1449ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift", 1450"Vector by Vector Halfword Multiply", 1451IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + 
fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1))) 1452 1453/* For 32x32 integer / low half */ 1454 1455ITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even multiply for 32x32 low half", 1456 VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16) 1457 1458ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)", 1459"Vector by Vector Word by Halfword Multiply", 1460IV1DEAD() VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1461 1462ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)", 1463"Vector by Vector Word by Halfword Multiply", 1464IV1DEAD() VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) ) 1465 1466/* Add back these... */ 1467 1468ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)", 1469"Vector by Vector Word by Halfword Multiply", 1470VxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) ) 1471 1472ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)", 1473"Vector by Vector Word by Halfword Multiply", 1474VxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1475 1476 1477 1478 1479 1480 1481 1482/* Vector by Scalar */ 1483ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)", 1484"Vector absolute value of words", 1485 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1486 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 1487 1488ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)", 1489"Vector absolute value of words", 1490 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1491 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 
1492 1493 1494ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)", 1495"Vector absolute value of words", 1496 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1497 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1498 1499ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)", 1500"Vector absolute value of words", 1501 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1502 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1503 1504 1505ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)", 1506"Vertical Byte Multiply", 1507 VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1508 VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1509 1510ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)", 1511"Vertical Byte Multiply", 1512 VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1513 VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1514 1515// V65 1516 1517ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)", 1518"Vertical Byte Multiply", 1519 VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1520 VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1521 
1522ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)", 1523"Vertical Byte Multiply", 1524 VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1525 VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1526 1527 1528 1529 1530/* Half by Byte */ 1531ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)", 1532"Vertical Byte Multiply", 1533 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1534 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1535 1536ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)", 1537"Vertical Byte Multiply", 1538 VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1539 VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1540 1541/* Half by Byte */ 1542ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)", 1543"Vertical Byte Multiply", 1544 VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1545 VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1546 1547ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)", 
1548"Vertical Byte Multiply", 1549 VxxV.v[0].w[i] += fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1550 VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1551 1552 1553 1554 1555 1556 1557 1558/* Half by Half */ 1559ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)", 1560"Vector absolute value of words", 1561 VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1562 VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1563 1564ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)", 1565"Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1566 VxxV.v[0].w[i] = fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1567 VxxV.v[1].w[i] = fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1568 1569 1570ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat", 1571"Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1572 VxxV.v[0].w[i] = fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV))); 1573 VxxV.v[1].w[i] = fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))) 1574 1575 1576 1577ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat", 1578"Vector halfword by halfword multiply, shift by 1, and take upper 16 msb", 1579 fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))); 1580 fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))); 1581) 1582 
1583ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat", 1584"Vector halfword with scalar halfword multiply with round, shift, and sat16", 1585 fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))))); 1586 fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))))); 1587) 1588 1589 1590ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)", 1591"Vector even halfword unsigned multiply by scalar", 1592 VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1593 VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1594 1595 1596ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)", 1597"Vector even halfword unsigned multiply by scalar", 1598 VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1599 VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1600 1601 1602 1603 1604/******************************************** 1605* HALF BY BYTE 1606********************************************/ 1607ITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)", 1608"Vector word by byte multiply, keep lower result", 1609VdV.h[i] = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1610 1611ITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)", 1612"Vector word by byte multiply, keep lower result", 1613VxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1614 1615 1616/******************************************** 1617* WORD BY BYTE 1618********************************************/ 1619ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)", 1620"Vector word by byte multiply, keep lower result", 1621VdV.w[i] = fMPY32SS(VuV.w[i], fGETBYTE(i % 
4, RtV) )) 1622 1623ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)", 1624"Vector word by byte multiply, keep lower result", 1625VxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) )) 1626 1627ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)", 1628"Vector word by byte multiply, keep lower result", 1629VdV.w[i] = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1630 1631ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)", 1632"Vector word by byte multiply, keep lower result", 1633VxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1634 1635 1636/******************************************** 1637* WORD BY HALF 1638********************************************/ 1639ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)", 1640"Vector word by byte multiply, keep lower result", 1641VdV.w[i] = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1642 1643ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)", 1644"Vector word by byte multiply, keep lower result", 1645VxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665/************************************************************************** 1666 * MMVECTOR LOGICAL OPERATIONS 1667 * ************************************************************************/ 1668ITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i]) 1669ITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)", "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i]) 1670ITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR", VdV.uh[i] = VuV.uh[i] ^ VvV.h[i]) 1671ITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)", "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i]) 1672 1673 
1674 1675 1676 1677ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt, 1678"Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1679 VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0) 1680 1681ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc, 1682"Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1683 VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1684 1685ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt, 1686"Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1687 VdV.ub[i] = !fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0) 1688 1689ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc, 1690"Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1691 VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1692 1693 1694ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt, 1695"Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate", 1696 fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)) 1697 1698ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc, 1699"Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ", 1700 fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))) 1701 1702ITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes", 1703VdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1704ITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes", 1705VdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1706 1707 1708 /*************************************************** 1709 * Compare Vector with Vector 1710 ***************************************************/ 1711#define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH) \ 1712{ \ 1713 for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \ 1714 fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? 
MASK : 0)); \ 1715 } \ 1716 } 1717 1718 1719#define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1720EXTINSN(V6_vgt##TYPE, "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \ 1721 VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \ 1722EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \ 1723 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \ 1724EXTINSN(V6_vgt##TYPE##_or, "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \ 1725 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \ 1726EXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \ 1727 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH)) 1728 1729#define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\ 1730MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1731EXTINSN(V6_veq##TYPE, "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \ 1732 VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \ 1733EXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-and", \ 1734 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \ 1735EXTINSN(V6_veq##TYPE##_or, "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-or", \ 1736 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \ 1737EXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." 
TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-xor", \ 1738 VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH)) 1739 1740 1741MMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w) 1742MMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h) 1743MMVEC_CMP(b,"b","","Vector Half Compare ", fVELEM(8), 0x1, 1, b) 1744MMVEC_CMPGT(uw,"uw","","Vector Unsigned Half Compare ", fVELEM(32), 0xF, 4,uw) 1745MMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh) 1746MMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8), 0x1, 1,ub) 1747 1748/*************************************************** 1749* Predicate Operations 1750***************************************************/ 1751 1752EXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1753{ 1754 fHIDE(int i;) 1755 for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 1 : 0); 1756}) 1757 1758EXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1759{ 1760 fHIDE(int i;) 1761 for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0); 1762}) 1763 1764 1765ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) ) 1766ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? 
fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) ) 1767ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) ) 1768ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) ) 1769ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) ) 1770ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) ) 1771ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) ) 1772ITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) ) 1773 1774 1775 1776EXTINSN(V6_vcmov, "if (Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1777{ 1778if (fLSBOLD(PsV)) { 1779 fHIDE(int i;) 1780 fVFOREACH(8, i) { 1781 VdV.ub[i] = VuV.ub[i]; 1782 } 1783 } else {CANCEL;} 1784}) 1785 1786EXTINSN(V6_vncmov, "if (!Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1787{ 1788if (fLSBOLDNOT(PsV)) { 1789 fHIDE(int i;) 1790 fVFOREACH(8, i) { 1791 VdV.ub[i] = VuV.ub[i]; 1792 } 1793 } else {CANCEL;} 1794}) 1795 1796EXTINSN(V6_vccombine, "if (Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1797{ 1798if (fLSBOLD(PsV)) { 1799 fHIDE(int i;) 1800 fVFOREACH(8, i) { 1801 VddV.v[0].ub[i] = VvV.ub[i]; 1802 VddV.v[1].ub[i] = VuV.ub[i]; 1803 } 1804 } else {CANCEL;} 1805}) 1806 1807EXTINSN(V6_vnccombine, "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1808{ 1809if (fLSBOLDNOT(PsV)) { 1810 fHIDE(int i;) 1811 fVFOREACH(8, i) { 1812 VddV.v[0].ub[i] = VvV.ub[i]; 1813 VddV.v[1].ub[i] = VuV.ub[i]; 
1814 } 1815 } else {CANCEL;} 1816}) 1817 1818 1819 /* Byte select: Vd byte i is Vu's when Qt bit i is set, otherwise Vv's. */ 1820ITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)", 1821"Vector Select Element 8-bit", 1822 VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1823 /* Vdd.v[0] is the vmux selection; Vdd.v[1] receives the opposite choice for each byte. */ 1824ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)", 1825"Vector Swap Element 8-bit", 1826 VddV.v[0].ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]; 1827 VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1828 1829 1830/*************************************************************************** 1831* 1832* MMVECTOR SORTING 1833* 1834****************************************************************************/ 1835 /* MMVEC_SORT emits one vmax and one vmin instruction for an element type, comparing the SRC-typed lanes of Vu and Vv. Both comparisons are strict, so on equal inputs the Vv element is selected. */ 1836#define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\ 1837ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] : VvV.SRC[i]) \ 1838ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? 
VuV.SRC[i] : VvV.SRC[i]) 1839 /* One vmax/vmin pair per supported element type. */ 1840MMVEC_SORT(b,"b", "signed byte", 8, b) 1841MMVEC_SORT(ub,"ub", "unsigned byte", 8, ub) 1842MMVEC_SORT(uh,"uh", "unsigned halfword",16, uh) 1843MMVEC_SORT(h, "h", "halfword", 16, h) 1844MMVEC_SORT(w, "w", "word", 32, w) 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854/************************************************************* 1855* SHUFFLES 1856****************************************************************/ 1857 /* Each halfword of Vd packs fVSATUB(Vv.h[i]) in byte 0 and fVSATUB(Vu.h[i]) in byte 1. */ 1858ITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)", 1859"Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them", 1860 fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i])); 1861 fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i]))) 1862 /* Each word of Vd packs fVSATH(Vv.w[i]) in half 0 and fVSATH(Vu.w[i]) in half 1. */ 1863ITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)", 1864"Saturate and pack 16 words to 16 halfwords, and interleave them", 1865 fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i])); 1866 fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i]))) 1867 /* Unsigned form: fVSATUH is applied to the uw lanes of Vv (half 0) and Vu (half 1). */ 1868ITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)", 1869"Saturate and pack 16 words to 16 halfwords, and interleave them", 1870 fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i])); 1871 fSETHALF(1, VdV.w[i], fVSATUH(VuV.uw[i]))) 1872 /* Byte 0 of each halfword lane: Vv's goes to byte 0 of Vd, Vu's to byte 1. */ 1873ITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)", 1874"Shuffle half words with in a lane", 1875 fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i])); 1876 fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i]))) 1877 /* Byte 1 of each halfword lane: Vv's goes to byte 0 of Vd, Vu's to byte 1. */ 1878ITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)", 1879"Shuffle half words with in a lane", 1880 fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i])); 1881 fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i]))) 1882 /* Half 0 of each word lane: Vv's goes to half 0 of Vd, Vu's to half 1. */ 1883ITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)", 1884"Shuffle half words with in a lane", 1885 fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i])); 1886 fSETHALF(1, VdV.uw[i], 
fGETUHALF(0, VuV.uw[i]))) 1887 /* Half 1 of each word lane: Vv's goes to half 0 of Vd, Vu's to half 1. */ 1888ITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)", 1889"Shuffle half words with in a lane", 1890 fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i])); 1891 fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i]))) 1892 1893 1894 1895 1896/************************************************************************** 1897* Double Vector Shuffles 1898**************************************************************************/ 1899 /* In-place transpose stages: offset doubles from 1 while it is below fVBYTES(). For each offset whose bit is set in RtV, Vy byte k is swapped with Vx byte k+offset for every k that has the offset bit clear. */ 1900EXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)", 1901ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1902"2x2->2x2 transpose, for multiple data sizes, inplace", 1903{ 1904 fHIDE(int offset;) 1905 for (offset=1; offset<fVBYTES(); offset<<=1) { 1906 if ( RtV & offset) { 1907 fHIDE(int k;) \ 1908 fVFOREACH(8, k) {\ 1909 if (!( k & offset)) { 1910 fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1911 } 1912 } 1913 } 1914 } 1915 }) 1916 /* Same stages applied to a copy: Vdd.v[0] starts as Vv and Vdd.v[1] as Vu, and the swaps are between v[1] byte k and v[0] byte k+offset. */ 1917EXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)", 1918ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1919"2x2->2x2 transpose for multiple data sizes", 1920{ 1921 fHIDE(int offset;) 1922 VddV.v[0] = VvV; 1923 VddV.v[1] = VuV; 1924 for (offset=1; offset<fVBYTES(); offset<<=1) { 1925 if ( RtV & offset) { 1926 fHIDE(int k;) \ 1927 fVFOREACH(8, k) {\ 1928 if (!( k & offset)) { 1929 fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1930 } 1931 } 1932 } 1933 } 1934 }) 1935 /* Same swap stage as vshuff, but offset runs downward from fVBYTES()>>1 to 1. */ 1936EXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)", 1937ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1938" vector - vector deal - or deinterleave, for multiple data sizes, inplace", 1939{ 1940 fHIDE(int offset;) 1941 for (offset=fVBYTES()>>1; offset>0; offset>>=1) { 1942 if ( RtV & offset) { 1943 fHIDE(int k;) \ 1944 fVFOREACH(8, k) {\ 1945 if (!( k & offset)) { 1946 fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1947 } 1948 } 1949 } 1950 } 1951 }) 1952 /* Non-destructive vdeal: operates on Vdd initialised from Vv (v[0]) and Vu (v[1]). */ 1953EXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)", 1954ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1955" vector - vector deal - or deinterleave, for multiple data sizes", 1956{ 1957 
fHIDE(int offset;) 1958 VddV.v[0] = VvV; 1959 VddV.v[1] = VuV; 1960 for (offset=fVBYTES()>>1; offset>0; offset>>=1) { 1961 if ( RtV & offset) { 1962 fHIDE(int k;) \ 1963 fVFOREACH(8, k) {\ 1964 if (!( k & offset)) { 1965 fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1966 } 1967 } 1968 } 1969 } 1970 }) 1971 1972/**************************************************************************/ 1973 1974 1975 /* Per word lane: Vdd.v[0] collects half 0 and Vdd.v[1] collects half 1, with Vv's half in position 0 and Vu's half in position 1. */ 1976ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)", 1977"Vector Shuffle half words", 1978 fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i])); 1979 fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i])); 1980 fSETHALF(0, VddV.v[1].uw[i], fGETUHALF(1, VvV.uw[i])); 1981 fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i]))) 1982 /* Byte version of the same pattern over halfword lanes. */ 1983ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)", 1984"Vector Shuffle bytes", 1985 fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i])); 1986 fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i])); 1987 fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i])); 1988 fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i]))) 1989 1990 1991/*************************************************************** 1992* Deal 1993***************************************************************/ 1994 /* Half 0 of every word of Vu goes to Vd.uh[i], half 1 to Vd.uh[i+fVELEM(32)]. */ 1995ITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)", 1996"Deal Halfwords", 1997 VdV.uh[i ] = fGETUHALF(0, VuV.uw[i]); 1998 VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i])) 1999 /* Byte 0 of every halfword of Vu goes to Vd.ub[i], byte 1 to Vd.ub[i+fVELEM(16)]. */ 2000ITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)", 2001"Deal Bytes", 2002 VdV.ub[i ] = fGETUBYTE(0, VuV.uh[i]); 2003 VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i])) 2004 /* Bytes 0 and 2 of every word of Vv, then of Vu, are written to four consecutive groups of fVELEM(32) bytes in Vd. */ 2005ITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w, "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)", 2006"Deal Two Vectors Bytes", 2007 VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]); 2008 VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, 
VvV.uw[i]); 2009 VdV.ub[2*fVELEM(32)+i] = fGETUBYTE(0, VuV.uw[i]); 2010 VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i])) 2011 2012/*************************************************************** 2013* shuffle 2014***************************************************************/ 2015 /* Word i of Vd is built from Vu.uh[i] (half 0) and Vu.uh[i+fVELEM(32)] (half 1). */ 2016ITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)", 2017"Shuffle Halfwords", 2018 fSETHALF(0, VdV.uw[i], VuV.uh[i]); 2019 fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)])) 2020 /* Halfword i of Vd is built from Vu.ub[i] (byte 0) and Vu.ub[i+fVELEM(16)] (byte 1). */ 2021ITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)", 2022"Shuffle Bytes", 2023 fSETBYTE(0, VdV.uh[i], VuV.ub[i]); 2024 fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)])) 2025 2026 2027 2028 2029 2030/*********************************************************** 2031* INSERT AND EXTRACT 2032*********************************************************/ /* Reads the word of Vu at index (RsV & (fVBYTES()-1)) >> 2: RsV is wrapped to the vector size as a byte offset and then reduced to a word index. The warn() calls are wrapped in fHIDE. */ 2033EXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)", 2034ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY), 2035"Extract an element from a vector to scalar", 2036fHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));) 2037RdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2]; 2038fHIDE(warn("RdV=0x%08x",RdV);)) 2039 /* Writes RtV into word 0 of Vx only. */ 2040EXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)", 2041ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), 2042"Insert Word Scalar into Vector", 2043VxV.uw[0] = RtV;) 2044 2045 2046 2047 /* Splat RtV into every word, halfword or byte lane of Vd. */ 2048ITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar across words in vector", VdV.uw[i] = RtV) 2049 2050ITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar across halves in vector", VdV.uh[i] = RtV) 2051 2052ITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar across bytes in vector", VdV.ub[i] = RtV) 2053 2054 /* Word-wise copy of Vu into Vd. */ 2055ITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i]) 2056 2057 /* Vv fills Vdd.v[0] and Vu fills Vdd.v[1]. */ 2058ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)", 
2059"Vector assign, Any two to Vector Pair", 2060 VddV.v[0].ub[i] = VvV.ub[i]; 2061 VddV.v[1].ub[i] = VuV.ub[i]) 2062 2063 2064 2065/////////////////////////////////////////////////////////////////////////// 2066 /* .tmp form of vcombine: same data movement, tagged A_CVI_TMP and A_NO_INTRINSIC. */ 2067EXTINSN(V6_vcombine_tmp, "Vdd32.tmp=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC), 2068"Vector assign tmp, Any two to Vector Pair ", 2069{ 2070 fHIDE(int i;) 2071 fVFOREACH(8, i) { 2072 VddV.v[0].ub[i] = VvV.ub[i]; 2073 VddV.v[1].ub[i] = VuV.ub[i]; 2074 } 2075}) 2076 /* .tmp form of vassign: a single-vector, word-wise copy of Vu into Vd. */ 2077EXTINSN(V6_vassign_tmp, "Vd32.tmp=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC), 2078"Vector assign tmp ", 2079{ 2080 fHIDE(int i;) 2081 fVFOREACH(32, i) { 2082 VdV.w[i]=VuV.w[i]; 2083 } 2084}) 2085 2086/********************************************************* 2087* GENERAL PERMUTE NETWORKS 2088*********************************************************/ 2089 2090 /* Butterfly network with offset halving from fVBYTES()/2 down to 1. At each stage byte k takes tmp[k^offset] when Vv.ub[k] has the offset bit set, otherwise tmp[k]; the stage result is copied back into tmp before the next stage. */ 2091EXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2092"Reverse Benes Butterfly network ", 2093{ 2094 fHIDE(int offset;) 2095 fHIDE(int k;) 2096 fHIDE(mmvector_t tmp;) 2097 tmp = VuV; 2098 for (offset=fVBYTES(); (offset>>=1)>0; ) { 2099 for (k = 0; k<fVBYTES(); k++) { 2100 VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k]; 2101 } 2102 for (k = 0; k<fVBYTES(); k++) { 2103 tmp.ub[k] = VdV.ub[k]; 2104 } 2105 } 2106}) 2107 2108 /* Same stage operation with offset doubling from 1 while it is below fVBYTES(). */ 2109EXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2110"Forward Benes Butterfly network ", 2111{ 2112 fHIDE(int offset;) 2113 fHIDE(int k;) 2114 fHIDE(mmvector_t tmp;) 2115 tmp = VuV; 2116 for (offset=1; offset<fVBYTES(); offset<<=1){ 2117 for (k = 0; k<fVBYTES(); k++) { 2118 VdV.ub[k] = (VvV.ub[k]&offset) ? 
tmp.ub[k^offset] : tmp.ub[k]; 2119 } 2120 for (k = 0; k<fVBYTES(); k++) { 2121 tmp.ub[k] = VdV.ub[k]; 2122 } 2123 } 2124}) 2125 2126 2127 2128 2129 /* Leading-zero count, computed as the leading-one count (fCL1_4 / fCL1_2) of the inverted element. */ 2130ITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)", "Count Leading Zeros in Word", VdV.uw[i]=fCL1_4(~VuV.uw[i])) 2131ITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)", "Count Leading Zeros in Halfword", VdV.uh[i]=fCL1_2(~VuV.uh[i])) 2132 /* Norm amount: the larger of the leading-zero and leading-one counts, minus 1. */ 2133ITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word", 2134VdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();)) 2135ITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword", 2136VdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();)) 2137 /* Leading-bit count (max of leading zeros and leading ones, without the -1) added to the matching Vv element. */ 2138ITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)", 2139"Count leading bits and add", 2140VdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i]) 2141 2142ITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)", 2143"Count leading bits and add", 2144VdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i]) 2145 2146 /* fCOUNTONES_2 of each halfword of Vu. */ 2147ITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)", "Population Count in Halfword", VdV.uh[i]=fCOUNTONES_2(VuV.uh[i])) 2148 2149 /* fHIST: for each 128-bit lane, each of its 16 input bytes selects a vector register (value>>3) and one of 8 halfword counters inside the same lane (value&7). The counter is incremented with a READ_EXT_VREG / WRITE_EXT_VREG round trip on that register. */ 2150#define fHIST(INPUTVEC) \ 2151 fUARCH_NOTE_PUMP_4X(); \ 2152 fHIDE(int lane;) \ 2153 fHIDE(mmvector_t tmp;) \ 2154 fVFOREACH(128, lane) { \ 2155 for (fHIDE(int )i=0; i<128/8; ++i) { \ 2156 unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2157 unsigned char regno = value>>3; \ 2158 unsigned char element = value & 7; \ 2159 READ_EXT_VREG(regno,tmp,0); \ 2160 tmp.uh[(128/16)*lane+(element)]++; \ 2161 WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2162 } \ 2163 } 2164 /* fHISTQ: as fHIST, but the increment happens only when the QVAL bit for the input byte is set; the register is still read and written back either way. */ 2165#define fHISTQ(INPUTVEC,QVAL) \ 2166 fUARCH_NOTE_PUMP_4X(); \ 2167 fHIDE(int lane;) \ 2168 fHIDE(mmvector_t tmp;) \ 2169 fVFOREACH(128, lane) 
{ \ 2170 for (fHIDE(int )i=0; i<128/8; ++i) { \ 2171 unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2172 unsigned char regno = value>>3; \ 2173 unsigned char element = value & 7; \ 2174 READ_EXT_VREG(regno,tmp,0); \ 2175 if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \ 2176 WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2177 } \ 2178 } 2179 2180 2181 /* Both histogram instructions take their input vector from fTMPVDATA(); vhistq additionally passes QvV as the per-byte enable. */ 2182EXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); }) 2183EXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); }) 2184 /* The helper macros are only needed by the two instructions above. */ 2185#undef fHIST 2186#undef fHISTQ 2187 2188 2189/* **** WEIGHTED HISTOGRAM **** */ 2190 2191 /* WHIST: byte 0 of input.h[i] is the bucket and byte 1 the weight. Bucket bits [7:3] select the vector register; elindex combines the lane position (i>>BSHIFT, with the MASK bits cleared) with the low bucket bits ((bucket>>BSHIFT) & MASK). COND is an optional guard on the update and SATF an optional saturation applied to counter + weight. NOTE(review): `input` and `i` are not declared here; presumably the ITERATOR_INSN_VHISTLIKE wrapper provides them - confirm. */ 2192#if 1 2193#define WHIST(EL,MASK,BSHIFT,COND,SATF) \ 2194 fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \ 2195 fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \ 2196 fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \ 2197 fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \ 2198 fHIDE(mmvector_t tmp;) \ 2199 READ_EXT_VREG(vindex,tmp,0); \ 2200 COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \ 2201 WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \ 2202 fUARCH_NOTE_PUMP_2X(); 2203 /* vwhist256 forms use halfword counters (EL=uh, MASK=7, BSHIFT=0), optionally guarded by Qv bit 2*i and optionally saturated with fVSATUH. vwhist128 forms use word counters (EL=uw, MASK=3, BSHIFT=1); their #u1 forms only update buckets whose low bit equals uiV. */ 2204ITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,)) 2205ITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),)) 2206ITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH)) 2207ITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH)) 2208ITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", 
WHIST(uw,3,1,,)) 2209ITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),)) 2210ITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),)) 2211ITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),)) 2212 2213 2214#endif 2215 2216 2217 2218/* ****** lookup table instructions *********** */ 2219 2220/* Use low bits from idx to choose next-bigger elements from vector, then use LSB from idx to choose odd or even element */ 2221 /* vlut32: an index byte matches when its top three bits (idx & 0xE0) equal matchval << 5, where matchval = RtV & 0x7. A match yields the byte selected by oddhalf from Vv.h[idx % fVELEM(16)]; a miss yields 0. oddhalf is bit (fVECLOGSIZE()-6) of RtV. */ 2222ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2223fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2224matchval = RtV & 0x7; 2225oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2226idx = VuV.ub[i]; 2227VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2228 2229 /* OR-accumulating vlut32: the lookup result (or 0 on a miss) is ORed into Vx. */ 2230ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2231fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2232matchval = RtV & 0x7; 2233oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2234idx = VuV.ub[i]; 2235VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2236 /* vlut16: the top four bits of each index byte are matched against RtV & 0xF. Byte 0 of each Vu halfword drives Vdd.v[0] and byte 1 drives Vdd.v[1]; a match yields the half selected by oddhalf from Vv.w[idx % fVELEM(32)]. */ 2237ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2238fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2239matchval = RtV & 0xF; 2240oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2241idx = fGETUBYTE(0,VuV.uh[i]); 2242VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2243idx = fGETUBYTE(1,VuV.uh[i]); 2244VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? 
fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2245 /* OR-accumulating vlut16; here matchval is taken from the low byte of RtV (fGETUBYTE(0,RtV) & 0xF). */ 2246ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2247fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2248matchval = fGETUBYTE(0,RtV) & 0xF; 2249oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2250idx = fGETUBYTE(0,VuV.uh[i]); 2251VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2252idx = fGETUBYTE(1,VuV.uh[i]); 2253VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2254 /* Immediate (#u3) forms: identical lookups with matchval and oddhalf derived from uiV instead of RtV. */ 2255ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2256fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2257matchval = uiV & 0x7; 2258oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2259idx = VuV.ub[i]; 2260VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2261 2262 /* Immediate, OR-accumulating vlut32. */ 2263ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2264fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2265matchval = uiV & 0x7; 2266oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2267idx = VuV.ub[i]; 2268VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2269 /* Immediate vlut16. */ 2270ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2271fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2272matchval = uiV & 0xF; 2273oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2274idx = fGETUBYTE(0,VuV.uh[i]); 2275VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2276idx = fGETUBYTE(1,VuV.uh[i]); 2277VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? 
fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2278 /* Immediate, OR-accumulating vlut16. */ 2279ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2280fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2281matchval = uiV & 0xF; 2282oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2283idx = fGETUBYTE(0,VuV.uh[i]); 2284VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2285idx = fGETUBYTE(1,VuV.uh[i]); 2286VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2287 /* :nomatch vlut32: the upper three index bits are replaced by matchval instead of being compared, so every lane performs a lookup. */ 2288ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup", 2289fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2290 matchval = RtV & 0x7; 2291 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2292 idx = VuV.ub[i]; 2293 idx = (idx&0x1F) | (matchval<<5); 2294 VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)])) 2295 /* :nomatch vlut16: the upper four index bits are replaced by matchval for both index bytes of each halfword. */ 2296ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup", 2297fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2298 matchval = RtV & 0xF; 2299 oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2300 idx = fGETUBYTE(0,VuV.uh[i]); 2301 idx = (idx&0x0F) | (matchval<<4); 2302 VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]); 2303 idx = fGETUBYTE(1,VuV.uh[i]); 2304 idx = (idx&0x0F) | (matchval<<4); 2305 VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)])) 2306 2307 2308 2309 2310/****************************************************************************** 2311NON LINEAR - V65 2312 ******************************************************************************/ 2313 /* Vx.h[i] = fVSATH(((Vx.h*Vu.h product << 1) + (Rtt halfword selected by bits [15:14] of Vu.h[i], << 15)) >> 16). */ 2314ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation", 2315 VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16)) 
2316 2317 /* Signed-by-unsigned form: the fMPY16SU product is used without the <<1, and the Rtt halfword is read unsigned. */ 2318ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2319 VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2320 /* Same as vmpauhuhsat with the Rtt term subtracted. */ 2321ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2322 VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2323 2324 /* Selects one of the four halfwords of Rtt using bits [15:14] of each Vu halfword. */ 2325ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table", 2326 VdV.h[i]= fGETHALF( ((VuV.h[i]>>14)&0x3), RttV )) 2327 2328 2329 2330/****************************************************************************** 2331V65 2332 ******************************************************************************/ 2333 /* Unsigned 16x16 multiply of half 0 of each Vu word by half 0 of Rt. */ 2334ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)", 2335"Vector even halfword unsigned multiply by scalar", 2336 VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV))) 2337 2338 /* Accumulating form of vmpyuhe. */ 2339ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)", 2340"Vector even halfword unsigned multiply by scalar", 2341 VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV))) 2342 2343 2344 2345 /* Word gather: after fGATHER_INIT, fVLASTBYTE and fVALIGN, each word lane computes EA = RtV + Vv.uw[i] and hands it to fVLOG_VTCM_GATHER_WORD; fGATHER_FINISH() ends the operation. */ 2346EXTINSN(V6_vgathermw, "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2347{ 2348 fHIDE(int i;) 2349 fHIDE(int element_size = 4;) 2350 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2351 fVLASTBYTE(MuV, element_size); 2352 fVALIGN(RtV, element_size); 2353 fVFOREACH(32, i) { 2354 EA = RtV+VvV.uw[i]; 2355 fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV); 2356 } 2357 fGATHER_FINISH() 2358}) /* Halfword gather: same sequence with element_size 2 and halfword offsets. */ 2359EXTINSN(V6_vgathermh, "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2360{ 2361 fHIDE(int i;) 2362 fHIDE(int element_size = 2;) 2363 
fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2364 fVLASTBYTE(MuV, element_size); 2365 fVALIGN(RtV, element_size); 2366 fVFOREACH(16, i) { 2367 EA = RtV+VvV.uh[i]; 2368 fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV); 2369 } 2370 fGATHER_FINISH() 2371}) 2372 2373 2374 /* Halfword gather with word offsets: for word lane i, both vectors of Vvv (j = 0, 1) supply an offset, and the element number passed to the log macro is 2*i+j. */ 2375EXTINSN(V6_vgathermhw, "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2376{ 2377 fHIDE(int i;) 2378 fHIDE(int j;) 2379 fHIDE(int element_size = 2;) 2380 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2381 fVLASTBYTE(MuV, element_size); 2382 fVALIGN(RtV, element_size); 2383 fVFOREACH(32, i) { 2384 for(j = 0; j < 2; j++) { 2385 EA = RtV+VvvV.v[j].uw[i]; 2386 fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV); 2387 } 2388 } 2389 fGATHER_FINISH() 2390}) 2391 2392 /* Predicated word gather: identical to vgathermw except that QsV is passed to the Q-suffixed log macro. */ 2393EXTINSN(V6_vgathermwq, "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2394{ 2395 fHIDE(int i;) 2396 fHIDE(int element_size = 4;) 2397 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2398 fVLASTBYTE(MuV, element_size); 2399 fVALIGN(RtV, element_size); 2400 fVFOREACH(32, i) { 2401 EA = RtV+VvV.uw[i]; 2402 fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV); 2403 } 2404 fGATHER_FINISH() 2405}) /* Predicated halfword gather. */ 2406EXTINSN(V6_vgathermhq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2407{ 2408 fHIDE(int i;) 2409 fHIDE(int element_size = 2;) 2410 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2411 fVLASTBYTE(MuV, element_size); 2412 fVALIGN(RtV, element_size); 2413 fVFOREACH(16, i) { 2414 EA = RtV+VvV.uh[i]; 2415 fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV); 2416 } 2417 fGATHER_FINISH() 2418}) 2419 2420 2421 /* Predicated halfword gather with word offsets from the Vvv pair. */ 2422EXTINSN(V6_vgathermhwq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", 
ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2423{ 2424 fHIDE(int i;) 2425 fHIDE(int j;) 2426 fHIDE(int element_size = 2;) 2427 fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2428 fVLASTBYTE(MuV, element_size); 2429 fVALIGN(RtV, element_size); 2430 fVFOREACH(32, i) { 2431 for(j = 0; j < 2; j++) { 2432 EA = RtV+VvvV.v[j].uw[i]; 2433 fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV); 2434 } 2435 } 2436 fGATHER_FINISH() 2437}) 2438 2439 2440 /* Word scatter: after fSCATTER_INIT, fVLASTBYTE and fVALIGN, each word lane computes EA = RtV + Vv.uw[i] and logs the store of the Vw element through fVLOG_VTCM_WORD; the operation ends with fSCATTER_FINISH(0). */ 2441EXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2442{ 2443 fHIDE(int i;) 2444 fHIDE(int element_size = 4;) 2445 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2446 fVLASTBYTE(MuV, element_size); 2447 fVALIGN(RtV, element_size); 2448 fVFOREACH(32, i) { 2449 EA = RtV+VvV.uw[i]; 2450 fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV); 2451 } 2452 fSCATTER_FINISH(0) 2453}) 2454 2455 2456 /* Halfword scatter: same sequence with element_size 2 and halfword offsets. */ 2457EXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords", 2458{ 2459 fHIDE(int i;) 2460 fHIDE(int element_size = 2;) 2461 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2462 fVLASTBYTE(MuV, element_size); 2463 fVALIGN(RtV, element_size); 2464 fVFOREACH(16, i) { 2465 EA = RtV+VvV.uh[i]; 2466 fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV); 2467 } 2468 fSCATTER_FINISH(0) 2469}) 2470 2471 /* Word scatter-accumulate: each offset goes through fVALIGN(..., ALIGNMENT) before being added to RtV, the access is logged with the _INCREMENT macro, and the operation ends with fLOG_SCATTER_OP(4) and fSCATTER_FINISH(1). */ 2472EXTINSN(V6_vscattermw_add, "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add", 2473{ 2474 fHIDE(int i;) 2475 fHIDE(int ALIGNMENT=4;) 2476 fHIDE(int element_size = 4;) 2477 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2478 fVLASTBYTE(MuV, element_size); 2479 fVALIGN(RtV, element_size); 2480 fVFOREACH(32, i) { 2481 EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT)); 2482 fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV); 2483 } 
2484 fHIDE(fLOG_SCATTER_OP(4);) 2485 fSCATTER_FINISH(1) 2486}) 2487 /* Halfword scatter-accumulate: ALIGNMENT and element_size are 2, and the operation ends with fLOG_SCATTER_OP(2) and fSCATTER_FINISH(1). */ 2488EXTINSN(V6_vscattermh_add, "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add", 2489{ 2490 fHIDE(int i;) 2491 fHIDE(int ALIGNMENT=2;) 2492 fHIDE(int element_size = 2;) 2493 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2494 fVLASTBYTE(MuV, element_size); 2495 fVALIGN(RtV, element_size); 2496 fVFOREACH(16, i) { 2497 EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT)); 2498 fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV); 2499 } 2500 fHIDE(fLOG_SCATTER_OP(2);) 2501 fSCATTER_FINISH(1) 2502}) 2503 2504 /* Predicated word scatter: QsV is passed to the Q-suffixed log macro. */ 2505EXTINSN(V6_vscattermwq, "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional", 2506{ 2507 fHIDE(int i;) 2508 fHIDE(int element_size = 4;) 2509 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2510 fVLASTBYTE(MuV, element_size); 2511 fVALIGN(RtV, element_size); 2512 fVFOREACH(32, i) { 2513 EA = RtV+VvV.uw[i]; 2514 fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV); 2515 } 2516 fSCATTER_FINISH(0) 2517}) 2518 /* Predicated halfword scatter. */ 2519EXTINSN(V6_vscattermhq, "if (Qs4) vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional", 2520{ 2521 fHIDE(int i;) 2522 fHIDE(int element_size = 2;) 2523 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2524 fVLASTBYTE(MuV, element_size); 2525 fVALIGN(RtV, element_size); 2526 fVFOREACH(16, i) { 2527 EA = RtV+VvV.uh[i]; 2528 fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV); 2529 } 2530 fSCATTER_FINISH(0) 2531}) 2532 2533 2534 2535 /* Halfword scatter with word offsets taken from both vectors of the Vvv pair (element_size is 2). */ 2536EXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2537{ 2538 fHIDE(int i;) 2539 fHIDE(int j;) 2540 fHIDE(int element_size = 2;) 2541 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2542 fVLASTBYTE(MuV, element_size); 2543 
fVALIGN(RtV, element_size); 2544 fVFOREACH(32, i) { 2545 for(j = 0; j < 2; j++) { 2546 EA = RtV+VvvV.v[j].uw[i]; 2547 fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV); 2548 } 2549 } 2550 fSCATTER_FINISH(0) 2551}) 2552 2553 /* v6mpy :v, accumulating. c00..c02 are the fields 0..2 that fGET10BIT extracts from Vvv.v[0].uw[i], and c10..c12 those from Vvv.v[1].uw[i]. uiV selects which unsigned bytes of the Vuu pair each coefficient multiplies (fMPY16US); all products are added into Vxx. For even uiV, Vxx.v[1] receives six products and Vxx.v[0] three; for odd uiV the counts are reversed. */ 2554ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyvubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "", 2555 fHIDE(size2s_t c00;) 2556 fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2557 fHIDE(size2s_t c01;) 2558 fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2559 fHIDE(size2s_t c02;) 2560 fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2561 2562 fHIDE(size2s_t c10;) 2563 fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2564 fHIDE(size2s_t c11;) 2565 fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2566 fHIDE(size2s_t c12;) 2567 fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2568 2569 if (uiV == 0) { 2570 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2571 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2572 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2573 2574 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2575 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2576 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2577 2578 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10); 2579 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2580 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12); 2581 2582 } else if (uiV == 1) { 2583 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00); 2584 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01); 2585 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02); 2586 2587 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2588 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2589 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2590 2591 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2592 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2593 
VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2594 2595 } else if (uiV == 2) { 2596 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2597 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2598 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2599 2600 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2601 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2602 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2603 2604 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10); 2605 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11); 2606 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12); 2607 2608 } else if (uiV == 3) { 2609 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00); 2610 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2611 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02); 2612 2613 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2614 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2615 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2616 2617 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2618 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2619 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2620 } 2621) /* v6mpy :h, accumulating. The six coefficients are extracted exactly as in the :v form (fGET10BIT fields 0..2 of Vvv.v[0] and Vvv.v[1] word i); only the table of Vuu bytes selected per uiV differs. */ 2622ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyhubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "", 2623 fHIDE(size2s_t c00;) 2624 fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2625 fHIDE(size2s_t c01;) 2626 fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2627 fHIDE(size2s_t c02;) 2628 fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2629 fHIDE(size2s_t c10;) 2630 fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2631 fHIDE(size2s_t c11;) 2632 fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2633 fHIDE(size2s_t c12;) 2634 fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2635 2636 if (uiV == 0) { 2637 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2638 
VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2639 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2640 2641 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2642 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2643 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2644 2645 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10); 2646 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2647 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12); 2648 2649 } else if (uiV == 1) { 2650 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00); 2651 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01); 2652 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02); 2653 2654 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2655 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2656 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2657 2658 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2659 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2660 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2661 2662 } else if (uiV == 2) { 2663 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2664 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2665 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2666 2667 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2668 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2669 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2670 2671 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10); 2672 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11); 2673 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12); 2674 2675 } else if (uiV == 3) { 2676 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00); 2677 VxxV.v[1].w[i] += 
fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2678 VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02); 2679 2680 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2681 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2682 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2683 2684 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2685 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2686 VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2687 } 2688) 2689 2690 2691ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyvubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "", 2692 fHIDE(short c00;) 2693 fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2694 fHIDE(short c01;) 2695 fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2696 fHIDE(short c02;) 2697 fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2698 fHIDE(short c10;) 2699 fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2700 fHIDE(short c11;) 2701 fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2702 fHIDE(short c12;) 2703 fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2704 2705 2706 2707 if (uiV == 0) { 2708 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2709 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2710 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2711 2712 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2713 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2714 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2715 2716 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10); 2717 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2718 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12); 2719 2720 } else if (uiV == 1) { 2721 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00); 2722 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01); 2723 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02); 2724 2725 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 
2726 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2727 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2728 2729 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2730 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2731 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2732 2733 } else if (uiV == 2) { 2734 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2735 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2736 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2737 2738 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2739 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2740 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2741 2742 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10); 2743 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11); 2744 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12); 2745 2746 } else if (uiV == 3) { 2747 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00); 2748 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2749 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02); 2750 2751 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2752 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2753 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2754 2755 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2756 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2757 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2758 } 2759) 2760 2761ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyhubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "", 2762 fHIDE(short c00;) 2763 fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2764 fHIDE(short c01;) 2765 fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2766 fHIDE(short c02;) 2767 fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2768 fHIDE(short c10;) 2769 
fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2770 fHIDE(short c11;) 2771 fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2772 fHIDE(short c12;) 2773 fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2774 2775 if (uiV == 0) { 2776 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2777 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2778 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2779 2780 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2781 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2782 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2783 2784 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10); 2785 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2786 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12); 2787 2788 } else if (uiV == 1) { 2789 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00); 2790 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01); 2791 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02); 2792 2793 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2794 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2795 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2796 2797 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2798 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2799 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2800 2801 } else if (uiV == 2) { 2802 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2803 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2804 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2805 2806 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2807 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2808 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2809 2810 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10); 2811 VddV.v[0].w[i] += 
fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11); 2812 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12); 2813 2814 } else if (uiV == 3) { 2815 VddV.v[1].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00); 2816 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2817 VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02); 2818 2819 VddV.v[0].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2820 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2821 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2822 2823 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2824 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2825 VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2826 } 2827) 2828 2829 2830EXTINSN(V6_vscattermhwq, "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional", 2831{ 2832 fHIDE(int i;) 2833 fHIDE(int j;) 2834 fHIDE(int element_size = 2;) 2835 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2836 fVLASTBYTE(MuV, element_size); 2837 fVALIGN(RtV, element_size); 2838 fVFOREACH(32, i) { 2839 for(j = 0; j < 2; j++) { 2840 EA = RtV+VvvV.v[j].uw[i]; 2841 fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV); 2842 } 2843 } 2844 fSCATTER_FINISH(0) 2845}) 2846 2847EXTINSN(V6_vscattermhw_add, "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add", 2848{ 2849 fHIDE(int i;) 2850 fHIDE(int j;) 2851 fHIDE(int ALIGNMENT=2;) 2852 fHIDE(int element_size = 2;) 2853 fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2854 fVLASTBYTE(MuV, element_size); 2855 fVALIGN(RtV, element_size); 2856 fVFOREACH(32, i) { 2857 for(j = 0; j < 2; j++) { 2858 EA = RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT);; 2859 fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV); 2860 } 2861 } 2862 
fHIDE(fLOG_SCATTER_OP(2);) 2863 fSCATTER_FINISH(1) 2864}) 2865 2866EXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into byte", 2867{ 2868 fHIDE(int i;) 2869 fHIDE(size1u_t acc = 0;) 2870 fVFOREACH(8, i) { 2871 acc += fGETQBIT(QvV,i); 2872 VdV.ub[i] = acc; 2873 } 2874 } ) 2875EXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into halfwords", 2876{ 2877 fHIDE(int i;) 2878 fHIDE(size2u_t acc = 0;) 2879 fVFOREACH(16, i) { 2880 acc += fGETQBIT(QvV,i*2+0); 2881 acc += fGETQBIT(QvV,i*2+1); 2882 VdV.uh[i] = acc; 2883 } 2884 } ) 2885EXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into words", 2886{ 2887 fHIDE(int i;) 2888 fHIDE(size4u_t acc = 0;) 2889 fVFOREACH(32, i) { 2890 acc += fGETQBIT(QvV,i*4+0); 2891 acc += fGETQBIT(QvV,i*4+1); 2892 acc += fGETQBIT(QvV,i*4+2); 2893 acc += fGETQBIT(QvV,i*4+3); 2894 VdV.uw[i] = acc; 2895 } 2896 } ) 2897 2898 2899 2900 2901 2902/****************************************************************************** 2903 DEBUG Vector/Register Printing 2904 ******************************************************************************/ 2905 2906#define PRINT_VU(TYPE, TYPE2, COUNT)\ 2907 int i; \ 2908 size4u_t vec_len = fVBYTES();\ 2909 fprintf(stdout,"V%2d: ",VuN); \ 2910 for (i=0;i<vec_len>>COUNT;i++) { \ 2911 fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \ 2912 }; \ 2913 fprintf(stdout,"\\n"); \ 2914 fflush(stdout);\ 2915 2916#undef ATTR_VMEM 2917#undef ATTR_VMEMU 2918#undef ATTR_VMEM_NT 2919 2920#endif /* NO_MMVEC */ 2921 2922#ifdef __SELF_DEF_EXTINSN 2923#undef EXTINSN 2924#undef __SELF_DEF_EXTINSN 2925#endif 2926