1887d61b2STaylor Simpson/* 2f128c0feSTaylor Simpson * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. 3887d61b2STaylor Simpson * 4887d61b2STaylor Simpson * This program is free software; you can redistribute it and/or modify 5887d61b2STaylor Simpson * it under the terms of the GNU General Public License as published by 6887d61b2STaylor Simpson * the Free Software Foundation; either version 2 of the License, or 7887d61b2STaylor Simpson * (at your option) any later version. 8887d61b2STaylor Simpson * 9887d61b2STaylor Simpson * This program is distributed in the hope that it will be useful, 10887d61b2STaylor Simpson * but WITHOUT ANY WARRANTY; without even the implied warranty of 11887d61b2STaylor Simpson * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12887d61b2STaylor Simpson * GNU General Public License for more details. 13887d61b2STaylor Simpson * 14887d61b2STaylor Simpson * You should have received a copy of the GNU General Public License 15887d61b2STaylor Simpson * along with this program; if not, see <http://www.gnu.org/licenses/>. 
16887d61b2STaylor Simpson */ 17887d61b2STaylor Simpson 18887d61b2STaylor Simpson/****************************************************************************** 19887d61b2STaylor Simpson * 206c67d98cSMichael Tokarev * HOYA: MULTI MEDIA INSTRUCTIONS 21887d61b2STaylor Simpson * 22887d61b2STaylor Simpson ******************************************************************************/ 23887d61b2STaylor Simpson 24887d61b2STaylor Simpson#ifndef EXTINSN 25887d61b2STaylor Simpson#define EXTINSN Q6INSN 26887d61b2STaylor Simpson#define __SELF_DEF_EXTINSN 1 27887d61b2STaylor Simpson#endif 28887d61b2STaylor Simpson 29887d61b2STaylor Simpson#ifndef NO_MMVEC 30887d61b2STaylor Simpson 31887d61b2STaylor Simpson#define DO_FOR_EACH_CODE(WIDTH, CODE) \ 32887d61b2STaylor Simpson{ \ 33887d61b2STaylor Simpson fHIDE(int i;) \ 34887d61b2STaylor Simpson fVFOREACH(WIDTH, i) {\ 35887d61b2STaylor Simpson CODE ;\ 36887d61b2STaylor Simpson } \ 37887d61b2STaylor Simpson} 38887d61b2STaylor Simpson 39887d61b2STaylor Simpson 40887d61b2STaylor Simpson 41887d61b2STaylor Simpson 42887d61b2STaylor Simpson#define ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 43887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \ 44887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 45887d61b2STaylor Simpson 46887d61b2STaylor Simpson 47887d61b2STaylor Simpson 48887d61b2STaylor Simpson#define ITERATOR_INSN2_ANY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 49887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 50887d61b2STaylor Simpson 51887d61b2STaylor Simpson#define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 52887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), \ 53887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 54887d61b2STaylor Simpson 55887d61b2STaylor Simpson 56887d61b2STaylor Simpson#define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 
57887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 58887d61b2STaylor Simpson 59887d61b2STaylor Simpson 60887d61b2STaylor Simpson#define ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 61887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 62887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 63887d61b2STaylor Simpson 64887d61b2STaylor Simpson 65b2f20c2cSTaylor Simpson#define ITERATOR_INSN_SHIFT3_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 66b2f20c2cSTaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS,A_CVI_VS_3SRC,A_NOTE_SHIFT_RESOURCE,A_NOTE_NOVP,A_NOTE_VA_UNARY), \ 67b2f20c2cSTaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 68887d61b2STaylor Simpson 69887d61b2STaylor Simpson#define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 70887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \ 71887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 72887d61b2STaylor Simpson 73887d61b2STaylor Simpson#define ITERATOR_INSN2_SHIFT_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 74887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 75887d61b2STaylor Simpson 76887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 77887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 78887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 79887d61b2STaylor Simpson 80887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 81887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 82887d61b2STaylor Simpson 83887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 84887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 85887d61b2STaylor Simpson 86887d61b2STaylor Simpson 87887d61b2STaylor Simpson#define 
ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 88887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE) 89887d61b2STaylor Simpson 90887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 91887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 92887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 93887d61b2STaylor Simpson 94887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 95887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 96887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 97887d61b2STaylor Simpson 98887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 99887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE) 100887d61b2STaylor Simpson 101887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 102887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \ 103887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 104887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 105887d61b2STaylor Simpson 106887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 107887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \ 108887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 109887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 110887d61b2STaylor Simpson 111887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 112887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 113887d61b2STaylor Simpson 114887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,SYNTAX2,DESCR,CODE) \ 115887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX2,DESCR,CODE) 116887d61b2STaylor 

/*
 * Multiply-slot iterators for vector pairs.  Each expands to a single
 * EXTINSN(V6_<TAG>, ...) carrying A_CVI_VX_DV and using
 * DO_FOR_EACH_CODE(WIDTH, CODE) as its semantics.
 */
#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/* Same expansion as ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC. */
#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/* Two-syntax form: only SYNTAX2 is forwarded. */
#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)

/*
 * Adds A_CVI_VX_VSRC0_IS_DST to the attribute list.
 * NOTE(review): unlike the other ITERATOR_INSN2_* macros this one emits
 * SYNTAX and ignores SYNTAX2 - confirm that is intended before changing it.
 */
#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/* Adds A_RESTRICT_SLOT2ONLY to the attribute list. */
#define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

/*
 * Tagged A_CVI_4SLOT.  Before the iterated CODE, the semantics declare a
 * hidden mmvector_t named input and assign it from fTMPVDATA().
 */
#define ITERATOR_INSN_VHISTLIKE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), \
DESCR, fHIDE(mmvector_t input;) input = fTMPVDATA(); DO_FOR_EACH_CODE(WIDTH, CODE))

/******************************************************************************************
*
* MMVECTOR MEMORY OPERATIONS - NO NAPALI V1
*
*******************************************************************************************/

/*
 * _NOV1 iterator macros.  Each ITERATOR_INSN_*_NOV1 expands to one
 * EXTINSN(V6_<TAG>, ...) with DO_FOR_EACH_CODE(WIDTH, CODE) as semantics;
 * each ITERATOR_INSN2_*_NOV1 takes two syntax strings and forwards only
 * SYNTAX2.
 */

#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV), \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)

#define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

#define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)

#define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))

#define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
177887d61b2STaylor Simpson 178887d61b2STaylor Simpson 179887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH,TAG, SYNTAX,DESCR,CODE) \ 180887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \ 181887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), \ 182887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 183887d61b2STaylor Simpson 184887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 185887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), \ 186887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 187887d61b2STaylor Simpson 188887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 189887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE) 190887d61b2STaylor Simpson 191887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 192887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 193887d61b2STaylor Simpson 194887d61b2STaylor Simpson 195887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 196887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DEP_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 197887d61b2STaylor Simpson 198887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 199887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 200887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 201887d61b2STaylor Simpson 202887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \ 203887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), \ 204887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE)) 205887d61b2STaylor Simpson 206887d61b2STaylor Simpson#define 
ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \ 207887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE) 208887d61b2STaylor Simpson 209887d61b2STaylor Simpson#define NARROWING_SHIFT_NOV1(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 210887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_NOV1(ITERSIZE,TAG, \ 211887d61b2STaylor Simpson"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \ 212887d61b2STaylor Simpson"Vector shift right and shuffle", \ 213887d61b2STaylor Simpson fHIDE(int )shamt = RtV & SHAMTMASK; \ 214887d61b2STaylor Simpson DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 215887d61b2STaylor Simpson DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 216887d61b2STaylor Simpson 217887d61b2STaylor Simpson#define MMVEC_AVGS_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 218887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 219887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 220887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 221887d61b2STaylor Simpson 222887d61b2STaylor Simpson #define MMVEC_AVGU_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 223887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, 
VuV.SRC[i], VvV.SRC[i])) \ 224887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 225887d61b2STaylor Simpson 226887d61b2STaylor Simpson 227887d61b2STaylor Simpson 228887d61b2STaylor Simpson/****************************************************************************************** 229887d61b2STaylor Simpson* 230887d61b2STaylor Simpson* MMVECTOR MEMORY OPERATIONS 231887d61b2STaylor Simpson* 232887d61b2STaylor Simpson*******************************************************************************************/ 233887d61b2STaylor Simpson 234887d61b2STaylor Simpson#define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \ 235887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_pi, SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \ 236887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_ai, SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \ 237887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_ppu, SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); }) \ 238887d61b2STaylor Simpson 239887d61b2STaylor Simpson 240887d61b2STaylor Simpson#define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 241887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_pred_pi, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 242887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_pred_ai, "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 243887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_pred_ppu, "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); 
BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) \ 244887d61b2STaylor Simpson 245887d61b2STaylor Simpson#define MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 246887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_pi, "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \ 247887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_ai, "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \ 248887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_ppu, "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) 249887d61b2STaylor Simpson 250887d61b2STaylor Simpson#define MMVEC_COND_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 251887d61b2STaylor SimpsonMMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \ 252887d61b2STaylor SimpsonMMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) 253887d61b2STaylor Simpson 254887d61b2STaylor Simpson 255887d61b2STaylor Simpson#define VEC_SCALE(X) X*fVECSIZE() 256887d61b2STaylor Simpson 257887d61b2STaylor Simpson 258887d61b2STaylor Simpson#define MMVEC_LD(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmem","",fLOADMMV(EA,VdV)) 259887d61b2STaylor Simpson#define MMVEC_LDC(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_cur,DESCR,ATTRIB,NT,"Vd32.cur=vmem","",fLOADMMV(EA,VdV)) 260887d61b2STaylor Simpson#define MMVEC_LDT(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_tmp,DESCR,ATTRIB,NT,"Vd32.tmp=vmem","",fLOADMMV(EA,VdV)) 261887d61b2STaylor Simpson#define MMVEC_LDU(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmemu","",fLOADMMVU(EA,VdV)) 262887d61b2STaylor Simpson 263887d61b2STaylor Simpson 264887d61b2STaylor Simpson#define MMVEC_STQ(TAG,DESCR,ATTRIB,NT) \ 
265887d61b2STaylor SimpsonMMVEC_EACH_EA(TAG##_qpred,DESCR,ATTRIB,NT,"if (Qv4) vmem","=Vs32",fSTOREMMVQ(EA,VsV,QvV)) \ 266887d61b2STaylor SimpsonMMVEC_EACH_EA(TAG##_nqpred,DESCR,ATTRIB,NT,"if (!Qv4) vmem","=Vs32",fSTOREMMVNQ(EA,VsV,QvV)) 267887d61b2STaylor Simpson 268887d61b2STaylor Simpson/**************************************************************** 269887d61b2STaylor Simpson* MAPPING FOR VMEMs 270887d61b2STaylor Simpson****************************************************************/ 271887d61b2STaylor Simpson 272887d61b2STaylor Simpson#define ATTR_VMEM A_EXTENSION,A_CVI,A_CVI_VM 273887d61b2STaylor Simpson#define ATTR_VMEMU A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP 274887d61b2STaylor Simpson 275887d61b2STaylor Simpson 276887d61b2STaylor SimpsonMMVEC_LD(vL32b, "Aligned Vector Load", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),) 277887d61b2STaylor SimpsonMMVEC_LDC(vL32b, "Aligned Vector Load Cur", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),) 278887d61b2STaylor SimpsonMMVEC_LDT(vL32b, "Aligned Vector Load Tmp", ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),) 279887d61b2STaylor Simpson 280887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);) 281887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);) 282887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);) 283887d61b2STaylor Simpson 284887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV)) 285887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV)) 286887d61b2STaylor Simpson 287887d61b2STaylor Simpson 
288887d61b2STaylor SimpsonMMVEC_STQ(vS32b, "Aligned Vector Store", ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),) 289887d61b2STaylor Simpson 290887d61b2STaylor SimpsonMMVEC_LDU(vL32Ub, "Unaligned Vector Load", ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),) 291887d61b2STaylor Simpson 292887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV)) 293887d61b2STaylor Simpson 294887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV)) 295887d61b2STaylor Simpson 296887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN))) 297887d61b2STaylor Simpson 2986c67d98cSMichael Tokarev// V65 store release, zero byte store 299887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0)) 300887d61b2STaylor Simpson 301887d61b2STaylor Simpson 302887d61b2STaylor Simpson 303887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN))) 304887d61b2STaylor Simpson 305887d61b2STaylor Simpson 306887d61b2STaylor Simpson/****************************************************************************************** 307887d61b2STaylor Simpson* 308887d61b2STaylor Simpson* MMVECTOR MEMORY OPERATIONS - NON TEMPORAL 309887d61b2STaylor Simpson* 310887d61b2STaylor Simpson*******************************************************************************************/ 311887d61b2STaylor Simpson 312887d61b2STaylor Simpson#define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM 313887d61b2STaylor 
Simpson 314887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV)) 315887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV)) 316887d61b2STaylor Simpson 317887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN))) 318887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN))) 319887d61b2STaylor Simpson 320887d61b2STaylor Simpson 321887d61b2STaylor SimpsonMMVEC_STQ(vS32b_nt, "Aligned Vector Store - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt") 322887d61b2STaylor Simpson 323887d61b2STaylor SimpsonMMVEC_LD(vL32b_nt, "Aligned Vector Load - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt") 324887d61b2STaylor SimpsonMMVEC_LDC(vL32b_nt, "Aligned Vector Load Cur - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt") 325887d61b2STaylor SimpsonMMVEC_LDT(vL32b_nt, "Aligned Vector Load Tmp - Non temporal", ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt") 326887d61b2STaylor Simpson 327887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 328887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 329887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load 
Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);) 330887d61b2STaylor Simpson 331887d61b2STaylor Simpson 332887d61b2STaylor Simpson#undef VEC_SCALE 333887d61b2STaylor Simpson 334887d61b2STaylor Simpson 335887d61b2STaylor Simpson/*************************************************** 336887d61b2STaylor Simpson * Vector Alignment 337887d61b2STaylor Simpson ************************************************/ 338887d61b2STaylor Simpson 339887d61b2STaylor Simpson#define VALIGNB(SHIFT) \ 340887d61b2STaylor Simpson fHIDE(int i;) \ 341887d61b2STaylor Simpson for(i = 0; i < fVBYTES(); i++) {\ 342887d61b2STaylor Simpson VdV.ub[i] = (i+SHIFT>=fVBYTES()) ? VuV.ub[i+SHIFT-fVBYTES()] : VvV.ub[i+SHIFT];\ 343887d61b2STaylor Simpson } 344887d61b2STaylor Simpson 345887d61b2STaylor SimpsonEXTINSN(V6_valignb, "Vd32=valign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control", 346887d61b2STaylor Simpson{ 347887d61b2STaylor Simpson unsigned shift = RtV & (fVBYTES()-1); 348887d61b2STaylor Simpson VALIGNB(shift) 349887d61b2STaylor Simpson}) 350887d61b2STaylor SimpsonEXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control", 351887d61b2STaylor Simpson{ 352887d61b2STaylor Simpson unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1)); 353887d61b2STaylor Simpson VALIGNB(shift) 354887d61b2STaylor Simpson}) 355887d61b2STaylor SimpsonEXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control", 356887d61b2STaylor Simpson{ 357887d61b2STaylor Simpson VALIGNB(uiV) 358887d61b2STaylor Simpson}) 359887d61b2STaylor SimpsonEXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control", 360887d61b2STaylor Simpson{ 361887d61b2STaylor Simpson unsigned shift = fVBYTES() - uiV; 362887d61b2STaylor Simpson VALIGNB(shift) 363887d61b2STaylor Simpson}) 
364887d61b2STaylor Simpson 365887d61b2STaylor SimpsonEXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 366887d61b2STaylor Simpson"Align Two vectors by Rt32 as control", 367887d61b2STaylor Simpson{ 368887d61b2STaylor Simpson fHIDE(int k;) 369887d61b2STaylor Simpson for (k=0;k<fVBYTES();k++) { 370887d61b2STaylor Simpson VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)]; 371887d61b2STaylor Simpson } 372887d61b2STaylor Simpson }) 373887d61b2STaylor Simpson 374887d61b2STaylor Simpson 375887d61b2STaylor Simpson 376887d61b2STaylor Simpson 377887d61b2STaylor Simpson 378887d61b2STaylor Simpson 379887d61b2STaylor Simpson 380887d61b2STaylor Simpson/************************************************************** 381887d61b2STaylor Simpson* Unpack elements with zero/sign extend and cross lane permute 382887d61b2STaylor Simpson***************************************************************/ 383887d61b2STaylor Simpson 384887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub, "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uh, i) = fZE8_16( VuV.ub[i])) 385887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb, "Vdd32=vunpackb(Vu32)", "Vdd32.h=vunpack(Vu32.b)", "Unpack bytes with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, h, i) = fSE8_16( VuV.b[i] )) 386887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend", fVARRAY_ELEMENT_ACCESS(VddV, uw, i) = fZE16_32(VuV.uh[i])) 387887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh, "Vdd32=vunpackh(Vu32)", "Vdd32.w=vunpack(Vu32.h)", "Unpack halves with sign-extend", fVARRAY_ELEMENT_ACCESS(VddV, w, i) = fSE16_32(VuV.h[i] )) 388887d61b2STaylor Simpson 389887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte 
to odd bytes ", fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8) 390887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves", fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16) 391887d61b2STaylor Simpson 392887d61b2STaylor Simpson 393887d61b2STaylor Simpson/************************************************************** 394887d61b2STaylor Simpson* Pack elements and cross lane permute 395887d61b2STaylor Simpson***************************************************************/ 396887d61b2STaylor Simpson 397887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb, "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)", 398887d61b2STaylor Simpson "Pack bytes", 399887d61b2STaylor Simpson VdV.ub[i] = fGETUBYTE(0, VvV.uh[i]); 400887d61b2STaylor Simpson VdV.ub[i+fVELEM(16)] = fGETUBYTE(0, VuV.uh[i])) 401887d61b2STaylor Simpson 402887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh, "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)", 403887d61b2STaylor Simpson "Pack halfwords", 404887d61b2STaylor Simpson VdV.uh[i] = fGETUHALF(0, VvV.uw[i]); 405887d61b2STaylor Simpson VdV.uh[i+fVELEM(32)] = fGETUHALF(0, VuV.uw[i])) 406887d61b2STaylor Simpson 407887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob, "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)", 408887d61b2STaylor Simpson "Pack bytes", 409887d61b2STaylor Simpson VdV.ub[i] = fGETUBYTE(1, VvV.uh[i]); 410887d61b2STaylor Simpson VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i])) 411887d61b2STaylor Simpson 412887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh, "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)", 413887d61b2STaylor Simpson "Pack halfwords", 414887d61b2STaylor Simpson VdV.uh[i] = fGETUHALF(1, VvV.uw[i]); 415887d61b2STaylor Simpson VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i])) 416887d61b2STaylor Simpson 417887d61b2STaylor 
Simpson 418887d61b2STaylor Simpson 419887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat, "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat", 420887d61b2STaylor Simpson "Pack ubytes with saturation", 421887d61b2STaylor Simpson VdV.ub[i] = fVSATUB(VvV.h[i]); 422887d61b2STaylor Simpson VdV.ub[i+fVELEM(16)] = fVSATUB(VuV.h[i])) 423887d61b2STaylor Simpson 424887d61b2STaylor Simpson 425887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat, "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat", 426887d61b2STaylor Simpson "Pack bytes with saturation", 427887d61b2STaylor Simpson VdV.b[i] = fVSATB(VvV.h[i]); 428887d61b2STaylor Simpson VdV.b[i+fVELEM(16)] = fVSATB(VuV.h[i])) 429887d61b2STaylor Simpson 430887d61b2STaylor Simpson 431887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat, "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat", 432887d61b2STaylor Simpson "Pack ubytes with saturation", 433887d61b2STaylor Simpson VdV.uh[i] = fVSATUH(VvV.w[i]); 434887d61b2STaylor Simpson VdV.uh[i+fVELEM(32)] = fVSATUH(VuV.w[i])) 435887d61b2STaylor Simpson 436887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat, "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat", 437887d61b2STaylor Simpson "Pack bytes with saturation", 438887d61b2STaylor Simpson VdV.h[i] = fVSATH(VvV.w[i]); 439887d61b2STaylor Simpson VdV.h[i+fVELEM(32)] = fVSATH(VuV.w[i])) 440887d61b2STaylor Simpson 441887d61b2STaylor Simpson 442887d61b2STaylor Simpson 443887d61b2STaylor Simpson 444887d61b2STaylor Simpson 445887d61b2STaylor Simpson/************************************************************** 446887d61b2STaylor Simpson* Zero/Sign Extend with in-lane permute 447887d61b2STaylor Simpson***************************************************************/ 448887d61b2STaylor Simpson 449887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)", 450887d61b2STaylor 
Simpson"Vector Zero Extend Bytes", 451887d61b2STaylor Simpson VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])); 452887d61b2STaylor Simpson VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i]))) 453887d61b2STaylor Simpson 454887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)", 455887d61b2STaylor Simpson"Vector Sign Extend Bytes", 456887d61b2STaylor Simpson VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i])); 457887d61b2STaylor Simpson VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i]))) 458887d61b2STaylor Simpson 459887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)", 460887d61b2STaylor Simpson"Vector Zero Extend halfwords", 461887d61b2STaylor Simpson VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])); 462887d61b2STaylor Simpson VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i]))) 463887d61b2STaylor Simpson 464887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)", 465887d61b2STaylor Simpson"Vector Sign Extend halfwords", 466887d61b2STaylor Simpson VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i])); 467887d61b2STaylor Simpson VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i]))) 468887d61b2STaylor Simpson 469887d61b2STaylor Simpson 470887d61b2STaylor Simpson/********************************************************************** 471887d61b2STaylor Simpson* 472887d61b2STaylor Simpson* 473887d61b2STaylor Simpson* 474887d61b2STaylor Simpson* MMVECTOR REDUCTION 475887d61b2STaylor Simpson* 476887d61b2STaylor Simpson* 477887d61b2STaylor Simpson* 478887d61b2STaylor Simpson**********************************************************************/ 479887d61b2STaylor Simpson 480887d61b2STaylor Simpson/******************************************** 481887d61b2STaylor Simpson* 2-WAY REDUCTION - UNSIGNED BYTE BY BYTE 482887d61b2STaylor Simpson********************************************/ 483887d61b2STaylor Simpson 484887d61b2STaylor 
Simpson 485887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)", 486887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by bytes", 487887d61b2STaylor Simpson VdV.h[i] = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV)); 488887d61b2STaylor Simpson VdV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 489887d61b2STaylor Simpson 490887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)", 491887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate", 492887d61b2STaylor Simpson VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV)); 493887d61b2STaylor Simpson VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 494887d61b2STaylor Simpson 495887d61b2STaylor Simpson 496887d61b2STaylor Simpson 497887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)", 498887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction", 499887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 500887d61b2STaylor Simpson VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV)); 501887d61b2STaylor Simpson 502887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 503887d61b2STaylor Simpson VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV))) 504887d61b2STaylor Simpson 505887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)", 506887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by bytes, and accumulate Sliding Window Reduction", 507887d61b2STaylor 
Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 508887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV)); 509887d61b2STaylor Simpson 510887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV)); 511887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV))) 512887d61b2STaylor Simpson 513887d61b2STaylor Simpson 514887d61b2STaylor Simpson 515887d61b2STaylor Simpson/******************************************** 516887d61b2STaylor Simpson* 2-WAY REDUCTION - HALF BY BYTE 517887d61b2STaylor Simpson********************************************/ 518887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)", 519887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 520887d61b2STaylor Simpson VdV.w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV)); 521887d61b2STaylor Simpson VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV))) 522887d61b2STaylor Simpson 523887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)", 524887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 525887d61b2STaylor Simpson VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV)); 526887d61b2STaylor Simpson VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV))) 527887d61b2STaylor Simpson 528887d61b2STaylor Simpson 529887d61b2STaylor Simpson 530887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)", 531887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 532887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, 
VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 533887d61b2STaylor Simpson VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV)); 534887d61b2STaylor Simpson 535887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 536887d61b2STaylor Simpson VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV))) 537887d61b2STaylor Simpson 538887d61b2STaylor Simpson 539887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)", 540887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap", 541887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 542887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV)); 543887d61b2STaylor Simpson 544887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV)); 545887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV))) 546887d61b2STaylor Simpson 547887d61b2STaylor Simpson 548887d61b2STaylor Simpson 549887d61b2STaylor Simpson 550887d61b2STaylor Simpson 551887d61b2STaylor Simpson/******************************************** 552887d61b2STaylor Simpson* 2-WAY REDUCTION - HALF BY HALF 553887d61b2STaylor Simpson********************************************/ 554887d61b2STaylor Simpson 555887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat", 556887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, sat to word", 557887d61b2STaylor Simpson fHIDE(size8s_t accum;) 558887d61b2STaylor Simpson accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i])); 559887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i])); 560887d61b2STaylor 
Simpson VdV.w[i] = fVSATW(accum)) 561887d61b2STaylor Simpson 562887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat", 563887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, sat to word", 564887d61b2STaylor Simpson fHIDE(size8s_t accum;) 565887d61b2STaylor Simpson accum = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i])); 566887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i])); 567887d61b2STaylor Simpson VxV.w[i] = fVSATW(VxV.w[i]+accum)) 568887d61b2STaylor Simpson 569887d61b2STaylor Simpson 570887d61b2STaylor Simpson/* VDMPYH */ 571887d61b2STaylor Simpson 572887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat", 573887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word", 574887d61b2STaylor Simpson fHIDE(size8s_t accum;) 575887d61b2STaylor Simpson accum = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV)); 576887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV)); 577887d61b2STaylor Simpson VdV.w[i] = fVSATW(accum)) 578887d61b2STaylor Simpson 579887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat", 580887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word", 581887d61b2STaylor Simpson fHIDE(size8s_t) accum = VxV.w[i]; 582887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV)); 583887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV)); 584887d61b2STaylor Simpson VxV.w[i] = fVSATW(accum)) 585887d61b2STaylor Simpson 586887d61b2STaylor Simpson 587887d61b2STaylor Simpson 588887d61b2STaylor Simpson 589887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat", 
590887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation", 591887d61b2STaylor Simpson fHIDE(size8s_t accum;) 592887d61b2STaylor Simpson accum = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV)); 593887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV)); 594887d61b2STaylor Simpson VdV.w[i] = fVSATW(accum)) 595887d61b2STaylor Simpson 596887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat", 597887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation", 598887d61b2STaylor Simpson fHIDE(size8s_t) accum = VxV.w[i]; 599887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV)); 600887d61b2STaylor Simpson accum += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV)); 601887d61b2STaylor Simpson VxV.w[i] = fVSATW(accum)) 602887d61b2STaylor Simpson 603887d61b2STaylor Simpson 604887d61b2STaylor Simpson 605887d61b2STaylor Simpson 606887d61b2STaylor Simpson 607887d61b2STaylor Simpson 608887d61b2STaylor Simpson 609887d61b2STaylor Simpson/* VDMPYHSU */ 610887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat", 611887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word", 612887d61b2STaylor Simpson fHIDE(size8s_t accum;) 613887d61b2STaylor Simpson accum = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV)); 614887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV)); 615887d61b2STaylor Simpson VdV.w[i] = fVSATW(accum)) 616887d61b2STaylor Simpson 617887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat", 618887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, 
saturate to word", 619887d61b2STaylor Simpson fHIDE(size8s_t) accum=VxV.w[i]; 620887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV)); 621887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV)); 622887d61b2STaylor Simpson VxV.w[i] = fVSATW(accum)) 623887d61b2STaylor Simpson 624887d61b2STaylor Simpson 625887d61b2STaylor Simpson 626887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat", 627887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation", 628887d61b2STaylor Simpson fHIDE(size8s_t accum;) 629887d61b2STaylor Simpson accum = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV)); 630887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV)); 631887d61b2STaylor Simpson VdV.w[i] = fVSATW(accum)) 632887d61b2STaylor Simpson 633887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat", 634887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation", 635887d61b2STaylor Simpson fHIDE(size8s_t) accum=VxV.w[i]; 636887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV)); 637887d61b2STaylor Simpson accum += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV)); 638887d61b2STaylor Simpson VxV.w[i] = fVSATW(accum)) 639887d61b2STaylor Simpson 640887d61b2STaylor Simpson 641887d61b2STaylor Simpson 642887d61b2STaylor Simpson/******************************************** 643887d61b2STaylor Simpson* 3-WAY REDUCTION - UNSIGNED BYTE BY BYTE 644887d61b2STaylor Simpson********************************************/ 645887d61b2STaylor Simpson 646887d61b2STaylor Simpson ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", 
"Vdd32.h=vtmpy(Vuu32.b,Rt32.b)", 647887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 648887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 649887d61b2STaylor Simpson VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV)); 650887d61b2STaylor Simpson VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]); 651887d61b2STaylor Simpson 652887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 653887d61b2STaylor Simpson VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV)); 654887d61b2STaylor Simpson VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i])) 655887d61b2STaylor Simpson 656887d61b2STaylor Simpson 657887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)", 658887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 659887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 660887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV)); 661887d61b2STaylor Simpson VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]); 662887d61b2STaylor Simpson 663887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i )%4, RtV)); 664887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV)); 665887d61b2STaylor Simpson VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i])) 666887d61b2STaylor Simpson 667887d61b2STaylor Simpson 668887d61b2STaylor Simpson 669887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)", 670887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 671887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 672887d61b2STaylor Simpson VddV.v[0].h[i] += 
fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 673887d61b2STaylor Simpson VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]); 674887d61b2STaylor Simpson 675887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 676887d61b2STaylor Simpson VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 677887d61b2STaylor Simpson VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i])) 678887d61b2STaylor Simpson 679887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)", 680887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 681887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 682887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 683887d61b2STaylor Simpson VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]); 684887d61b2STaylor Simpson 685887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i )%4, RtV)); 686887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV)); 687887d61b2STaylor Simpson VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i])) 688887d61b2STaylor Simpson 689887d61b2STaylor Simpson 690887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)", 691887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 692887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 693887d61b2STaylor Simpson VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 694887d61b2STaylor Simpson VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]); 695887d61b2STaylor Simpson 696887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 
697887d61b2STaylor Simpson VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 698887d61b2STaylor Simpson VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i])) 699887d61b2STaylor Simpson 700887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)", 701887d61b2STaylor Simpson"Dual Vector 3x1 Reduction", 702887d61b2STaylor Simpson VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 703887d61b2STaylor Simpson VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 704887d61b2STaylor Simpson VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]); 705887d61b2STaylor Simpson 706887d61b2STaylor Simpson VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV))); 707887d61b2STaylor Simpson VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV))); 708887d61b2STaylor Simpson VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i])) 709887d61b2STaylor Simpson 710887d61b2STaylor Simpson 711887d61b2STaylor Simpson/******************************************** 712887d61b2STaylor Simpson* 4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE 713887d61b2STaylor Simpson********************************************/ 714887d61b2STaylor Simpson 715887d61b2STaylor Simpson 716887d61b2STaylor Simpson 717887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)", 718887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 719887d61b2STaylor Simpson VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 720887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 721887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 722887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 723887d61b2STaylor 
Simpson 724887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)", 725887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 726887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV)); 727887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV)); 728887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV)); 729887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV))) 730887d61b2STaylor Simpson 731887d61b2STaylor Simpson 732887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)", 733887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 734887d61b2STaylor Simpson VdV.uw[i] = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 735887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 736887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 737887d61b2STaylor Simpson VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i]))) 738887d61b2STaylor Simpson 739887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)", 740887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate", 741887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i])); 742887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i])); 743887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i])); 744887d61b2STaylor Simpson VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i]))) 745887d61b2STaylor Simpson 746887d61b2STaylor 
SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)", 747887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 748887d61b2STaylor Simpson VdV.w[i] = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 749887d61b2STaylor Simpson VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 750887d61b2STaylor Simpson VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 751887d61b2STaylor Simpson VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 752887d61b2STaylor Simpson 753887d61b2STaylor Simpson 754887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)", 755887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 756887d61b2STaylor Simpson VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i])); 757887d61b2STaylor Simpson VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i])); 758887d61b2STaylor Simpson VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i])); 759887d61b2STaylor Simpson VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i]))) 760887d61b2STaylor Simpson 761887d61b2STaylor Simpson 762887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 763887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 764887d61b2STaylor Simpson VddV.v[0].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 
1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 765887d61b2STaylor Simpson VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 766887d61b2STaylor Simpson VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 767887d61b2STaylor Simpson VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 768887d61b2STaylor Simpson 769887d61b2STaylor Simpson VddV.v[1].uw[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 770887d61b2STaylor Simpson VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 771887d61b2STaylor Simpson VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 772887d61b2STaylor Simpson VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 773887d61b2STaylor Simpson 774887d61b2STaylor Simpson 775887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)", 776887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 777887d61b2STaylor Simpson VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 
1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 778887d61b2STaylor Simpson VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)); 779887d61b2STaylor Simpson VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 780887d61b2STaylor Simpson VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 781887d61b2STaylor Simpson 782887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV)); 783887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV)); 784887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV)); 785887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV))) 786887d61b2STaylor Simpson 787887d61b2STaylor Simpson 788887d61b2STaylor Simpson 789887d61b2STaylor Simpson 790887d61b2STaylor Simpson/******************************************** 791887d61b2STaylor Simpson* 4-WAY REDUCTION - UNSIGNED BYTE BY BYTE 792887d61b2STaylor Simpson********************************************/ 793887d61b2STaylor Simpson 794887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)", 795887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 796887d61b2STaylor Simpson VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 797887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 798887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 799887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 800887d61b2STaylor Simpson 801887d61b2STaylor Simpson 802887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)", 803887d61b2STaylor 
Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 804887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV)); 805887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV)); 806887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV)); 807887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV))) 808887d61b2STaylor Simpson 809887d61b2STaylor Simpson 810887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)", 811887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word", 812887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 813887d61b2STaylor Simpson VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 814887d61b2STaylor Simpson VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 815887d61b2STaylor Simpson VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 816887d61b2STaylor Simpson 817887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 818887d61b2STaylor Simpson VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 819887d61b2STaylor Simpson VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 
1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 820887d61b2STaylor Simpson VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 821887d61b2STaylor Simpson 822887d61b2STaylor Simpson 823887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)", 824887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word", 825887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 826887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)); 827887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 828887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 829887d61b2STaylor Simpson 830887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1 ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV)); 831887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1 ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV)); 832887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 
1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV)); 833887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0 ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV))) 834887d61b2STaylor Simpson 835887d61b2STaylor Simpson 836887d61b2STaylor Simpson 837887d61b2STaylor Simpson 838887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)", 839887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 840887d61b2STaylor Simpson VdV.w[i] = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 841887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 842887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 843887d61b2STaylor Simpson VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 844887d61b2STaylor Simpson 845887d61b2STaylor Simpson 846887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)", 847887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients", 848887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i])); 849887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i])); 850887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i])); 851887d61b2STaylor Simpson VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i]))) 852887d61b2STaylor Simpson 853887d61b2STaylor Simpson 854887d61b2STaylor Simpson 855887d61b2STaylor Simpson 856887d61b2STaylor Simpson 857887d61b2STaylor Simpson 858887d61b2STaylor Simpson 859887d61b2STaylor Simpson 860887d61b2STaylor Simpson 861887d61b2STaylor Simpson 862887d61b2STaylor Simpson 863887d61b2STaylor Simpson/******************************************** 864887d61b2STaylor Simpson* 2-WAY REDUCTION - SAD 865887d61b2STaylor Simpson********************************************/ 
866887d61b2STaylor Simpson 867887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)", 868887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word", 869887d61b2STaylor Simpson VddV.v[0].uw[i] = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 870887d61b2STaylor Simpson VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 871887d61b2STaylor Simpson VddV.v[1].uw[i] = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 872887d61b2STaylor Simpson VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 873887d61b2STaylor Simpson 874887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)", 875887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word", 876887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 877887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV)); 878887d61b2STaylor Simpson VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV)); 879887d61b2STaylor Simpson VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV))) 880887d61b2STaylor Simpson 881887d61b2STaylor Simpson 882887d61b2STaylor Simpson 883887d61b2STaylor Simpson 884887d61b2STaylor Simpson/******************************************** 885887d61b2STaylor Simpson* 4-WAY REDUCTION - SAD 886887d61b2STaylor Simpson********************************************/ 887887d61b2STaylor Simpson 888887d61b2STaylor Simpson 889887d61b2STaylor Simpson 890887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)", 891887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word", 892887d61b2STaylor Simpson VddV.v[0].uw[i] = fABS(fZE8_16(fGETUBYTE(0, 
VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 893887d61b2STaylor Simpson VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 894887d61b2STaylor Simpson VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 895887d61b2STaylor Simpson VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 896887d61b2STaylor Simpson 897887d61b2STaylor Simpson VddV.v[1].uw[i] = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 898887d61b2STaylor Simpson VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 899887d61b2STaylor Simpson VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 900887d61b2STaylor Simpson VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 901887d61b2STaylor Simpson 902887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)", 903887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word", 904887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 905887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))); 906887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 907887d61b2STaylor Simpson VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 908887d61b2STaylor Simpson 909887d61b2STaylor Simpson VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV))); 910887d61b2STaylor Simpson VxxV.v[1].uw[i] += 
fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1 ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV))); 911887d61b2STaylor Simpson VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV))); 912887d61b2STaylor Simpson VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0 ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)))) 913887d61b2STaylor Simpson 914887d61b2STaylor Simpson 915887d61b2STaylor Simpson 916887d61b2STaylor Simpson 917887d61b2STaylor Simpson 918887d61b2STaylor Simpson 919887d61b2STaylor Simpson 920887d61b2STaylor Simpson 921887d61b2STaylor Simpson 922887d61b2STaylor Simpson 923887d61b2STaylor Simpson/********************************************************************* 924887d61b2STaylor Simpson * MMVECTOR SHIFTING 925887d61b2STaylor Simpson * ******************************************************************/ 926887d61b2STaylor Simpson// Macro to shift arithmetically left/right and by either RT or Vv 927887d61b2STaylor Simpson 928887d61b2STaylor Simpson#define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE) \ 929887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE, "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = (VuV.TYPE[i] >> (RtV & (SIZE-1)))) \ 930887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE, "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = (VuV.TYPE[i] << (RtV & (SIZE-1)))) \ 931887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE, "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)", "Vector logical shift right " DESC, VdV.u##TYPE[i] = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \ 932887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTR(VuV.TYPE[i], 
fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 933887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left " DESC, VdV.TYPE[i] = fBIDIR_ASHIFTL(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 934887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right " DESC, VdV.u##TYPE[i] = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \ 935887d61b2STaylor Simpson 936887d61b2STaylor SimpsonV_SHIFT(w, "word", 32,5,4_4) 937887d61b2STaylor SimpsonV_SHIFT(h, "halfword", 16,4,2_2) 938887d61b2STaylor Simpson 939887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7)) 940887d61b2STaylor Simpson 941887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << (32 - (VvV.uw[i] & 0x1f))))) 942887d61b2STaylor Simpson 943887d61b2STaylor Simpson/********************************************************************* 944887d61b2STaylor Simpson * MMVECTOR SHIFT AND PERMUTE 945887d61b2STaylor Simpson * ******************************************************************/ 946887d61b2STaylor Simpson 947887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements", 948887d61b2STaylor Simpson fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32); 949887d61b2STaylor Simpson fHIDE(int64_t ) mask = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i])); 950887d61b2STaylor Simpson fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1); 
951887d61b2STaylor Simpson fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f); 952887d61b2STaylor Simpson fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count))))); 953887d61b2STaylor Simpson VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff); 954887d61b2STaylor Simpson VxxV.v[0].w[i] = (result & 0xffffffff)) 955887d61b2STaylor Simpson 956887d61b2STaylor Simpson#define NEW_NARROWING_SHIFT 1 957887d61b2STaylor Simpson 958887d61b2STaylor Simpson#if NEW_NARROWING_SHIFT 959887d61b2STaylor Simpson#define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 960887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \ 961887d61b2STaylor Simpson"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \ 962887d61b2STaylor Simpson"Vector shift right and shuffle", \ 963887d61b2STaylor Simpson fHIDE(int )shamt = RtV & SHAMTMASK; \ 964887d61b2STaylor Simpson DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \ 965887d61b2STaylor Simpson DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt))) 966887d61b2STaylor Simpson 967887d61b2STaylor Simpson 968887d61b2STaylor Simpson 969887d61b2STaylor Simpson 970887d61b2STaylor Simpson 971887d61b2STaylor Simpson/* WORD TO HALF*/ 972887d61b2STaylor Simpson 973887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF) 974887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF) 975887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF) 976887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF) 977887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF) 978887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF) 
979887d61b2STaylor Simpson 980887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF) 981887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7) 982887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7) 983887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7) 984887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7) 985887d61b2STaylor Simpson 986b2f20c2cSTaylor Simpson#define NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \ 987b2f20c2cSTaylor SimpsonITERATOR_INSN_SHIFT3_SLOT(ITERSIZE,TAG, \ 988b2f20c2cSTaylor Simpson"Vd32." #DSTTYPE "=vasr(Vuu32." #SRCTYPE ",Vv32." #SRCTYPE2 ")" #SYNOPTS, \ 989b2f20c2cSTaylor Simpson"Vector shift by vector right and shuffle", \ 990b2f20c2cSTaylor Simpson fHIDE(int )shamt = VvV.SRCTYPE2[2*i+0] & SHAMTMASK; \ 991b2f20c2cSTaylor Simpson DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[0].SRCTYPE[i],shamt) >> shamt)); \ 992b2f20c2cSTaylor Simpson shamt = VvV.SRCTYPE2[2*i+1] & SHAMTMASK; \ 993b2f20c2cSTaylor Simpson DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[1].SRCTYPE[i],shamt) >> shamt))) 994b2f20c2cSTaylor Simpson 995b2f20c2cSTaylor Simpson/* WORD TO HALF*/ 996b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(32,vasrvwuhsat,fSETHALF,uh,w,uh,:sat,fVSATUH,fVNOROUND,0xF) 997b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(32,vasrvwuhrndsat,fSETHALF,uh,w,uh,:rnd:sat,fVSATUH,fVROUND,0xF) 998b2f20c2cSTaylor Simpson/* HALF TO BYTE*/ 999b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(16,vasrvuhubsat,fSETBYTE,ub,uh,ub,:sat,fVSATUB,fVNOROUND,0x7) 1000b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(16,vasrvuhubrndsat,fSETBYTE,ub,uh,ub,:rnd:sat,fVSATUB,fVROUND,0x7) 1001b2f20c2cSTaylor Simpson 1002887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7) 
1003887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7) 1004887d61b2STaylor Simpson 1005887d61b2STaylor Simpson#else 1006887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)", 1007887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1008887d61b2STaylor Simpson fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF))); 1009887d61b2STaylor Simpson fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF)))) 1010887d61b2STaylor Simpson 1011887d61b2STaylor Simpson 1012887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat", 1013887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1014887d61b2STaylor Simpson fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF))); 1015887d61b2STaylor Simpson fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF)))) 1016887d61b2STaylor Simpson 1017887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 1018887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1019887d61b2STaylor Simpson fHIDE(int ) shamt = RtV & 0xF; 1020887d61b2STaylor Simpson fSETHALF(0,VdV.w[i], fVSATH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1021887d61b2STaylor Simpson fSETHALF(1,VdV.w[i], fVSATH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1022887d61b2STaylor Simpson 1023887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat", 1024887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1025887d61b2STaylor Simpson fHIDE(int ) shamt = RtV & 0xF; 1026887d61b2STaylor Simpson fSETHALF(0,VdV.w[i], fVSATUH( (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 
1027887d61b2STaylor Simpson fSETHALF(1,VdV.w[i], fVSATUH( (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1028887d61b2STaylor Simpson 1029887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat", 1030887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1031887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF))); 1032887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF)))) 1033887d61b2STaylor Simpson 1034887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat", 1035887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords", 1036887d61b2STaylor Simpson fHIDE(int ) shamt = RtV & 0xF; 1037887d61b2STaylor Simpson fSETHALF(0,VdV.w[i], fVSATUH( (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)); 1038887d61b2STaylor Simpson fSETHALF(1,VdV.w[i], fVSATUH( (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt))) 1039887d61b2STaylor Simpson#endif 1040887d61b2STaylor Simpson 1041887d61b2STaylor Simpson 1042887d61b2STaylor Simpson 1043887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat", 1044887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1045887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 1046887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1047887d61b2STaylor Simpson 1048887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat", 1049887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1050887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16)); 
1051887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16))) 1052887d61b2STaylor Simpson 1053887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat", 1054887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1055887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16)); 1056887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16))) 1057887d61b2STaylor Simpson 1058887d61b2STaylor Simpson 1059887d61b2STaylor Simpson 1060887d61b2STaylor Simpson 1061887d61b2STaylor Simpson 1062887d61b2STaylor Simpson/* HALF TO BYTE*/ 1063887d61b2STaylor Simpson 1064887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat", 1065887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1066887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8)); 1067887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8))) 1068887d61b2STaylor Simpson 1069887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat", 1070887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1071887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8)); 1072887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8))) 1073887d61b2STaylor Simpson 1074887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat", 1075887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords", 1076887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8)); 1077887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8))) 
1078887d61b2STaylor Simpson 1079887d61b2STaylor Simpson 1080887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)", 1081887d61b2STaylor Simpson"Vector shift add word", 1082887d61b2STaylor Simpson VxV.w[i] += (VuV.w[i] << (RtV & (32-1)))) 1083887d61b2STaylor Simpson 1084887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)", 1085887d61b2STaylor Simpson"Vector shift add word", 1086887d61b2STaylor Simpson VxV.w[i] += (VuV.w[i] >> (RtV & (32-1)))) 1087887d61b2STaylor Simpson 1088887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)", 1089887d61b2STaylor Simpson"Vector shift add halfword", 1090887d61b2STaylor Simpson VxV.h[i] += (VuV.h[i] << (RtV & (16-1)))) 1091887d61b2STaylor Simpson 1092887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)", 1093887d61b2STaylor Simpson"Vector shift add halfword", 1094887d61b2STaylor Simpson VxV.h[i] += (VuV.h[i] >> (RtV & (16-1)))) 1095887d61b2STaylor Simpson 1096887d61b2STaylor Simpson/************************************************************************** 1097887d61b2STaylor Simpson* 1098887d61b2STaylor Simpson* MMVECTOR ELEMENT-WISE ARITHMETIC 1099887d61b2STaylor Simpson* 1100887d61b2STaylor Simpson**************************************************************************/ 1101887d61b2STaylor Simpson 1102887d61b2STaylor Simpson/************************************************************************** 1103887d61b2STaylor Simpson* MACROS GO IN MACROS.DEF NOT HERE!!! 
1104887d61b2STaylor Simpson**************************************************************************/ 1105887d61b2STaylor Simpson 1106887d61b2STaylor Simpson 1107887d61b2STaylor Simpson#define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1108887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE, "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" , "Vector Absolute of Difference "DESCR, VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i])) 1109887d61b2STaylor Simpson 1110887d61b2STaylor Simpson#define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1111887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1112887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1113887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVUSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1114887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\ 1115887d61b2STaylor Simpson 1116887d61b2STaylor Simpson#define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\ 
1117887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat, "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSADDSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1118887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1119887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat, "Vd32=vsub"TYPE2"(Vu32,Vv32):sat", "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat", "Vector Add & Saturate "DESCR, VdV.DEST[i] = fVSSUBSAT(WIDTH, VuV.SRC[i], VvV.SRC[i]))\ 1120887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat", "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR, VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\ 1121887d61b2STaylor Simpson 1122887d61b2STaylor Simpson#define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1123887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGU( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1124887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGURND(WIDTH, VuV.SRC[i], VvV.SRC[i])) 1125887d61b2STaylor Simpson 1126887d61b2STaylor Simpson 1127887d61b2STaylor Simpson 1128887d61b2STaylor Simpson#define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\ 1129887d61b2STaylor 
SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE, "Vd32=vavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")", "Vector Average "DESCR, VdV.DEST[i] = fVAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1130887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd, "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd", "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd", "Vector Average % Round"DESCR, VdV.DEST[i] = fVAVGSRND( WIDTH, VuV.SRC[i], VvV.SRC[i])) \ 1131887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE, "Vd32=vnavg"TYPE2"(Vu32,Vv32)", "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")", "Vector Negative Average "DESCR, VdV.DEST[i] = fVNAVGS( WIDTH, VuV.SRC[i], VvV.SRC[i])) 1132887d61b2STaylor Simpson 1133887d61b2STaylor Simpson 1134887d61b2STaylor Simpson 1135887d61b2STaylor Simpson 1136887d61b2STaylor Simpson 1137887d61b2STaylor Simpson 1138887d61b2STaylor Simpson 1139887d61b2STaylor Simpson#define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\ 1140887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE, "Vd32=vadd"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")", "Vector Add "DESCR, VdV.DEST[i] = VuV.SRC[i] + VvV.SRC[i])\ 1141887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE, "Vd32=vsub"TYPE2"(Vu32,Vv32)" , "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")", "Vector Sub "DESCR, VdV.DEST[i] = VuV.SRC[i] - VvV.SRC[i])\ 1142887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv, "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Add "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i])\ 1143887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv, "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" , "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Sub "DESCR, VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - 
VvvV.v[1].SRC[i]) \ 1144887d61b2STaylor Simpson 1145887d61b2STaylor Simpson 1146887d61b2STaylor Simpson 1147887d61b2STaylor Simpson 1148887d61b2STaylor Simpson 1149887d61b2STaylor Simpson/* Wrapping Adds */ 1150887d61b2STaylor SimpsonMMVEC_ADDWRAP(b, "b", "Byte", 8, b, b) 1151887d61b2STaylor SimpsonMMVEC_ADDWRAP(h, "h", "Halfword", 16, h, h) 1152887d61b2STaylor SimpsonMMVEC_ADDWRAP(w, "w", "Word", 32, w, w) 1153887d61b2STaylor Simpson 1154887d61b2STaylor Simpson/* Saturating Adds */ 1155887d61b2STaylor SimpsonMMVEC_ADDU_SAT(ub, "ub", "Unsigned Byte", 8, ub, ub) 1156887d61b2STaylor SimpsonMMVEC_ADDU_SAT(uh, "uh", "Unsigned Halfword", 16, uh, uh) 1157887d61b2STaylor SimpsonMMVEC_ADDU_SAT(uw, "uw", "Unsigned word", 32, uw, uw) 1158887d61b2STaylor SimpsonMMVEC_ADDS_SAT(b, "b", "byte", 8, b, b) 1159887d61b2STaylor SimpsonMMVEC_ADDS_SAT(h, "h", "Halfword", 16, h, h) 1160887d61b2STaylor SimpsonMMVEC_ADDS_SAT(w, "w", "Word", 32, w, w) 1161887d61b2STaylor Simpson 1162887d61b2STaylor Simpson 1163887d61b2STaylor Simpson/* Averaging Instructions */ 1164887d61b2STaylor SimpsonMMVEC_AVGU(ub,"ub", "Unsigned Byte", 8, ub, ub) 1165887d61b2STaylor SimpsonMMVEC_AVGU(uh,"uh", "Unsigned Halfword", 16, uh, uh) 1166887d61b2STaylor SimpsonMMVEC_AVGU_NOV1(uw,"uw", "Unsigned Word", 32, uw, uw) 1167887d61b2STaylor SimpsonMMVEC_AVGS_NOV1(b, "b", "Byte", 8, b, b) 1168887d61b2STaylor SimpsonMMVEC_AVGS(h, "h", "Halfword", 16, h, h) 1169887d61b2STaylor SimpsonMMVEC_AVGS(w, "w", "Word", 32, w, w) 1170887d61b2STaylor Simpson 1171887d61b2STaylor Simpson 1172887d61b2STaylor Simpson/* Absolute Difference */ 1173887d61b2STaylor SimpsonMMVEC_ABSDIFF(ub,"ub", "Unsigned Byte", 8, ub, ub) 1174887d61b2STaylor SimpsonMMVEC_ABSDIFF(uh,"uh", "Unsigned Halfword", 16, uh, uh) 1175887d61b2STaylor SimpsonMMVEC_ABSDIFF(h,"h", "Halfword", 16, uh, h) 1176887d61b2STaylor SimpsonMMVEC_ABSDIFF(w,"w", "Word", 32, uw, w) 1177887d61b2STaylor Simpson 1178887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(8,vnavgub, 
"Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)", 1179887d61b2STaylor Simpson"Vector Negative Average Unsigned Byte", VdV.b[i] = fVNAVGU(8, VuV.ub[i], VvV.ub[i])) 1180887d61b2STaylor Simpson 1181887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate", 1182887d61b2STaylor SimpsonVdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4))) 1183887d61b2STaylor Simpson 1184887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1185887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4); 1186887d61b2STaylor SimpsonfSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4)))) 1187887d61b2STaylor Simpson 1188887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry", 1189887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4); 1190887d61b2STaylor SimpsonfSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4)))) 1191887d61b2STaylor Simpson 1192887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only", 1193887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+VvV.w[i]; 1194887d61b2STaylor SimpsonfSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0))) 1195887d61b2STaylor Simpson 1196887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only", 1197887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+~VvV.w[i]+1; 1198887d61b2STaylor SimpsonfSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1))) 1199887d61b2STaylor Simpson 1200887d61b2STaylor Simpson 1201887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i])) 1202887d61b2STaylor Simpson 
1203887d61b2STaylor Simpson 1204887d61b2STaylor Simpson#define MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\ 1205887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Add mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] + VvV.SRC2[i]))\ 1206887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat", "Vector Sub mixed", VdV.DEST[i] = SATF(VuV.SRC1[i] - VvV.SRC2[i]))\ 1207887d61b2STaylor Simpson 1208887d61b2STaylor SimpsonMMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b) 1209887d61b2STaylor Simpson 1210887d61b2STaylor Simpson/**************************** 1211887d61b2STaylor Simpson* WIDENING 1212887d61b2STaylor Simpson****************************/ 1213887d61b2STaylor Simpson 1214887d61b2STaylor Simpson 1215887d61b2STaylor Simpson 1216887d61b2STaylor Simpson 1217887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)", 1218887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1219887d61b2STaylor Simpson VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1220887d61b2STaylor Simpson VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1221887d61b2STaylor Simpson 1222887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)", 1223887d61b2STaylor Simpson"Vector subtraction with widen into two vectors", 1224887d61b2STaylor Simpson VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i])); 1225887d61b2STaylor Simpson VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i]))) 1226887d61b2STaylor Simpson 1227887d61b2STaylor Simpson 1228887d61b2STaylor Simpson 1229887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)", 
1230887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1231887d61b2STaylor Simpson VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]); 1232887d61b2STaylor Simpson VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1233887d61b2STaylor Simpson 1234887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)", 1235887d61b2STaylor Simpson"Vector subtraction with widen into two vectors", 1236887d61b2STaylor Simpson VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]); 1237887d61b2STaylor Simpson VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i])) 1238887d61b2STaylor Simpson 1239887d61b2STaylor Simpson 1240887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)", 1241887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1242887d61b2STaylor Simpson VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i])); 1243887d61b2STaylor Simpson VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1244887d61b2STaylor Simpson 1245887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)", 1246887d61b2STaylor Simpson"Vector subtraction with widen into two vectors", 1247887d61b2STaylor Simpson VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i])); 1248887d61b2STaylor Simpson VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i]))) 1249887d61b2STaylor Simpson 1250887d61b2STaylor Simpson 1251887d61b2STaylor Simpson 1252887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)", 1253887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1254887d61b2STaylor Simpson VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + 
fGETHALF(0, VvV.w[i]); 1255887d61b2STaylor Simpson VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i])) 1256887d61b2STaylor Simpson 1257887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)", 1258887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1259887d61b2STaylor Simpson VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]); 1260887d61b2STaylor Simpson VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i])) 1261887d61b2STaylor Simpson 1262887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)", 1263887d61b2STaylor Simpson"Vector addition with widen into two vectors", 1264887d61b2STaylor Simpson VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]); 1265887d61b2STaylor Simpson VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i])) 1266887d61b2STaylor Simpson 1267887d61b2STaylor Simpson 1268887d61b2STaylor Simpson/**************************** 1269887d61b2STaylor Simpson* Conditional 1270887d61b2STaylor Simpson****************************/ 1271887d61b2STaylor Simpson 1272887d61b2STaylor Simpson#define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \ 1273887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \ 1274887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \ 1275887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \ 1276887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") 
"LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \ 1277887d61b2STaylor Simpson 1278887d61b2STaylor SimpsonCONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i]) 1279887d61b2STaylor SimpsonCONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i]) 1280887d61b2STaylor SimpsonCONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i]) 1281887d61b2STaylor Simpson 1282887d61b2STaylor Simpson/***************************************************** 1283887d61b2STaylor Simpson ABSOLUTE VALUES 1284887d61b2STaylor Simpson*****************************************************/ 1285887d61b2STaylor Simpson// V65 1286887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb, "Vd32=vabsb(Vu32)", "Vd32.b=vabs(Vu32.b)", "Vector absolute value of bytes", VdV.b[i] = fABS(VuV.b[i])) 1287887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat, "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes", VdV.b[i] = fVSATB(fABS(fSE8_16(VuV.b[i])))) 1288887d61b2STaylor Simpson 1289887d61b2STaylor Simpson 1290887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vabsh, "Vd32=vabsh(Vu32)", "Vd32.h=vabs(Vu32.h)", "Vector absolute value of halfwords", VdV.h[i] = fABS(VuV.h[i])) 1291887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vabsh_sat, "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords", VdV.h[i] = fVSATH(fABS(fSE16_32(VuV.h[i])))) 1292887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vabsw, "Vd32=vabsw(Vu32)", "Vd32.w=vabs(Vu32.w)", "Vector absolute value of words", VdV.w[i] = fABS(VuV.w[i])) 1293887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vabsw_sat, "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words", VdV.w[i] = fVSATW(fABS(fSE32_64(VuV.w[i])))) 1294887d61b2STaylor Simpson 1295887d61b2STaylor Simpson 1296887d61b2STaylor 
Simpson/************************************************************************** 1297887d61b2STaylor Simpson * MMVECTOR MULTIPLICATIONS 1298887d61b2STaylor Simpson * ************************************************************************/ 1299887d61b2STaylor Simpson 1300887d61b2STaylor Simpson 1301887d61b2STaylor Simpson/* Byte by Byte */ 1302887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)", 1303887d61b2STaylor Simpson"Vector absolute value of words", 1304887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1305887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1306887d61b2STaylor Simpson 1307887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)", 1308887d61b2STaylor Simpson"Vector absolute value of words", 1309887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i])); 1310887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i]))) 1311887d61b2STaylor Simpson 1312887d61b2STaylor Simpson 1313887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)", 1314887d61b2STaylor Simpson"Vector absolute value of words", 1315887d61b2STaylor Simpson VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1316887d61b2STaylor Simpson VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1317887d61b2STaylor Simpson 1318887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)", 1319887d61b2STaylor Simpson"Vector absolute value of words", 1320887d61b2STaylor Simpson VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) ); 1321887d61b2STaylor Simpson VxxV.v[1].uh[i] 
+= fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) )) 1322887d61b2STaylor Simpson 1323887d61b2STaylor Simpson 1324887d61b2STaylor Simpson 1325887d61b2STaylor Simpson 1326887d61b2STaylor Simpson 1327887d61b2STaylor Simpson 1328887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)", 1329887d61b2STaylor Simpson"Vector absolute value of words", 1330887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1331887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1332887d61b2STaylor Simpson 1333887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)", 1334887d61b2STaylor Simpson"Vector absolute value of words", 1335887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i])); 1336887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i]))) 1337887d61b2STaylor Simpson 1338887d61b2STaylor Simpson 1339887d61b2STaylor Simpson 1340887d61b2STaylor Simpson 1341887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)", 1342887d61b2STaylor Simpson"Vertical Byte Multiply", 1343887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i])); 1344887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i]))) 1345887d61b2STaylor Simpson 1346887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)", 1347887d61b2STaylor Simpson"Vertical Byte Multiply", 1348887d61b2STaylor Simpson VddV.v[0].h[i] 
= fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i])); 1349887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i]))) 1350887d61b2STaylor Simpson 1351887d61b2STaylor Simpson 1352887d61b2STaylor Simpson 1353887d61b2STaylor Simpson 1354887d61b2STaylor Simpson 1355887d61b2STaylor Simpson 1356887d61b2STaylor Simpson 1357887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)", 1358887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1359887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1360887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1361887d61b2STaylor Simpson 1362887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)", 1363887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1364887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i])); 1365887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i]))) 1366887d61b2STaylor Simpson 1367887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)", 1368887d61b2STaylor Simpson"Vector by Vector Unsigned Halfword Multiply", 1369887d61b2STaylor Simpson VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1370887d61b2STaylor Simpson VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1371887d61b2STaylor Simpson 1372887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)", 1373887d61b2STaylor 
Simpson"Vector by Vector Unsigned Halfword Multiply", 1374887d61b2STaylor Simpson VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i])); 1375887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i]))) 1376887d61b2STaylor Simpson 1377887d61b2STaylor Simpson 1378887d61b2STaylor Simpson 1379887d61b2STaylor Simpson/* Vector by Vector */ 1380887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat", 1381887d61b2STaylor Simpson"Vector halfword multiply with round, shift, and sat16", 1382887d61b2STaylor Simpson VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i] )<<1)))))) 1383887d61b2STaylor Simpson 1384887d61b2STaylor Simpson 1385887d61b2STaylor Simpson 1386b2f20c2cSTaylor SimpsonITERATOR_INSN_MPY_SLOT(16,vmpyuhvs, "Vd32.uh=vmpy(Vu32.uh,Vv32.uh):>>16", 1387b2f20c2cSTaylor Simpson"Vector by Vector Unsigned Halfword Multiply with 16 bit rightshift", 1388b2f20c2cSTaylor Simpson VdV.uh[i] = fGETUHALF(1,fMPY16UU(VuV.uh[i],VvV.uh[i]))) 1389887d61b2STaylor Simpson 1390887d61b2STaylor Simpson 1391887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)", 1392887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1393887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1394887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1395887d61b2STaylor Simpson 1396887d61b2STaylor Simpson 1397887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)", 1398887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1399887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i])); 1400887d61b2STaylor Simpson VxxV.v[1].w[i] += 
fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i]))) 1401887d61b2STaylor Simpson 1402887d61b2STaylor Simpson 1403887d61b2STaylor Simpson 1404887d61b2STaylor Simpson 1405887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)", 1406887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1407887d61b2STaylor Simpson VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i])) 1408887d61b2STaylor Simpson 1409887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)", 1410887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1411887d61b2STaylor Simpson VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i])) 1412887d61b2STaylor Simpson 1413887d61b2STaylor Simpson 1414887d61b2STaylor Simpson 1415887d61b2STaylor Simpson/* 32x32 high half / frac */ 1416887d61b2STaylor Simpson 1417887d61b2STaylor Simpson 1418887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)", 1419887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1420887d61b2STaylor SimpsonVdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16) 1421887d61b2STaylor Simpson 1422887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat", 1423887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1424887d61b2STaylor SimpsonVdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1))) 1425887d61b2STaylor Simpson 1426887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat", 1427887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1428887d61b2STaylor SimpsonVdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1))) 1429887d61b2STaylor Simpson 1430887d61b2STaylor 
SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)", 1431887d61b2STaylor Simpson"Word times Halfword Multiply, 64-bit result", 1432887d61b2STaylor Simpson fHIDE(size8s_t prod;) 1433887d61b2STaylor Simpson prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i])); 1434887d61b2STaylor Simpson VddV.v[1].w[i] = prod >> 16; 1435887d61b2STaylor Simpson VddV.v[0].w[i] = prod << 16) 1436887d61b2STaylor Simpson 1437887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)", 1438887d61b2STaylor Simpson"Word times Halfword Multiply, 64-bit result", 1439887d61b2STaylor Simpson fHIDE(size8s_t prod;) 1440887d61b2STaylor Simpson prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i])) + fSE32_64(VxxV.v[1].w[i]); 1441887d61b2STaylor Simpson VxxV.v[1].w[i] = prod >> 16; 1442887d61b2STaylor Simpson fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16); 1443887d61b2STaylor Simpson fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff)) 1444887d61b2STaylor Simpson 1445887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift", 1446887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1447887d61b2STaylor SimpsonIV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1))) 1448887d61b2STaylor Simpson 1449887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift", 1450887d61b2STaylor Simpson"Vector by Vector Halfword Multiply", 1451887d61b2STaylor SimpsonIV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1))) 1452887d61b2STaylor Simpson 1453887d61b2STaylor Simpson/* For 32x32 integer / low half */ 1454887d61b2STaylor Simpson 1455887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even 
multiply for 32x32 low half", 1456887d61b2STaylor Simpson VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16) 1457887d61b2STaylor Simpson 1458887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)", 1459887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply", 1460887d61b2STaylor SimpsonIV1DEAD() VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1461887d61b2STaylor Simpson 1462887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)", 1463887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply", 1464887d61b2STaylor SimpsonIV1DEAD() VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) ) 1465887d61b2STaylor Simpson 1466887d61b2STaylor Simpson/* Add back these... */ 1467887d61b2STaylor Simpson 1468887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)", 1469887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply", 1470887d61b2STaylor SimpsonVxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) ) 1471887d61b2STaylor Simpson 1472887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)", 1473887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply", 1474887d61b2STaylor SimpsonVxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) ) 1475887d61b2STaylor Simpson 1476887d61b2STaylor Simpson 1477887d61b2STaylor Simpson 1478887d61b2STaylor Simpson 1479887d61b2STaylor Simpson 1480887d61b2STaylor Simpson 1481887d61b2STaylor Simpson 1482887d61b2STaylor Simpson/* Vector by Scalar */ 1483887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)", 1484887d61b2STaylor Simpson"Vector absolute value of words", 1485887d61b2STaylor Simpson 
VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1486887d61b2STaylor Simpson VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 1487887d61b2STaylor Simpson 1488887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)", 1489887d61b2STaylor Simpson"Vector absolute value of words", 1490887d61b2STaylor Simpson VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV)); 1491887d61b2STaylor Simpson VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV))) 1492887d61b2STaylor Simpson 1493887d61b2STaylor Simpson 1494887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)", 1495887d61b2STaylor Simpson"Vector absolute value of words", 1496887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1497887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1498887d61b2STaylor Simpson 1499887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)", 1500887d61b2STaylor Simpson"Vector absolute value of words", 1501887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV)); 1502887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV))) 1503887d61b2STaylor Simpson 1504887d61b2STaylor Simpson 1505887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)", 1506887d61b2STaylor Simpson"Vertical Byte Multiply", 1507887d61b2STaylor Simpson VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1508887d61b2STaylor Simpson VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, 
VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1509887d61b2STaylor Simpson 1510887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)", 1511887d61b2STaylor Simpson"Vertical Byte Multiply", 1512887d61b2STaylor Simpson VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV)); 1513887d61b2STaylor Simpson VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV))) 1514887d61b2STaylor Simpson 1515887d61b2STaylor Simpson// V65 1516887d61b2STaylor Simpson 1517887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)", 1518887d61b2STaylor Simpson"Vertical Byte Multiply", 1519887d61b2STaylor Simpson VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1520887d61b2STaylor Simpson VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1521887d61b2STaylor Simpson 1522887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)", 1523887d61b2STaylor Simpson"Vertical Byte Multiply", 1524887d61b2STaylor Simpson VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV)); 1525887d61b2STaylor Simpson VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV))) 1526887d61b2STaylor Simpson 1527887d61b2STaylor Simpson 1528887d61b2STaylor Simpson 1529887d61b2STaylor Simpson 1530887d61b2STaylor Simpson/* Half by Byte */ 1531887d61b2STaylor 
SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)", 1532887d61b2STaylor Simpson"Vertical Byte Multiply", 1533887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1534887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1535887d61b2STaylor Simpson 1536887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)", 1537887d61b2STaylor Simpson"Vertical Byte Multiply", 1538887d61b2STaylor Simpson VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1539887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1540887d61b2STaylor Simpson 1541887d61b2STaylor Simpson/* Half by Byte */ 1542887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)", 1543887d61b2STaylor Simpson"Vertical Byte Multiply", 1544887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1545887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1546887d61b2STaylor Simpson 1547887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)", 1548887d61b2STaylor Simpson"Vertical Byte Multiply", 1549887d61b2STaylor Simpson VxxV.v[0].w[i] += 
fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV))); 1550887d61b2STaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV)))) 1551887d61b2STaylor Simpson 1552887d61b2STaylor Simpson 1553887d61b2STaylor Simpson 1554887d61b2STaylor Simpson 1555887d61b2STaylor Simpson 1556887d61b2STaylor Simpson 1557887d61b2STaylor Simpson 1558887d61b2STaylor Simpson/* Half by Half */ 1559887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)", 1560887d61b2STaylor Simpson"Vector absolute value of words", 1561887d61b2STaylor Simpson VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1562887d61b2STaylor Simpson VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1563887d61b2STaylor Simpson 1564887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)", 1565887d61b2STaylor Simpson"Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1566887d61b2STaylor Simpson VxxV.v[0].w[i] = fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)); 1567887d61b2STaylor Simpson VxxV.v[1].w[i] = fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))) 1568887d61b2STaylor Simpson 1569887d61b2STaylor Simpson 1570887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat", 1571887d61b2STaylor Simpson"Vector even halfwords with scalar lower halfword multiply with shift and sat32", 1572887d61b2STaylor Simpson VxxV.v[0].w[i] = fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV))); 1573887d61b2STaylor Simpson VxxV.v[1].w[i] = fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, 
VuV.w[i]), fGETHALF(1, RtV)))) 1574887d61b2STaylor Simpson 1575887d61b2STaylor Simpson 1576887d61b2STaylor Simpson 1577887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat", 1578887d61b2STaylor Simpson"Vector halfword by halfword multiply, shift by 1, and take upper 16 msb", 1579887d61b2STaylor Simpson fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))); 1580887d61b2STaylor Simpson fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))); 1581887d61b2STaylor Simpson) 1582887d61b2STaylor Simpson 1583887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat", 1584887d61b2STaylor Simpson"Vector halfword with scalar halfword multiply with round, shift, and sat16", 1585887d61b2STaylor Simpson fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))))); 1586887d61b2STaylor Simpson fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))))); 1587887d61b2STaylor Simpson) 1588887d61b2STaylor Simpson 1589887d61b2STaylor Simpson 1590887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)", 1591887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar", 1592887d61b2STaylor Simpson VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1593887d61b2STaylor Simpson VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1594887d61b2STaylor Simpson 1595887d61b2STaylor Simpson 1596887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)", 1597887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar", 1598887d61b2STaylor Simpson 
VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)); 1599887d61b2STaylor Simpson VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV))) 1600887d61b2STaylor Simpson 1601887d61b2STaylor Simpson 1602887d61b2STaylor Simpson 1603887d61b2STaylor Simpson 1604887d61b2STaylor Simpson/******************************************** 1605887d61b2STaylor Simpson* HALF BY BYTE 1606887d61b2STaylor Simpson********************************************/ 1607887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)", 1608887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1609887d61b2STaylor SimpsonVdV.h[i] = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1610887d61b2STaylor Simpson 1611887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)", 1612887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1613887d61b2STaylor SimpsonVxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) )) 1614887d61b2STaylor Simpson 1615887d61b2STaylor Simpson 1616887d61b2STaylor Simpson/******************************************** 1617887d61b2STaylor Simpson* WORD BY BYTE 1618887d61b2STaylor Simpson********************************************/ 1619887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)", 1620887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1621887d61b2STaylor SimpsonVdV.w[i] = fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) )) 1622887d61b2STaylor Simpson 1623887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)", 1624887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1625887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) )) 1626887d61b2STaylor Simpson 1627887d61b2STaylor 
SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)", 1628887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1629887d61b2STaylor SimpsonVdV.w[i] = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1630887d61b2STaylor Simpson 1631887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)", 1632887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1633887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) )) 1634887d61b2STaylor Simpson 1635887d61b2STaylor Simpson 1636887d61b2STaylor Simpson/******************************************** 1637887d61b2STaylor Simpson* WORD BY HALF 1638887d61b2STaylor Simpson********************************************/ 1639887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)", 1640887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1641887d61b2STaylor SimpsonVdV.w[i] = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1642887d61b2STaylor Simpson 1643887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)", 1644887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result", 1645887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV))) 1646887d61b2STaylor Simpson 1647887d61b2STaylor Simpson 1648887d61b2STaylor Simpson 1649887d61b2STaylor Simpson 1650887d61b2STaylor Simpson 1651887d61b2STaylor Simpson 1652887d61b2STaylor Simpson 1653887d61b2STaylor Simpson 1654887d61b2STaylor Simpson 1655887d61b2STaylor Simpson 1656887d61b2STaylor Simpson 1657887d61b2STaylor Simpson 1658887d61b2STaylor Simpson 1659887d61b2STaylor Simpson 1660887d61b2STaylor Simpson 1661887d61b2STaylor Simpson 1662887d61b2STaylor Simpson 1663887d61b2STaylor Simpson 1664887d61b2STaylor Simpson 1665887d61b2STaylor 
Simpson/************************************************************************** 1666887d61b2STaylor Simpson * MMVECTOR LOGICAL OPERATIONS 1667887d61b2STaylor Simpson * ************************************************************************/ 1668887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i]) 1669887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)", "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i]) 1670887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR", VdV.uh[i] = VuV.uh[i] ^ VvV.h[i]) 1671887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)", "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i]) 1672887d61b2STaylor Simpson 1673887d61b2STaylor Simpson 1674887d61b2STaylor Simpson 1675887d61b2STaylor Simpson 1676887d61b2STaylor Simpson 1677887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt, 1678887d61b2STaylor Simpson"Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1679887d61b2STaylor Simpson VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0) 1680887d61b2STaylor Simpson 1681887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc, 1682887d61b2STaylor Simpson"Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)", "Insert Predicate into Vector", 1683887d61b2STaylor Simpson VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1684887d61b2STaylor Simpson 1685887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt, 1686887d61b2STaylor Simpson"Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1687887d61b2STaylor Simpson VdV.ub[i] = !fGETQBIT(QuV,i) ? 
fGETUBYTE(i % 4, RtV) : 0) 1688887d61b2STaylor Simpson 1689887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc, 1690887d61b2STaylor Simpson"Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)", "Insert Predicate into Vector", 1691887d61b2STaylor Simpson VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0) 1692887d61b2STaylor Simpson 1693887d61b2STaylor Simpson 1694887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt, 1695887d61b2STaylor Simpson"Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate", 1696887d61b2STaylor Simpson fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)) 1697887d61b2STaylor Simpson 1698887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc, 1699887d61b2STaylor Simpson"Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ", 1700887d61b2STaylor Simpson fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))) 1701887d61b2STaylor Simpson 1702887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes", 1703887d61b2STaylor SimpsonVdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1704887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes", 1705887d61b2STaylor SimpsonVdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0) 1706887d61b2STaylor Simpson 1707887d61b2STaylor Simpson 1708887d61b2STaylor Simpson /*************************************************** 1709887d61b2STaylor Simpson * Compare Vector with Vector 1710887d61b2STaylor Simpson ***************************************************/ 1711887d61b2STaylor Simpson#define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH) \ 1712887d61b2STaylor Simpson{ \ 1713887d61b2STaylor Simpson for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \ 1714887d61b2STaylor Simpson fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? 
MASK : 0)); \ 1715887d61b2STaylor Simpson } \ 1716887d61b2STaylor Simpson } 1717887d61b2STaylor Simpson 1718887d61b2STaylor Simpson 1719887d61b2STaylor Simpson#define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1720887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE, "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \ 1721887d61b2STaylor Simpson VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \ 1722887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \ 1723887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \ 1724887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_or, "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \ 1725887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \ 1726887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \ 1727887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH)) 1728887d61b2STaylor Simpson 1729887d61b2STaylor Simpson#define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\ 1730887d61b2STaylor SimpsonMMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \ 1731887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE, "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \ 1732887d61b2STaylor Simpson VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \ 1733887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." 
TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-and", \ 1734887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \ 1735887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_or, "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-or", \ 1736887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \ 1737887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-xor", \ 1738887d61b2STaylor Simpson VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH)) 1739887d61b2STaylor Simpson 1740887d61b2STaylor Simpson 1741887d61b2STaylor SimpsonMMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w) 1742887d61b2STaylor SimpsonMMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h) 1743887d61b2STaylor SimpsonMMVEC_CMP(b,"b","","Vector Half Compare ", fVELEM(8), 0x1, 1, b) 1744887d61b2STaylor SimpsonMMVEC_CMPGT(uw,"uw","","Vector Unsigned Half Compare ", fVELEM(32), 0xF, 4,uw) 1745887d61b2STaylor SimpsonMMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh) 1746887d61b2STaylor SimpsonMMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8), 0x1, 1,ub) 1747887d61b2STaylor Simpson 1748887d61b2STaylor Simpson/*************************************************** 1749887d61b2STaylor Simpson* Predicate Operations 1750887d61b2STaylor Simpson***************************************************/ 1751887d61b2STaylor Simpson 1752887d61b2STaylor SimpsonEXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1753887d61b2STaylor Simpson{ 1754887d61b2STaylor Simpson fHIDE(int i;) 1755887d61b2STaylor Simpson for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 
1 : 0); 1756887d61b2STaylor Simpson}) 1757887d61b2STaylor Simpson 1758887d61b2STaylor SimpsonEXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), "Set Vector Predicate ", 1759887d61b2STaylor Simpson{ 1760887d61b2STaylor Simpson fHIDE(int i;) 1761887d61b2STaylor Simpson for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0); 1762887d61b2STaylor Simpson}) 1763887d61b2STaylor Simpson 1764887d61b2STaylor Simpson 1765887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) ) 1766887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) ) 1767887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) ) 1768887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) ) 1769887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) ) 1770887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) ) 1771887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) ) 1772887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) ) 1773887d61b2STaylor Simpson 1774887d61b2STaylor Simpson 1775887d61b2STaylor Simpson 1776887d61b2STaylor SimpsonEXTINSN(V6_vcmov, "if 
(Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1777887d61b2STaylor Simpson{ 1778887d61b2STaylor Simpsonif (fLSBOLD(PsV)) { 1779887d61b2STaylor Simpson fHIDE(int i;) 1780887d61b2STaylor Simpson fVFOREACH(8, i) { 1781887d61b2STaylor Simpson VdV.ub[i] = VuV.ub[i]; 1782887d61b2STaylor Simpson } 1783887d61b2STaylor Simpson } else {CANCEL;} 1784887d61b2STaylor Simpson}) 1785887d61b2STaylor Simpson 1786887d61b2STaylor SimpsonEXTINSN(V6_vncmov, "if (!Ps4) Vd32=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), "Conditional Mov", 1787887d61b2STaylor Simpson{ 1788887d61b2STaylor Simpsonif (fLSBOLDNOT(PsV)) { 1789887d61b2STaylor Simpson fHIDE(int i;) 1790887d61b2STaylor Simpson fVFOREACH(8, i) { 1791887d61b2STaylor Simpson VdV.ub[i] = VuV.ub[i]; 1792887d61b2STaylor Simpson } 1793887d61b2STaylor Simpson } else {CANCEL;} 1794887d61b2STaylor Simpson}) 1795887d61b2STaylor Simpson 1796887d61b2STaylor SimpsonEXTINSN(V6_vccombine, "if (Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1797887d61b2STaylor Simpson{ 1798887d61b2STaylor Simpsonif (fLSBOLD(PsV)) { 1799887d61b2STaylor Simpson fHIDE(int i;) 1800887d61b2STaylor Simpson fVFOREACH(8, i) { 1801887d61b2STaylor Simpson VddV.v[0].ub[i] = VvV.ub[i]; 1802887d61b2STaylor Simpson VddV.v[1].ub[i] = VuV.ub[i]; 1803887d61b2STaylor Simpson } 1804887d61b2STaylor Simpson } else {CANCEL;} 1805887d61b2STaylor Simpson}) 1806887d61b2STaylor Simpson 1807887d61b2STaylor SimpsonEXTINSN(V6_vnccombine, "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV), "Conditional Combine", 1808887d61b2STaylor Simpson{ 1809887d61b2STaylor Simpsonif (fLSBOLDNOT(PsV)) { 1810887d61b2STaylor Simpson fHIDE(int i;) 1811887d61b2STaylor Simpson fVFOREACH(8, i) { 1812887d61b2STaylor Simpson VddV.v[0].ub[i] = VvV.ub[i]; 1813887d61b2STaylor Simpson VddV.v[1].ub[i] = VuV.ub[i]; 1814887d61b2STaylor Simpson } 1815887d61b2STaylor Simpson } else {CANCEL;} 1816887d61b2STaylor Simpson}) 
1817887d61b2STaylor Simpson 1818887d61b2STaylor Simpson 1819887d61b2STaylor Simpson 1820887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)", 1821887d61b2STaylor Simpson"Vector Select Element 8-bit", 1822887d61b2STaylor Simpson VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1823887d61b2STaylor Simpson 1824887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)", 1825887d61b2STaylor Simpson"Vector Swap Element 8-bit", 1826887d61b2STaylor Simpson VddV.v[0].ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]; 1827887d61b2STaylor Simpson VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i]) 1828887d61b2STaylor Simpson 1829887d61b2STaylor Simpson 1830887d61b2STaylor Simpson/*************************************************************************** 1831887d61b2STaylor Simpson* 1832887d61b2STaylor Simpson* MMVECTOR SORTING 1833887d61b2STaylor Simpson* 1834887d61b2STaylor Simpson****************************************************************************/ 1835887d61b2STaylor Simpson 1836887d61b2STaylor Simpson#define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\ 1837887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] : VvV.SRC[i]) \ 1838887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? 
VuV.SRC[i] : VvV.SRC[i]) 1839887d61b2STaylor Simpson 1840887d61b2STaylor SimpsonMMVEC_SORT(b,"b", "signed byte", 8, b) 1841887d61b2STaylor SimpsonMMVEC_SORT(ub,"ub", "unsigned byte", 8, ub) 1842887d61b2STaylor SimpsonMMVEC_SORT(uh,"uh", "unsigned halfword",16, uh) 1843887d61b2STaylor SimpsonMMVEC_SORT(h, "h", "halfword", 16, h) 1844887d61b2STaylor SimpsonMMVEC_SORT(w, "w", "word", 32, w) 1845887d61b2STaylor Simpson 1846887d61b2STaylor Simpson 1847887d61b2STaylor Simpson 1848887d61b2STaylor Simpson 1849887d61b2STaylor Simpson 1850887d61b2STaylor Simpson 1851887d61b2STaylor Simpson 1852887d61b2STaylor Simpson 1853887d61b2STaylor Simpson 1854887d61b2STaylor Simpson/************************************************************* 1855887d61b2STaylor Simpson* SHUFFLES 1856887d61b2STaylor Simpson****************************************************************/ 1857887d61b2STaylor Simpson 1858887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)", 1859887d61b2STaylor Simpson"Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them", 1860887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i])); 1861887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i]))) 1862887d61b2STaylor Simpson 1863887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)", 1864887d61b2STaylor Simpson"Saturate and pack 16 words to 16 halfwords, and interleave them", 1865887d61b2STaylor Simpson fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i])); 1866887d61b2STaylor Simpson fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i]))) 1867887d61b2STaylor Simpson 1868887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)", 1869887d61b2STaylor Simpson"Saturate and pack 16 words to 16 halfwords, and interleave them", 1870887d61b2STaylor Simpson fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i])); 1871887d61b2STaylor Simpson fSETHALF(1, VdV.w[i], 
fVSATUH(VuV.uw[i]))) 1872887d61b2STaylor Simpson 1873887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)", 1874887d61b2STaylor Simpson"Shuffle half words with in a lane", 1875887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i])); 1876887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i]))) 1877887d61b2STaylor Simpson 1878887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)", 1879887d61b2STaylor Simpson"Shuffle half words with in a lane", 1880887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i])); 1881887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i]))) 1882887d61b2STaylor Simpson 1883887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)", 1884887d61b2STaylor Simpson"Shuffle half words with in a lane", 1885887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i])); 1886887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fGETUHALF(0, VuV.uw[i]))) 1887887d61b2STaylor Simpson 1888887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)", 1889887d61b2STaylor Simpson"Shuffle half words with in a lane", 1890887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i])); 1891887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i]))) 1892887d61b2STaylor Simpson 1893887d61b2STaylor Simpson 1894887d61b2STaylor Simpson 1895887d61b2STaylor Simpson 1896887d61b2STaylor Simpson/************************************************************************** 1897887d61b2STaylor Simpson* Double Vector Shuffles 1898887d61b2STaylor Simpson**************************************************************************/ 1899887d61b2STaylor Simpson 1900887d61b2STaylor SimpsonEXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)", 1901887d61b2STaylor 
SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1902887d61b2STaylor Simpson"2x2->2x2 transpose, for multiple data sizes, inplace", 1903887d61b2STaylor Simpson{ 1904887d61b2STaylor Simpson fHIDE(int offset;) 1905887d61b2STaylor Simpson for (offset=1; offset<fVBYTES(); offset<<=1) { 1906887d61b2STaylor Simpson if ( RtV & offset) { 1907887d61b2STaylor Simpson fHIDE(int k;) \ 1908887d61b2STaylor Simpson fVFOREACH(8, k) {\ 1909887d61b2STaylor Simpson if (!( k & offset)) { 1910887d61b2STaylor Simpson fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1911887d61b2STaylor Simpson } 1912887d61b2STaylor Simpson } 1913887d61b2STaylor Simpson } 1914887d61b2STaylor Simpson } 1915887d61b2STaylor Simpson }) 1916887d61b2STaylor Simpson 1917887d61b2STaylor SimpsonEXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)", 1918887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1919887d61b2STaylor Simpson"2x2->2x2 transpose for multiple data sizes", 1920887d61b2STaylor Simpson{ 1921887d61b2STaylor Simpson fHIDE(int offset;) 1922887d61b2STaylor Simpson VddV.v[0] = VvV; 1923887d61b2STaylor Simpson VddV.v[1] = VuV; 1924887d61b2STaylor Simpson for (offset=1; offset<fVBYTES(); offset<<=1) { 1925887d61b2STaylor Simpson if ( RtV & offset) { 1926887d61b2STaylor Simpson fHIDE(int k;) \ 1927887d61b2STaylor Simpson fVFOREACH(8, k) {\ 1928887d61b2STaylor Simpson if (!( k & offset)) { 1929887d61b2STaylor Simpson fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1930887d61b2STaylor Simpson } 1931887d61b2STaylor Simpson } 1932887d61b2STaylor Simpson } 1933887d61b2STaylor Simpson } 1934887d61b2STaylor Simpson }) 1935887d61b2STaylor Simpson 1936887d61b2STaylor SimpsonEXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)", 1937887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1938887d61b2STaylor Simpson" vector - vector deal - or deinterleave, for multiple data sizes, inplace", 1939887d61b2STaylor Simpson{ 1940887d61b2STaylor Simpson fHIDE(int offset;) 1941887d61b2STaylor Simpson for (offset=fVBYTES()>>1; 
offset>0; offset>>=1) { 1942887d61b2STaylor Simpson if ( RtV & offset) { 1943887d61b2STaylor Simpson fHIDE(int k;) \ 1944887d61b2STaylor Simpson fVFOREACH(8, k) {\ 1945887d61b2STaylor Simpson if (!( k & offset)) { 1946887d61b2STaylor Simpson fSWAPB(VyV.ub[k], VxV.ub[k+offset]); 1947887d61b2STaylor Simpson } 1948887d61b2STaylor Simpson } 1949887d61b2STaylor Simpson } 1950887d61b2STaylor Simpson } 1951887d61b2STaylor Simpson }) 1952887d61b2STaylor Simpson 1953887d61b2STaylor SimpsonEXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)", 1954887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS), 1955887d61b2STaylor Simpson" vector - vector deal - or deinterleave, for multiple data sizes", 1956887d61b2STaylor Simpson{ 1957887d61b2STaylor Simpson fHIDE(int offset;) 1958887d61b2STaylor Simpson VddV.v[0] = VvV; 1959887d61b2STaylor Simpson VddV.v[1] = VuV; 1960887d61b2STaylor Simpson for (offset=fVBYTES()>>1; offset>0; offset>>=1) { 1961887d61b2STaylor Simpson if ( RtV & offset) { 1962887d61b2STaylor Simpson fHIDE(int k;) \ 1963887d61b2STaylor Simpson fVFOREACH(8, k) {\ 1964887d61b2STaylor Simpson if (!( k & offset)) { 1965887d61b2STaylor Simpson fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]); 1966887d61b2STaylor Simpson } 1967887d61b2STaylor Simpson } 1968887d61b2STaylor Simpson } 1969887d61b2STaylor Simpson } 1970887d61b2STaylor Simpson }) 1971887d61b2STaylor Simpson 1972887d61b2STaylor Simpson/**************************************************************************/ 1973887d61b2STaylor Simpson 1974887d61b2STaylor Simpson 1975887d61b2STaylor Simpson 1976887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)", 1977887d61b2STaylor Simpson"Vector Shuffle half words", 1978887d61b2STaylor Simpson fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i])); 1979887d61b2STaylor Simpson fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i])); 1980887d61b2STaylor Simpson fSETHALF(0, VddV.v[1].uw[i], 
fGETUHALF(1, VvV.uw[i])); 1981887d61b2STaylor Simpson fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i]))) 1982887d61b2STaylor Simpson 1983887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)", 1984887d61b2STaylor Simpson"Vector Shuffle bytes", 1985887d61b2STaylor Simpson fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i])); 1986887d61b2STaylor Simpson fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i])); 1987887d61b2STaylor Simpson fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i])); 1988887d61b2STaylor Simpson fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i]))) 1989887d61b2STaylor Simpson 1990887d61b2STaylor Simpson 1991887d61b2STaylor Simpson/*************************************************************** 1992887d61b2STaylor Simpson* Deal 1993887d61b2STaylor Simpson***************************************************************/ 1994887d61b2STaylor Simpson 1995887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)", 1996887d61b2STaylor Simpson"Deal Halfwords", 1997887d61b2STaylor Simpson VdV.uh[i ] = fGETUHALF(0, VuV.uw[i]); 1998887d61b2STaylor Simpson VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i])) 1999887d61b2STaylor Simpson 2000887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)", 2001887d61b2STaylor Simpson"Deal Halfwords", 2002887d61b2STaylor Simpson VdV.ub[i ] = fGETUBYTE(0, VuV.uh[i]); 2003887d61b2STaylor Simpson VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i])) 2004887d61b2STaylor Simpson 2005887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w, "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)", 2006887d61b2STaylor Simpson"Deal Two Vectors Bytes", 2007887d61b2STaylor Simpson VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]); 2008887d61b2STaylor Simpson VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, VvV.uw[i]); 2009887d61b2STaylor Simpson VdV.ub[2*fVELEM(32)+i] = 
fGETUBYTE(0, VuV.uw[i]); 2010887d61b2STaylor Simpson VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i])) 2011887d61b2STaylor Simpson 2012887d61b2STaylor Simpson/*************************************************************** 2013887d61b2STaylor Simpson* shuffle 2014887d61b2STaylor Simpson***************************************************************/ 2015887d61b2STaylor Simpson 2016887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)", 2017887d61b2STaylor Simpson"Deal Halfwords", 2018887d61b2STaylor Simpson fSETHALF(0, VdV.uw[i], VuV.uh[i]); 2019887d61b2STaylor Simpson fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)])) 2020887d61b2STaylor Simpson 2021887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)", 2022887d61b2STaylor Simpson"Deal Halfwords", 2023887d61b2STaylor Simpson fSETBYTE(0, VdV.uh[i], VuV.ub[i]); 2024887d61b2STaylor Simpson fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)])) 2025887d61b2STaylor Simpson 2026887d61b2STaylor Simpson 2027887d61b2STaylor Simpson 2028887d61b2STaylor Simpson 2029887d61b2STaylor Simpson 2030887d61b2STaylor Simpson/*********************************************************** 2031887d61b2STaylor Simpson* INSERT AND EXTRACT 2032887d61b2STaylor Simpson*********************************************************/ 2033887d61b2STaylor SimpsonEXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)", 2034887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY), 2035887d61b2STaylor Simpson"Extract an element from a vector to scalar", 2036887d61b2STaylor SimpsonfHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));) 2037887d61b2STaylor SimpsonRdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2]; 2038887d61b2STaylor SimpsonfHIDE(warn("RdV=0x%08x",RdV);)) 2039887d61b2STaylor Simpson 2040887d61b2STaylor SimpsonEXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)", 2041887d61b2STaylor 
SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX), 2042887d61b2STaylor Simpson"Insert Word Scalar into Vector", 2043887d61b2STaylor SimpsonVxV.uw[0] = RtV;) 2044887d61b2STaylor Simpson 2045887d61b2STaylor Simpson 2046887d61b2STaylor Simpson 2047887d61b2STaylor Simpson 20486c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar across words in vector", VdV.uw[i] = RtV) 2049887d61b2STaylor Simpson 20506c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar across halves in vector", VdV.uh[i] = RtV) 2051887d61b2STaylor Simpson 20526c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar across bytes in vector", VdV.ub[i] = RtV) 2053887d61b2STaylor Simpson 2054887d61b2STaylor Simpson 2055887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i]) 2056887d61b2STaylor Simpson 2057887d61b2STaylor Simpson 2058887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)", 2059887d61b2STaylor Simpson"Vector assign, Any two to Vector Pair", 2060887d61b2STaylor Simpson VddV.v[0].ub[i] = VvV.ub[i]; 2061887d61b2STaylor Simpson VddV.v[1].ub[i] = VuV.ub[i]) 2062887d61b2STaylor Simpson 2063887d61b2STaylor Simpson 2064887d61b2STaylor Simpson 2065887d61b2STaylor Simpson/////////////////////////////////////////////////////////////////////////// 2066887d61b2STaylor Simpson 2067b2f20c2cSTaylor SimpsonEXTINSN(V6_vcombine_tmp, "Vdd32.tmp=vcombine(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC), 2068b2f20c2cSTaylor Simpson"Vector assign tmp, Any two to Vector Pair ", 2069b2f20c2cSTaylor Simpson{ 2070b2f20c2cSTaylor Simpson fHIDE(int i;) 2071b2f20c2cSTaylor Simpson fVFOREACH(8, i) { 2072b2f20c2cSTaylor Simpson VddV.v[0].ub[i] = VvV.ub[i]; 2073b2f20c2cSTaylor Simpson VddV.v[1].ub[i] = VuV.ub[i]; 2074b2f20c2cSTaylor Simpson } 2075b2f20c2cSTaylor 
Simpson}) 2076b2f20c2cSTaylor Simpson 2077b2f20c2cSTaylor SimpsonEXTINSN(V6_vassign_tmp, "Vd32.tmp=Vu32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC), 2078b2f20c2cSTaylor Simpson"Vector assign tmp, Any two to Vector Pair ", 2079b2f20c2cSTaylor Simpson{ 2080b2f20c2cSTaylor Simpson fHIDE(int i;) 2081b2f20c2cSTaylor Simpson fVFOREACH(32, i) { 2082b2f20c2cSTaylor Simpson VdV.w[i]=VuV.w[i]; 2083b2f20c2cSTaylor Simpson } 2084b2f20c2cSTaylor Simpson}) 2085887d61b2STaylor Simpson 2086887d61b2STaylor Simpson/********************************************************* 2087887d61b2STaylor Simpson* GENERAL PERMUTE NETWORKS 2088887d61b2STaylor Simpson*********************************************************/ 2089887d61b2STaylor Simpson 2090887d61b2STaylor Simpson 2091887d61b2STaylor SimpsonEXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2092887d61b2STaylor Simpson"Reverse Benes Butterfly network ", 2093887d61b2STaylor Simpson{ 2094887d61b2STaylor Simpson fHIDE(int offset;) 2095887d61b2STaylor Simpson fHIDE(int k;) 2096887d61b2STaylor Simpson fHIDE(mmvector_t tmp;) 2097887d61b2STaylor Simpson tmp = VuV; 2098887d61b2STaylor Simpson for (offset=fVBYTES(); (offset>>=1)>0; ) { 2099887d61b2STaylor Simpson for (k = 0; k<fVBYTES(); k++) { 2100887d61b2STaylor Simpson VdV.ub[k] = (VvV.ub[k]&offset) ? 
tmp.ub[k^offset] : tmp.ub[k]; 2101887d61b2STaylor Simpson } 2102887d61b2STaylor Simpson for (k = 0; k<fVBYTES(); k++) { 2103887d61b2STaylor Simpson tmp.ub[k] = VdV.ub[k]; 2104887d61b2STaylor Simpson } 2105887d61b2STaylor Simpson } 2106887d61b2STaylor Simpson}) 2107887d61b2STaylor Simpson 2108887d61b2STaylor Simpson 2109887d61b2STaylor SimpsonEXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP), 2110887d61b2STaylor Simpson"Forward Benes Butterfly network ", 2111887d61b2STaylor Simpson{ 2112887d61b2STaylor Simpson fHIDE(int offset;) 2113887d61b2STaylor Simpson fHIDE(int k;) 2114887d61b2STaylor Simpson fHIDE(mmvector_t tmp;) 2115887d61b2STaylor Simpson tmp = VuV; 2116887d61b2STaylor Simpson for (offset=1; offset<fVBYTES(); offset<<=1){ 2117887d61b2STaylor Simpson for (k = 0; k<fVBYTES(); k++) { 2118887d61b2STaylor Simpson VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k]; 2119887d61b2STaylor Simpson } 2120887d61b2STaylor Simpson for (k = 0; k<fVBYTES(); k++) { 2121887d61b2STaylor Simpson tmp.ub[k] = VdV.ub[k]; 2122887d61b2STaylor Simpson } 2123887d61b2STaylor Simpson } 2124887d61b2STaylor Simpson}) 2125887d61b2STaylor Simpson 2126887d61b2STaylor Simpson 2127887d61b2STaylor Simpson 2128887d61b2STaylor Simpson 2129887d61b2STaylor Simpson 2130887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)", "Count Leading Zeros in Word", VdV.uw[i]=fCL1_4(~VuV.uw[i])) 2131887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)", "Count Leading Zeros in Word", VdV.uh[i]=fCL1_2(~VuV.uh[i])) 2132887d61b2STaylor Simpson 2133887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word", 2134887d61b2STaylor SimpsonVdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();)) 2135887d61b2STaylor 
SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword", 2136887d61b2STaylor SimpsonVdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();)) 2137887d61b2STaylor Simpson 2138887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)", 2139887d61b2STaylor Simpson"Count leading bits and add", 2140887d61b2STaylor SimpsonVdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i]) 2141887d61b2STaylor Simpson 2142887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)", 2143887d61b2STaylor Simpson"Count leading bits and add", 2144887d61b2STaylor SimpsonVdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i]) 2145887d61b2STaylor Simpson 2146887d61b2STaylor Simpson 2147887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)", "Count Leading Zeros in Word", VdV.uh[i]=fCOUNTONES_2(VuV.uh[i])) 2148887d61b2STaylor Simpson 2149887d61b2STaylor Simpson 2150887d61b2STaylor Simpson#define fHIST(INPUTVEC) \ 2151887d61b2STaylor Simpson fUARCH_NOTE_PUMP_4X(); \ 2152887d61b2STaylor Simpson fHIDE(int lane;) \ 2153887d61b2STaylor Simpson fHIDE(mmvector_t tmp;) \ 2154887d61b2STaylor Simpson fVFOREACH(128, lane) { \ 2155887d61b2STaylor Simpson for (fHIDE(int )i=0; i<128/8; ++i) { \ 2156887d61b2STaylor Simpson unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2157887d61b2STaylor Simpson unsigned char regno = value>>3; \ 2158887d61b2STaylor Simpson unsigned char element = value & 7; \ 2159887d61b2STaylor Simpson READ_EXT_VREG(regno,tmp,0); \ 2160887d61b2STaylor Simpson tmp.uh[(128/16)*lane+(element)]++; \ 2161887d61b2STaylor Simpson WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2162887d61b2STaylor Simpson } \ 2163887d61b2STaylor Simpson } 2164887d61b2STaylor Simpson 2165887d61b2STaylor Simpson#define fHISTQ(INPUTVEC,QVAL) \ 2166887d61b2STaylor Simpson 
fUARCH_NOTE_PUMP_4X(); \ 2167887d61b2STaylor Simpson fHIDE(int lane;) \ 2168887d61b2STaylor Simpson fHIDE(mmvector_t tmp;) \ 2169887d61b2STaylor Simpson fVFOREACH(128, lane) { \ 2170887d61b2STaylor Simpson for (fHIDE(int )i=0; i<128/8; ++i) { \ 2171887d61b2STaylor Simpson unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \ 2172887d61b2STaylor Simpson unsigned char regno = value>>3; \ 2173887d61b2STaylor Simpson unsigned char element = value & 7; \ 2174887d61b2STaylor Simpson READ_EXT_VREG(regno,tmp,0); \ 2175887d61b2STaylor Simpson if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \ 2176887d61b2STaylor Simpson WRITE_EXT_VREG(regno,tmp,EXT_NEW); \ 2177887d61b2STaylor Simpson } \ 2178887d61b2STaylor Simpson } 2179887d61b2STaylor Simpson 2180887d61b2STaylor Simpson 2181887d61b2STaylor Simpson /* V6_vhist: copies fTMPVDATA() into inputVec and applies fHIST to it (fHIST is defined outside this excerpt and is undefined again just below). */ 2182887d61b2STaylor SimpsonEXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); }) /* V6_vhistq: same as V6_vhist but applies fHISTQ with predicate QvV; the visible tail of fHISTQ increments a counter only where fGETQBIT(QVAL, ...) is set. */ 2183887d61b2STaylor SimpsonEXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); }) 2184887d61b2STaylor Simpson 2185887d61b2STaylor Simpson#undef fHIST 2186887d61b2STaylor Simpson#undef fHISTQ 2187887d61b2STaylor Simpson 2188887d61b2STaylor Simpson 2189887d61b2STaylor Simpson/* **** WEIGHTED HISTOGRAM **** */ 2190887d61b2STaylor Simpson 2191887d61b2STaylor Simpson 2192887d61b2STaylor Simpson#if 1 /* WHIST: one weighted-histogram update for element i. Byte 0 of input.h[i] is the bucket, byte 1 the weight; vindex = (bucket >> 3) & 0x1F names the extended vector register and elindex merges bits of i (outside MASK) with bits of bucket (inside MASK), both shifted right by BSHIFT. COND, when non-empty, guards only the accumulate statement (READ_EXT_VREG and WRITE_EXT_VREG still run); SATF, when non-empty, wraps the sum. NOTE(review): input and i must come from the expansion site; ITERATOR_INSN_VHISTLIKE is not visible here, confirm. */ 2193887d61b2STaylor Simpson#define WHIST(EL,MASK,BSHIFT,COND,SATF) \ 2194887d61b2STaylor Simpson fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \ 2195887d61b2STaylor Simpson fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \ 2196887d61b2STaylor Simpson fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \ 2197887d61b2STaylor Simpson fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \ 2198887d61b2STaylor Simpson fHIDE(mmvector_t tmp;)
\ 2199887d61b2STaylor Simpson READ_EXT_VREG(vindex,tmp,0); \ 2200887d61b2STaylor Simpson COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \ 2201887d61b2STaylor Simpson WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \ 2202887d61b2STaylor Simpson fUARCH_NOTE_PUMP_2X(); 2203887d61b2STaylor Simpson /* vwhist256 family: WHIST with EL=uh, MASK=7, BSHIFT=0. The q forms accumulate only when fGETQBIT(QvV,2*i) is set; the :sat forms pass fVSATUH as SATF. */ 2204887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,)) 2205887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),)) 2206887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH)) 2207887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH)) /* vwhist128 family: WHIST with EL=uw, MASK=3, BSHIFT=1. The q forms test fGETQBIT(QvV,2*i); the #u1 forms accumulate only when (bucket & 1) == uiV. */ 2208887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", WHIST(uw,3,1,,)) 2209887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),)) 2210887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),)) 2211887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),)) 2212887d61b2STaylor Simpson 2213887d61b2STaylor Simpson 2214887d61b2STaylor Simpson#endif 2215887d61b2STaylor Simpson 2216887d61b2STaylor Simpson 2217887d61b2STaylor Simpson 2218887d61b2STaylor Simpson/* ****** lookup table instructions *********** */ 2219887d61b2STaylor Simpson 2220887d61b2STaylor Simpson/* Use low bits from idx to choose next-bigger elements from vector, then use LSB from idx to
choose odd or even element */ 2221887d61b2STaylor Simpson /* vlutvvb: matchval is the low 3 bits of RtV and oddhalf is the RtV bit at position fVECLOGSIZE()-6. A Vu index byte hits when (idx & 0xE0) == (matchval << 5); on a hit VdV.b[i] receives byte oddhalf of VvV.h[idx % fVELEM(16)], otherwise 0. */ 2222887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2223887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2224887d61b2STaylor Simpsonmatchval = RtV & 0x7; 2225887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2226887d61b2STaylor Simpsonidx = VuV.ub[i]; 2227887d61b2STaylor SimpsonVdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2228887d61b2STaylor Simpson 2229887d61b2STaylor Simpson /* vlutvvb_oracc: same lookup as vlutvvb, ORed into VxV.b[i] instead of assigned. */ 2230887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup", 2231887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2232887d61b2STaylor Simpsonmatchval = RtV & 0x7; 2233887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2234887d61b2STaylor Simpsonidx = VuV.ub[i]; 2235887d61b2STaylor SimpsonVxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2236887d61b2STaylor Simpson /* vlutvwh: matchval is the low 4 bits of RtV. Bytes 0 and 1 of VuV.uh[i] are two separate indices feeding VddV.v[0].h[i] and VddV.v[1].h[i]; a hit is (idx & 0xF0) == (matchval << 4) and yields half oddhalf of VvV.w[idx % fVELEM(32)], otherwise 0. */ 2237887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2238887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2239887d61b2STaylor Simpsonmatchval = RtV & 0xF; 2240887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2241887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]); 2242887d61b2STaylor SimpsonVddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2243887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]); 2244887d61b2STaylor SimpsonVddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? 
fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2245887d61b2STaylor Simpson /* vlutvwh_oracc: OR-accumulating form of vlutvwh (into VxxV); here matchval is taken as fGETUBYTE(0,RtV) & 0xF. */ 2246887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup", 2247887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2248887d61b2STaylor Simpsonmatchval = fGETUBYTE(0,RtV) & 0xF; 2249887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2250887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]); 2251887d61b2STaylor SimpsonVxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2252887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]); 2253887d61b2STaylor SimpsonVxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2254887d61b2STaylor Simpson /* vlutvvbi: vlutvvb with the immediate uiV used in place of RtV for both matchval and oddhalf. */ 2255887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2256887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2257887d61b2STaylor Simpsonmatchval = uiV & 0x7; 2258887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2259887d61b2STaylor Simpsonidx = VuV.ub[i]; 2260887d61b2STaylor SimpsonVdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2261887d61b2STaylor Simpson 2262887d61b2STaylor Simpson /* vlutvvb_oracci: vlutvvb_oracc with the immediate uiV used in place of RtV. */ 2263887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup", 2264887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2265887d61b2STaylor Simpsonmatchval = uiV & 0x7; 2266887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2267887d61b2STaylor Simpsonidx = VuV.ub[i]; 2268887d61b2STaylor SimpsonVxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? 
fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0) 2269887d61b2STaylor Simpson /* vlutvwhi: vlutvwh with the immediate uiV used in place of RtV. */ 2270887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2271887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2272887d61b2STaylor Simpsonmatchval = uiV & 0xF; 2273887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2274887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]); 2275887d61b2STaylor SimpsonVddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2276887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]); 2277887d61b2STaylor SimpsonVddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2278887d61b2STaylor Simpson /* vlutvwh_oracci: OR-accumulating vlut16 (into VxxV) with the immediate uiV used in place of RtV. */ 2279887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup", 2280887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;) 2281887d61b2STaylor Simpsonmatchval = uiV & 0xF; 2282887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1; 2283887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]); 2284887d61b2STaylor SimpsonVxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0; 2285887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]); 2286887d61b2STaylor SimpsonVxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? 
fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0) 2287887d61b2STaylor Simpson /* vlutvvb_nm (:nomatch): no hit test; the table index is rebuilt as (idx & 0x1F) | (matchval << 5) and the selected byte is always written to VdV.b[i]. */ 2288887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup", 2289887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2290887d61b2STaylor Simpson matchval = RtV & 0x7; 2291887d61b2STaylor Simpson oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2292887d61b2STaylor Simpson idx = VuV.ub[i]; 2293887d61b2STaylor Simpson idx = (idx&0x1F) | (matchval<<5); 2294887d61b2STaylor Simpson VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)])) 2295887d61b2STaylor Simpson /* vlutvwh_nm (:nomatch): no hit test; each index is rebuilt as (idx & 0x0F) | (matchval << 4) and the selected half is always written to VddV.v[0].h[i] and VddV.v[1].h[i]. */ 2296887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup", 2297887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;) 2298887d61b2STaylor Simpson matchval = RtV & 0xF; 2299887d61b2STaylor Simpson oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1; 2300887d61b2STaylor Simpson idx = fGETUBYTE(0,VuV.uh[i]); 2301887d61b2STaylor Simpson idx = (idx&0x0F) | (matchval<<4); 2302887d61b2STaylor Simpson VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]); 2303887d61b2STaylor Simpson idx = fGETUBYTE(1,VuV.uh[i]); 2304887d61b2STaylor Simpson idx = (idx&0x0F) | (matchval<<4); 2305887d61b2STaylor Simpson VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)])) 2306887d61b2STaylor Simpson 2307887d61b2STaylor Simpson 2308887d61b2STaylor Simpson 2309887d61b2STaylor Simpson 2310887d61b2STaylor Simpson/****************************************************************************** 2311887d61b2STaylor SimpsonNON LINEAR - V65 2312887d61b2STaylor Simpson ******************************************************************************/ 2313887d61b2STaylor Simpson /* vmpahhsat: VxV.h[i] becomes fVSATH of ((fMPY16SS(VxV.h[i],VuV.h[i]) << 1) + (half of RttV selected by (VuV.h[i] >> 14) & 0x3, shifted left by 15)) >> 16. */ 2314887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation", 2315887d61b2STaylor 
Simpson VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16)) 2316887d61b2STaylor Simpson 2317887d61b2STaylor Simpson /* vmpauhuhsat: fMPY16SU product (no << 1) plus the fGETUHALF-selected half of RttV << 15, then >> 16 and fVSATH. */ 2318887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2319887d61b2STaylor Simpson VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2320887d61b2STaylor Simpson /* vmpsuhuhsat: as vmpauhuhsat, but the selected RttV half (<< 15) is subtracted from the product. */ 2321887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation", 2322887d61b2STaylor Simpson VxV.h[i]= fVSATH( ( fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16)) 2323887d61b2STaylor Simpson 2324887d61b2STaylor Simpson /* vlut4: VdV.h[i] is the half of RttV selected by (VuV.h[i] >> 14) & 0x3. */ 2325887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table", 2326887d61b2STaylor Simpson VdV.h[i]= fGETHALF( ((VuV.h[i]>>14)&0x3), RttV )) 2327887d61b2STaylor Simpson 2328887d61b2STaylor Simpson 2329887d61b2STaylor Simpson 2330887d61b2STaylor Simpson/****************************************************************************** 2331887d61b2STaylor SimpsonV65 2332887d61b2STaylor Simpson ******************************************************************************/ 2333887d61b2STaylor Simpson /* vmpyuhe: per word, fMPY16UU of half 0 of VuV.uw[i] and half 0 of RtV, stored to VdV.uw[i]. */ 2334887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)", 2335887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar", 2336887d61b2STaylor Simpson VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV))) 2337887d61b2STaylor Simpson 2338887d61b2STaylor Simpson /* vmpyuhe_acc: accumulating (+=) form of vmpyuhe, into VxV.uw[i]. */ 2339887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)", 2340887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar", 2341887d61b2STaylor Simpson VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2342887d61b2STaylor Simpson 2343887d61b2STaylor Simpson 2344887d61b2STaylor Simpson 2345887d61b2STaylor Simpson /* V6_vgathermw: word gather (element_size = 4). After fGATHER_INIT, fVLASTBYTE and fVALIGN, every fVFOREACH(32, i) lane forms EA = RtV + VvV.uw[i] and reports it through fVLOG_VTCM_GATHER_WORD; fGATHER_FINISH() ends the operation. These helper macros are defined outside this excerpt. */ 2346887d61b2STaylor SimpsonEXTINSN(V6_vgathermw, "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2347887d61b2STaylor Simpson{ 2348887d61b2STaylor Simpson fHIDE(int i;) 2349887d61b2STaylor Simpson fHIDE(int element_size = 4;) 2350887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2351887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2352887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2353887d61b2STaylor Simpson fVFOREACH(32, i) { 2354887d61b2STaylor Simpson EA = RtV+VvV.uw[i]; 2355887d61b2STaylor Simpson fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV); 2356887d61b2STaylor Simpson } 2357887d61b2STaylor Simpson fGATHER_FINISH() 2358887d61b2STaylor Simpson}) /* V6_vgathermh: halfword gather (element_size = 2); fVFOREACH(16, i) lanes with offsets VvV.uh[i], reported through fVLOG_VTCM_GATHER_HALFWORD. */ 2359887d61b2STaylor SimpsonEXTINSN(V6_vgathermh, "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2360887d61b2STaylor Simpson{ 2361887d61b2STaylor Simpson fHIDE(int i;) 2362887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2363887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2364887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2365887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2366887d61b2STaylor Simpson fVFOREACH(16, i) { 2367887d61b2STaylor Simpson EA = RtV+VvV.uh[i]; 2368887d61b2STaylor Simpson fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV); 2369887d61b2STaylor Simpson } 2370887d61b2STaylor Simpson fGATHER_FINISH() 2371887d61b2STaylor Simpson}) 2372887d61b2STaylor Simpson 2373887d61b2STaylor Simpson 2374887d61b2STaylor Simpson /* V6_vgathermhw: halfword gather (element_size = 2) whose word offsets come from both vectors of the pair VvvV (j = 0, 1); the element index passed to the log macro is 2*i+j. */ 2375887d61b2STaylor SimpsonEXTINSN(V6_vgathermhw, "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2376887d61b2STaylor Simpson{ 2377887d61b2STaylor 
Simpson fHIDE(int i;) 2378887d61b2STaylor Simpson fHIDE(int j;) 2379887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2380887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2381887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2382887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2383887d61b2STaylor Simpson fVFOREACH(32, i) { 2384887d61b2STaylor Simpson for(j = 0; j < 2; j++) { 2385887d61b2STaylor Simpson EA = RtV+VvvV.v[j].uw[i]; 2386887d61b2STaylor Simpson fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV); 2387887d61b2STaylor Simpson } 2388887d61b2STaylor Simpson } 2389887d61b2STaylor Simpson fGATHER_FINISH() 2390887d61b2STaylor Simpson}) 2391887d61b2STaylor Simpson 2392887d61b2STaylor Simpson /* V6_vgathermwq: V6_vgathermw with the predicate QsV handed to fVLOG_VTCM_GATHER_WORDQ for every lane. */ 2393887d61b2STaylor SimpsonEXTINSN(V6_vgathermwq, "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words", 2394887d61b2STaylor Simpson{ 2395887d61b2STaylor Simpson fHIDE(int i;) 2396887d61b2STaylor Simpson fHIDE(int element_size = 4;) 2397887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2398887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2399887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2400887d61b2STaylor Simpson fVFOREACH(32, i) { 2401887d61b2STaylor Simpson EA = RtV+VvV.uw[i]; 2402887d61b2STaylor Simpson fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV); 2403887d61b2STaylor Simpson } 2404887d61b2STaylor Simpson fGATHER_FINISH() 2405887d61b2STaylor Simpson}) /* V6_vgathermhq: V6_vgathermh with the predicate QsV handed to fVLOG_VTCM_GATHER_HALFWORDQ for every lane. */ 2406887d61b2STaylor SimpsonEXTINSN(V6_vgathermhq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2407887d61b2STaylor Simpson{ 2408887d61b2STaylor Simpson fHIDE(int i;) 2409887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2410887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2411887d61b2STaylor Simpson fVLASTBYTE(MuV, 
element_size); 2412887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2413887d61b2STaylor Simpson fVFOREACH(16, i) { 2414887d61b2STaylor Simpson EA = RtV+VvV.uh[i]; 2415887d61b2STaylor Simpson fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV); 2416887d61b2STaylor Simpson } 2417887d61b2STaylor Simpson fGATHER_FINISH() 2418887d61b2STaylor Simpson}) 2419887d61b2STaylor Simpson 2420887d61b2STaylor Simpson 2421887d61b2STaylor Simpson /* V6_vgathermhwq: V6_vgathermhw with the predicate QsV handed to fVLOG_VTCM_GATHER_HALFWORDQ_DV. */ 2422887d61b2STaylor SimpsonEXTINSN(V6_vgathermhwq, "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords", 2423887d61b2STaylor Simpson{ 2424887d61b2STaylor Simpson fHIDE(int i;) 2425887d61b2STaylor Simpson fHIDE(int j;) 2426887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2427887d61b2STaylor Simpson fHIDE(fGATHER_INIT( RtV, MuV, element_size);) 2428887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2429887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2430887d61b2STaylor Simpson fVFOREACH(32, i) { 2431887d61b2STaylor Simpson for(j = 0; j < 2; j++) { 2432887d61b2STaylor Simpson EA = RtV+VvvV.v[j].uw[i]; 2433887d61b2STaylor Simpson fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV); 2434887d61b2STaylor Simpson } 2435887d61b2STaylor Simpson } 2436887d61b2STaylor Simpson fGATHER_FINISH() 2437887d61b2STaylor Simpson}) 2438887d61b2STaylor Simpson 2439887d61b2STaylor Simpson 2440887d61b2STaylor Simpson /* V6_vscattermw: word scatter (element_size = 4). After fSCATTER_INIT, fVLASTBYTE and fVALIGN, every fVFOREACH(32, i) lane forms EA = RtV + VvV.uw[i] and reports the store of VwV through fVLOG_VTCM_WORD; fSCATTER_FINISH(0) ends the operation. */ 2441887d61b2STaylor SimpsonEXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2442887d61b2STaylor Simpson{ 2443887d61b2STaylor Simpson fHIDE(int i;) 2444887d61b2STaylor Simpson fHIDE(int element_size = 4;) 2445887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2446887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2447887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2448887d61b2STaylor Simpson 
fVFOREACH(32, i) { 2449887d61b2STaylor Simpson EA = RtV+VvV.uw[i]; 2450887d61b2STaylor Simpson fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV); 2451887d61b2STaylor Simpson } 2452887d61b2STaylor Simpson fSCATTER_FINISH(0) 2453887d61b2STaylor Simpson}) 2454887d61b2STaylor Simpson 2455887d61b2STaylor Simpson 2456887d61b2STaylor Simpson /* V6_vscattermh: halfword scatter (element_size = 2); fVFOREACH(16, i) lanes with offsets VvV.uh[i], reported through fVLOG_VTCM_HALFWORD. */ 2457887d61b2STaylor SimpsonEXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords", 2458887d61b2STaylor Simpson{ 2459887d61b2STaylor Simpson fHIDE(int i;) 2460887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2461887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2462887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2463887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2464887d61b2STaylor Simpson fVFOREACH(16, i) { 2465887d61b2STaylor Simpson EA = RtV+VvV.uh[i]; 2466887d61b2STaylor Simpson fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV); 2467887d61b2STaylor Simpson } 2468887d61b2STaylor Simpson fSCATTER_FINISH(0) 2469887d61b2STaylor Simpson}) 2470887d61b2STaylor Simpson 2471887d61b2STaylor Simpson /* V6_vscattermw_add: word scatter-add. The offset passes through fVALIGN(..., ALIGNMENT) with ALIGNMENT = 4 before being added to RtV; fLOG_SCATTER_OP(4) is issued after the loop and the operation ends with fSCATTER_FINISH(1). */ 2472887d61b2STaylor SimpsonEXTINSN(V6_vscattermw_add, "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add", 2473887d61b2STaylor Simpson{ 2474887d61b2STaylor Simpson fHIDE(int i;) 2475887d61b2STaylor Simpson fHIDE(int ALIGNMENT=4;) 2476887d61b2STaylor Simpson fHIDE(int element_size = 4;) 2477887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2478887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2479887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2480887d61b2STaylor Simpson fVFOREACH(32, i) { 2481887d61b2STaylor Simpson EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT)); 2482887d61b2STaylor Simpson fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV); 2483887d61b2STaylor Simpson } 2484887d61b2STaylor Simpson fHIDE(fLOG_SCATTER_OP(4);) 
2485887d61b2STaylor Simpson fSCATTER_FINISH(1) 2486887d61b2STaylor Simpson}) 2487887d61b2STaylor Simpson /* V6_vscattermh_add: halfword scatter-add; same shape as V6_vscattermw_add with ALIGNMENT = 2, element_size = 2 and fLOG_SCATTER_OP(2). */ 2488887d61b2STaylor SimpsonEXTINSN(V6_vscattermh_add, "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add", 2489887d61b2STaylor Simpson{ 2490887d61b2STaylor Simpson fHIDE(int i;) 2491887d61b2STaylor Simpson fHIDE(int ALIGNMENT=2;) 2492887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2493887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2494887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2495887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2496887d61b2STaylor Simpson fVFOREACH(16, i) { 2497887d61b2STaylor Simpson EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT)); 2498887d61b2STaylor Simpson fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV); 2499887d61b2STaylor Simpson } 2500887d61b2STaylor Simpson fHIDE(fLOG_SCATTER_OP(2);) 2501887d61b2STaylor Simpson fSCATTER_FINISH(1) 2502887d61b2STaylor Simpson}) 2503887d61b2STaylor Simpson 2504887d61b2STaylor Simpson /* V6_vscattermwq: V6_vscattermw with the predicate QsV handed to fVLOG_VTCM_WORDQ for every lane. */ 2505887d61b2STaylor SimpsonEXTINSN(V6_vscattermwq, "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional", 2506887d61b2STaylor Simpson{ 2507887d61b2STaylor Simpson fHIDE(int i;) 2508887d61b2STaylor Simpson fHIDE(int element_size = 4;) 2509887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2510887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2511887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2512887d61b2STaylor Simpson fVFOREACH(32, i) { 2513887d61b2STaylor Simpson EA = RtV+VvV.uw[i]; 2514887d61b2STaylor Simpson fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV); 2515887d61b2STaylor Simpson } 2516887d61b2STaylor Simpson fSCATTER_FINISH(0) 2517887d61b2STaylor Simpson}) 2518887d61b2STaylor Simpson /* V6_vscattermhq: V6_vscattermh with the predicate QsV handed to fVLOG_VTCM_HALFWORDQ for every lane. */ 2519887d61b2STaylor SimpsonEXTINSN(V6_vscattermhq, "if (Qs4) 
vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional", 2520887d61b2STaylor Simpson{ 2521887d61b2STaylor Simpson fHIDE(int i;) 2522887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2523887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2524887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2525887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2526887d61b2STaylor Simpson fVFOREACH(16, i) { 2527887d61b2STaylor Simpson EA = RtV+VvV.uh[i]; 2528887d61b2STaylor Simpson fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV); 2529887d61b2STaylor Simpson } 2530887d61b2STaylor Simpson fSCATTER_FINISH(0) 2531887d61b2STaylor Simpson}) 2532887d61b2STaylor Simpson 2533887d61b2STaylor Simpson 2534887d61b2STaylor Simpson 2535887d61b2STaylor Simpson /* V6_vscattermhw: halfword scatter (element_size = 2) whose word offsets come from both vectors of the pair VvvV (j = 0, 1), element index 2*i+j. NOTE(review): the description string reads Scatter Words although element_size is 2 -- confirm whether that is intended. */ 2536887d61b2STaylor SimpsonEXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter Words", 2537887d61b2STaylor Simpson{ 2538887d61b2STaylor Simpson fHIDE(int i;) 2539887d61b2STaylor Simpson fHIDE(int j;) 2540887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2541887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2542887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2543887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2544887d61b2STaylor Simpson fVFOREACH(32, i) { 2545887d61b2STaylor Simpson for(j = 0; j < 2; j++) { 2546887d61b2STaylor Simpson EA = RtV+VvvV.v[j].uw[i]; 2547887d61b2STaylor Simpson fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV); 2548887d61b2STaylor Simpson } 2549887d61b2STaylor Simpson } 2550887d61b2STaylor Simpson fSCATTER_FINISH(0) 2551887d61b2STaylor Simpson}) 2552887d61b2STaylor Simpson 2553887d61b2STaylor Simpson /* v6mpyvubs10_vxx: accumulating v6mpy, :v form. c00..c02 and c10..c12 are filled by fGET10BIT from VvvV.v[0].uw[i] and VvvV.v[1].uw[i] with field numbers 0..2 (macro not visible here; presumably 10-bit coefficients -- confirm). uiV (0..3) selects which VuuV bytes are paired with which coefficients; every product is fMPY16US and is added into VxxV.v[0].w[i] or VxxV.v[1].w[i]. */ 2554f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyvubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "", 2555f128c0feSTaylor Simpson 
fHIDE(size2s_t c00;) 2556f128c0feSTaylor Simpson fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2557f128c0feSTaylor Simpson fHIDE(size2s_t c01;) 2558f128c0feSTaylor Simpson fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2559f128c0feSTaylor Simpson fHIDE(size2s_t c02;) 2560f128c0feSTaylor Simpson fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2561f128c0feSTaylor Simpson 2562f128c0feSTaylor Simpson fHIDE(size2s_t c10;) 2563f128c0feSTaylor Simpson fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2564f128c0feSTaylor Simpson fHIDE(size2s_t c11;) 2565f128c0feSTaylor Simpson fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2566f128c0feSTaylor Simpson fHIDE(size2s_t c12;) 2567f128c0feSTaylor Simpson fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2568f128c0feSTaylor Simpson 2569f128c0feSTaylor Simpson if (uiV == 0) { 2570f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2571f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2572f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2573f128c0feSTaylor Simpson 2574f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2575f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2576f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2577f128c0feSTaylor Simpson 2578f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10); 2579f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2580f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12); 2581f128c0feSTaylor Simpson 2582f128c0feSTaylor Simpson } else if (uiV == 1) { 2583f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00); 2584f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01); 2585f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02); 2586f128c0feSTaylor Simpson 
2587f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2588f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2589f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2590f128c0feSTaylor Simpson 2591f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2592f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2593f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2594f128c0feSTaylor Simpson 2595f128c0feSTaylor Simpson } else if (uiV == 2) { 2596f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2597f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2598f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2599f128c0feSTaylor Simpson 2600f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2601f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2602f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2603f128c0feSTaylor Simpson 2604f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10); 2605f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11); 2606f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12); 2607f128c0feSTaylor Simpson 2608f128c0feSTaylor Simpson } else if (uiV == 3) { 2609f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00); 2610f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2611f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02); 2612f128c0feSTaylor Simpson 2613f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 
2614f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2615f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2616f128c0feSTaylor Simpson 2617f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2618f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2619f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2620f128c0feSTaylor Simpson } 2621f128c0feSTaylor Simpson) /* v6mpyhubs10_vxx: accumulating v6mpy, :h form. Same structure as the :v form (six fGET10BIT coefficients, fMPY16US products added into VxxV.v[0] / VxxV.v[1]) with a different byte-to-coefficient pairing for each uiV value. */ 2622f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyhubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "", 2623f128c0feSTaylor Simpson fHIDE(size2s_t c00;) 2624f128c0feSTaylor Simpson fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2625f128c0feSTaylor Simpson fHIDE(size2s_t c01;) 2626f128c0feSTaylor Simpson fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2627f128c0feSTaylor Simpson fHIDE(size2s_t c02;) 2628f128c0feSTaylor Simpson fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2629f128c0feSTaylor Simpson fHIDE(size2s_t c10;) 2630f128c0feSTaylor Simpson fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2631f128c0feSTaylor Simpson fHIDE(size2s_t c11;) 2632f128c0feSTaylor Simpson fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2633f128c0feSTaylor Simpson fHIDE(size2s_t c12;) 2634f128c0feSTaylor Simpson fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2635f128c0feSTaylor Simpson 2636f128c0feSTaylor Simpson if (uiV == 0) { 2637f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2638f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2639f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2640f128c0feSTaylor Simpson 2641f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2642f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2643f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 
2644f128c0feSTaylor Simpson 2645f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10); 2646f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2647f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12); 2648f128c0feSTaylor Simpson 2649f128c0feSTaylor Simpson } else if (uiV == 1) { 2650f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00); 2651f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01); 2652f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02); 2653f128c0feSTaylor Simpson 2654f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2655f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2656f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2657f128c0feSTaylor Simpson 2658f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2659f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2660f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2661f128c0feSTaylor Simpson 2662f128c0feSTaylor Simpson } else if (uiV == 2) { 2663f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2664f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2665f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2666f128c0feSTaylor Simpson 2667f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2668f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2669f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2670f128c0feSTaylor Simpson 2671f128c0feSTaylor Simpson VxxV.v[0].w[i] += 
fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10); 2672f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11); 2673f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12); 2674f128c0feSTaylor Simpson 2675f128c0feSTaylor Simpson } else if (uiV == 3) { 2676f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00); 2677f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2678f128c0feSTaylor Simpson VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02); 2679f128c0feSTaylor Simpson 2680f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2681f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2682f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2683f128c0feSTaylor Simpson 2684f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2685f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2686f128c0feSTaylor Simpson VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2687f128c0feSTaylor Simpson } 2688f128c0feSTaylor Simpson) 2689f128c0feSTaylor Simpson 2690f128c0feSTaylor Simpson 2691f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyvubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "", 2692f128c0feSTaylor Simpson fHIDE(short c00;) 2693f128c0feSTaylor Simpson fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2694f128c0feSTaylor Simpson fHIDE(short c01;) 2695f128c0feSTaylor Simpson fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2696f128c0feSTaylor Simpson fHIDE(short c02;) 2697f128c0feSTaylor Simpson fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2698f128c0feSTaylor Simpson fHIDE(short c10;) 2699f128c0feSTaylor Simpson fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2700f128c0feSTaylor Simpson fHIDE(short c11;) 2701f128c0feSTaylor Simpson fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2702f128c0feSTaylor Simpson
fHIDE(short c12;) 2703f128c0feSTaylor Simpson fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2704f128c0feSTaylor Simpson 2705f128c0feSTaylor Simpson 2706f128c0feSTaylor Simpson 2707f128c0feSTaylor Simpson if (uiV == 0) { 2708f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2709f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2710f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2711f128c0feSTaylor Simpson 2712f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2713f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2714f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2715f128c0feSTaylor Simpson 2716f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10); 2717f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2718f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12); 2719f128c0feSTaylor Simpson 2720f128c0feSTaylor Simpson } else if (uiV == 1) { 2721f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00); 2722f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01); 2723f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02); 2724f128c0feSTaylor Simpson 2725f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10); 2726f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11); 2727f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12); 2728f128c0feSTaylor Simpson 2729f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00); 2730f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2731f128c0feSTaylor Simpson VddV.v[0].w[i] += 
fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02); 2732f128c0feSTaylor Simpson 2733f128c0feSTaylor Simpson } else if (uiV == 2) { 2734f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2735f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2736f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2737f128c0feSTaylor Simpson 2738f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2739f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2740f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2741f128c0feSTaylor Simpson 2742f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10); 2743f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11); 2744f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12); 2745f128c0feSTaylor Simpson 2746f128c0feSTaylor Simpson } else if (uiV == 3) { 2747f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00); 2748f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2749f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02); 2750f128c0feSTaylor Simpson 2751f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10); 2752f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2753f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12); 2754f128c0feSTaylor Simpson 2755f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00); 2756f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01); 2757f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02); 2758f128c0feSTaylor Simpson } 2759f128c0feSTaylor 
Simpson) 2760f128c0feSTaylor Simpson 2761f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyhubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "", 2762f128c0feSTaylor Simpson fHIDE(short c00;) 2763f128c0feSTaylor Simpson fGET10BIT(c00, VvvV.v[0].uw[i], 0) 2764f128c0feSTaylor Simpson fHIDE(short c01;) 2765f128c0feSTaylor Simpson fGET10BIT(c01, VvvV.v[0].uw[i], 1) 2766f128c0feSTaylor Simpson fHIDE(short c02;) 2767f128c0feSTaylor Simpson fGET10BIT(c02, VvvV.v[0].uw[i], 2) 2768f128c0feSTaylor Simpson fHIDE(short c10;) 2769f128c0feSTaylor Simpson fGET10BIT(c10, VvvV.v[1].uw[i], 0) 2770f128c0feSTaylor Simpson fHIDE(short c11;) 2771f128c0feSTaylor Simpson fGET10BIT(c11, VvvV.v[1].uw[i], 1) 2772f128c0feSTaylor Simpson fHIDE(short c12;) 2773f128c0feSTaylor Simpson fGET10BIT(c12, VvvV.v[1].uw[i], 2) 2774f128c0feSTaylor Simpson 2775f128c0feSTaylor Simpson if (uiV == 0) { 2776f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2777f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2778f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2779f128c0feSTaylor Simpson 2780f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2781f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2782f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2783f128c0feSTaylor Simpson 2784f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10); 2785f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11); 2786f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12); 2787f128c0feSTaylor Simpson 2788f128c0feSTaylor Simpson } else if (uiV == 1) { 2789f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00); 2790f128c0feSTaylor Simpson VddV.v[1].w[i] += 
fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01); 2791f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02); 2792f128c0feSTaylor Simpson 2793f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10); 2794f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11); 2795f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12); 2796f128c0feSTaylor Simpson 2797f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00); 2798f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01); 2799f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02); 2800f128c0feSTaylor Simpson 2801f128c0feSTaylor Simpson } else if (uiV == 2) { 2802f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2803f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2804f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2805f128c0feSTaylor Simpson 2806f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2807f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2808f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2809f128c0feSTaylor Simpson 2810f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10); 2811f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11); 2812f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12); 2813f128c0feSTaylor Simpson 2814f128c0feSTaylor Simpson } else if (uiV == 3) { 2815f128c0feSTaylor Simpson VddV.v[1].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00); 2816f128c0feSTaylor Simpson VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01); 2817f128c0feSTaylor Simpson VddV.v[1].w[i] += 
fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02); 2818f128c0feSTaylor Simpson 2819f128c0feSTaylor Simpson VddV.v[0].w[i] = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10); 2820f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11); 2821f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12); 2822f128c0feSTaylor Simpson 2823f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00); 2824f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01); 2825f128c0feSTaylor Simpson VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02); 2826f128c0feSTaylor Simpson } 2827f128c0feSTaylor Simpson) 2828f128c0feSTaylor Simpson 2829887d61b2STaylor Simpson 2830887d61b2STaylor SimpsonEXTINSN(V6_vscattermhwq, "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional", 2831887d61b2STaylor Simpson{ 2832887d61b2STaylor Simpson fHIDE(int i;) 2833887d61b2STaylor Simpson fHIDE(int j;) 2834887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2835887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2836887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2837887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2838887d61b2STaylor Simpson fVFOREACH(32, i) { 2839887d61b2STaylor Simpson for(j = 0; j < 2; j++) { 2840887d61b2STaylor Simpson EA = RtV+VvvV.v[j].uw[i]; 2841887d61b2STaylor Simpson fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV); 2842887d61b2STaylor Simpson } 2843887d61b2STaylor Simpson } 2844887d61b2STaylor Simpson fSCATTER_FINISH(0) 2845887d61b2STaylor Simpson}) 2846887d61b2STaylor Simpson 2847887d61b2STaylor SimpsonEXTINSN(V6_vscattermhw_add, "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add", 2848887d61b2STaylor Simpson{ 2849887d61b2STaylor Simpson fHIDE(int 
i;) 2850887d61b2STaylor Simpson fHIDE(int j;) 2851887d61b2STaylor Simpson fHIDE(int ALIGNMENT=2;) 2852887d61b2STaylor Simpson fHIDE(int element_size = 2;) 2853887d61b2STaylor Simpson fHIDE(fSCATTER_INIT( RtV, MuV, element_size);) 2854887d61b2STaylor Simpson fVLASTBYTE(MuV, element_size); 2855887d61b2STaylor Simpson fVALIGN(RtV, element_size); 2856887d61b2STaylor Simpson fVFOREACH(32, i) { 2857887d61b2STaylor Simpson for(j = 0; j < 2; j++) { 2858*29ea1946SZhao Liu EA = RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT); 2859887d61b2STaylor Simpson fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV); 2860887d61b2STaylor Simpson } 2861887d61b2STaylor Simpson } 2862887d61b2STaylor Simpson fHIDE(fLOG_SCATTER_OP(2);) 2863887d61b2STaylor Simpson fSCATTER_FINISH(1) 2864887d61b2STaylor Simpson}) 2865887d61b2STaylor Simpson 2866887d61b2STaylor SimpsonEXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into byte", 2867887d61b2STaylor Simpson{ 2868887d61b2STaylor Simpson fHIDE(int i;) 2869887d61b2STaylor Simpson fHIDE(size1u_t acc = 0;) 2870887d61b2STaylor Simpson fVFOREACH(8, i) { 2871887d61b2STaylor Simpson acc += fGETQBIT(QvV,i); 2872887d61b2STaylor Simpson VdV.ub[i] = acc; 2873887d61b2STaylor Simpson } 2874887d61b2STaylor Simpson } ) 2875887d61b2STaylor SimpsonEXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum of Q into halfwords", 2876887d61b2STaylor Simpson{ 2877887d61b2STaylor Simpson fHIDE(int i;) 2878887d61b2STaylor Simpson fHIDE(size2u_t acc = 0;) 2879887d61b2STaylor Simpson fVFOREACH(16, i) { 2880887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*2+0); 2881887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*2+1); 2882887d61b2STaylor Simpson VdV.uh[i] = acc; 2883887d61b2STaylor Simpson } 2884887d61b2STaylor Simpson } ) 2885887d61b2STaylor SimpsonEXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS), "parallel prefix sum 
of Q into words", 2886887d61b2STaylor Simpson{ 2887887d61b2STaylor Simpson fHIDE(int i;) 2888887d61b2STaylor Simpson fHIDE(size4u_t acc = 0;) 2889887d61b2STaylor Simpson fVFOREACH(32, i) { 2890887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*4+0); 2891887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*4+1); 2892887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*4+2); 2893887d61b2STaylor Simpson acc += fGETQBIT(QvV,i*4+3); 2894887d61b2STaylor Simpson VdV.uw[i] = acc; 2895887d61b2STaylor Simpson } 2896887d61b2STaylor Simpson } ) 2897887d61b2STaylor Simpson 2898887d61b2STaylor Simpson 2899887d61b2STaylor Simpson 2900887d61b2STaylor Simpson 2901887d61b2STaylor Simpson 2902887d61b2STaylor Simpson/****************************************************************************** 2903887d61b2STaylor Simpson DEBUG Vector/Register Printing 2904887d61b2STaylor Simpson ******************************************************************************/ 2905887d61b2STaylor Simpson 2906887d61b2STaylor Simpson#define PRINT_VU(TYPE, TYPE2, COUNT)\ 2907887d61b2STaylor Simpson int i; \ 2908887d61b2STaylor Simpson size4u_t vec_len = fVBYTES();\ 2909887d61b2STaylor Simpson fprintf(stdout,"V%2d: ",VuN); \ 2910887d61b2STaylor Simpson for (i=0;i<vec_len>>COUNT;i++) { \ 2911887d61b2STaylor Simpson fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \ 2912887d61b2STaylor Simpson }; \ 2913887d61b2STaylor Simpson fprintf(stdout,"\\n"); \ 2914887d61b2STaylor Simpson fflush(stdout);\ 2915887d61b2STaylor Simpson 2916887d61b2STaylor Simpson#undef ATTR_VMEM 2917887d61b2STaylor Simpson#undef ATTR_VMEMU 2918887d61b2STaylor Simpson#undef ATTR_VMEM_NT 2919887d61b2STaylor Simpson 2920887d61b2STaylor Simpson#endif /* NO_MMVEC */ 2921887d61b2STaylor Simpson 2922887d61b2STaylor Simpson#ifdef __SELF_DEF_EXTINSN 2923887d61b2STaylor Simpson#undef EXTINSN 2924887d61b2STaylor Simpson#undef __SELF_DEF_EXTINSN 2925887d61b2STaylor Simpson#endif 2926