1887d61b2STaylor Simpson/*
2f128c0feSTaylor Simpson *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
3887d61b2STaylor Simpson *
4887d61b2STaylor Simpson *  This program is free software; you can redistribute it and/or modify
5887d61b2STaylor Simpson *  it under the terms of the GNU General Public License as published by
6887d61b2STaylor Simpson *  the Free Software Foundation; either version 2 of the License, or
7887d61b2STaylor Simpson *  (at your option) any later version.
8887d61b2STaylor Simpson *
9887d61b2STaylor Simpson *  This program is distributed in the hope that it will be useful,
10887d61b2STaylor Simpson *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11887d61b2STaylor Simpson *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12887d61b2STaylor Simpson *  GNU General Public License for more details.
13887d61b2STaylor Simpson *
14887d61b2STaylor Simpson *  You should have received a copy of the GNU General Public License
15887d61b2STaylor Simpson *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16887d61b2STaylor Simpson */
17887d61b2STaylor Simpson
18887d61b2STaylor Simpson/******************************************************************************
19887d61b2STaylor Simpson *
206c67d98cSMichael Tokarev *     HOYA: MULTI MEDIA INSTRUCTIONS
21887d61b2STaylor Simpson *
22887d61b2STaylor Simpson ******************************************************************************/
23887d61b2STaylor Simpson
/*
 * When the including file has not provided EXTINSN, alias it to Q6INSN and
 * define __SELF_DEF_EXTINSN alongside it to mark that this file did so.
 */
24887d61b2STaylor Simpson#ifndef EXTINSN
25887d61b2STaylor Simpson#define EXTINSN Q6INSN
26887d61b2STaylor Simpson#define __SELF_DEF_EXTINSN 1
27887d61b2STaylor Simpson#endif
28887d61b2STaylor Simpson
29887d61b2STaylor Simpson#ifndef NO_MMVEC
30887d61b2STaylor Simpson
/*
 * Wrap CODE in a braced block: the loop index i is declared inside fHIDE()
 * and CODE (followed by ';') becomes the body of fVFOREACH(WIDTH, i).
 */
31887d61b2STaylor Simpson#define DO_FOR_EACH_CODE(WIDTH, CODE) \
32887d61b2STaylor Simpson{ \
33887d61b2STaylor Simpson    fHIDE(int i;) \
34887d61b2STaylor Simpson    fVFOREACH(WIDTH, i) {\
35887d61b2STaylor Simpson        CODE ;\
36887d61b2STaylor Simpson    } \
37887d61b2STaylor Simpson}
38887d61b2STaylor Simpson
39887d61b2STaylor Simpson
40887d61b2STaylor Simpson
41887d61b2STaylor Simpson
/*
 * Iterator helpers tagged A_CVI_VA / A_CVI_VA_DV.
 * Each one defines instruction V6_<TAG> through EXTINSN, with CODE turned
 * into a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
42887d61b2STaylor Simpson#define ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
43887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),  \
44887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
45887d61b2STaylor Simpson
46887d61b2STaylor Simpson
47887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
48887d61b2STaylor Simpson#define ITERATOR_INSN2_ANY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
49887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
50887d61b2STaylor Simpson
/* Same as ITERATOR_INSN_ANY_SLOT but with attribute A_CVI_VA_DV. */
51887d61b2STaylor Simpson#define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
52887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),  \
53887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
54887d61b2STaylor Simpson
55887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
56887d61b2STaylor Simpson#define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
57887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
58887d61b2STaylor Simpson
59887d61b2STaylor Simpson
/*
 * Iterator helpers tagged A_CVI_VS.
 * Each one defines instruction V6_<TAG> through EXTINSN, with CODE turned
 * into a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
60887d61b2STaylor Simpson#define ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
61887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
62887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
63887d61b2STaylor Simpson
64887d61b2STaylor Simpson
/*
 * As ITERATOR_INSN_SHIFT_SLOT, with the additional attributes A_CVI_VS_3SRC,
 * A_NOTE_SHIFT_RESOURCE, A_NOTE_NOVP and A_NOTE_VA_UNARY.
 */
65b2f20c2cSTaylor Simpson#define ITERATOR_INSN_SHIFT3_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
66b2f20c2cSTaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS,A_CVI_VS_3SRC,A_NOTE_SHIFT_RESOURCE,A_NOTE_NOVP,A_NOTE_VA_UNARY),  \
67b2f20c2cSTaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
68887d61b2STaylor Simpson
/* Expands to exactly the same text as ITERATOR_INSN_SHIFT_SLOT. */
69887d61b2STaylor Simpson#define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
70887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
71887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
72887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
73887d61b2STaylor Simpson#define ITERATOR_INSN2_SHIFT_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
74887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
75887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VP; CODE becomes the
 * body of a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
76887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
77887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
78887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
79887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
80887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
81887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
82887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VP ("DEP" flavour).
 *
 *   WIDTH  - iteration width handed to DO_FOR_EACH_CODE
 *   TAG    - pasted after V6_ to form the instruction tag
 *   SYNTAX - assembly syntax string
 *   DESCR  - description string
 *   CODE   - loop body handed to DO_FOR_EACH_CODE
 *
 * The expansion has to close the EXTINSN( call itself; an unterminated
 * expansion would make any use of this macro swallow the source text that
 * follows it.  The DESCR / DO_FOR_EACH_CODE tail mirrors
 * ITERATOR_INSN_PERMUTE_SLOT.
 */
#define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
85887d61b2STaylor Simpson
86887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
87887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
88887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE)
89887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VP_VS; CODE becomes the
 * body of a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
90887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
91887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
92887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
93887d61b2STaylor Simpson
/* Expands to exactly the same text as ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC. */
94887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \
95887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
96887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
97887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
98887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
99887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
100887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VX; CODE becomes the
 * body of a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
101887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT(WIDTH,TAG, SYNTAX,DESCR,CODE) \
102887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \
103887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
104887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
105887d61b2STaylor Simpson
/* Expands to exactly the same text as ITERATOR_INSN_MPY_SLOT. */
106887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,DESCR,CODE) \
107887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \
108887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
109887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
110887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
111887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
112887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
113887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
114887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,SYNTAX2,DESCR,CODE) \
115887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX2,DESCR,CODE)
116887d61b2STaylor Simpson
117887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VX_DV; CODE becomes the
 * body of a DO_FOR_EACH_CODE(WIDTH, CODE) loop.
 */
118887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
119887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
120887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
121887d61b2STaylor Simpson
/* Expands to exactly the same text as ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC. */
122f128c0feSTaylor Simpson#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(WIDTH,TAG,SYNTAX,DESCR,CODE) \
123f128c0feSTaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
124f128c0feSTaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
125f128c0feSTaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
126887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
127887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
128887d61b2STaylor Simpson
129887d61b2STaylor Simpson
130887d61b2STaylor Simpson
131887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attributes A_CVI_VX_DV and
 * A_CVI_VX_VSRC0_IS_DST.
 * NOTE(review): unlike the other ITERATOR_INSN2_* helpers this one passes
 * SYNTAX to EXTINSN and ignores SYNTAX2 - confirm that is intended.
 */
132887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
133887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
134887d61b2STaylor Simpson
/* Define instruction V6_<TAG> with attributes A_CVI_VX_DV and A_RESTRICT_SLOT2ONLY. */
135887d61b2STaylor Simpson#define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
136887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
137887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_4SLOT.  Before the
 * DO_FOR_EACH_CODE loop the semantics declare `input` (hidden via fHIDE)
 * and assign it from fTMPVDATA(), so CODE can refer to `input`.
 */
138887d61b2STaylor Simpson#define ITERATOR_INSN_VHISTLIKE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
139887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT),  \
140887d61b2STaylor SimpsonDESCR, fHIDE(mmvector_t input;) input = fTMPVDATA(); DO_FOR_EACH_CODE(WIDTH, CODE))
141887d61b2STaylor Simpson
142887d61b2STaylor Simpson
143887d61b2STaylor Simpson
144887d61b2STaylor Simpson
145887d61b2STaylor Simpson
146887d61b2STaylor Simpson/******************************************************************************************
147887d61b2STaylor Simpson*
148887d61b2STaylor Simpson* MMVECTOR MEMORY OPERATIONS - NO NAPALI V1
149887d61b2STaylor Simpson*
150887d61b2STaylor Simpson*******************************************************************************************/
151887d61b2STaylor Simpson
152887d61b2STaylor Simpson
153887d61b2STaylor Simpson
/*
 * "_NOV1" twin of ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC; the attribute list
 * (A_EXTENSION, A_CVI, A_CVI_VX_DV) and loop wrapping are the same.
 */
154887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
155887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
156887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
157887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
158887d61b2STaylor Simpson#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
159887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
160887d61b2STaylor Simpson
161887d61b2STaylor Simpson
162887d61b2STaylor Simpson
/*
 * "_NOV1" twin of ITERATOR_INSN_SHIFT_SLOT; the attribute list
 * (A_EXTENSION, A_CVI, A_CVI_VS) and loop wrapping are the same.
 */
163887d61b2STaylor Simpson#define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
164887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
165887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
166887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
167887d61b2STaylor Simpson#define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
168887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
169887d61b2STaylor Simpson
170887d61b2STaylor Simpson
/*
 * "_NOV1" twin of ITERATOR_INSN_ANY_SLOT; the attribute list
 * (A_EXTENSION, A_CVI, A_CVI_VA) and loop wrapping are the same.
 */
171887d61b2STaylor Simpson#define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
172887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),  \
173887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
174887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
175887d61b2STaylor Simpson#define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
176887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
177887d61b2STaylor Simpson
178887d61b2STaylor Simpson
/* "_NOV1" twin of ITERATOR_INSN_MPY_SLOT (attribute A_CVI_VX). */
179887d61b2STaylor Simpson#define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH,TAG, SYNTAX,DESCR,CODE) \
180887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, \
181887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
182887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
183887d61b2STaylor Simpson
/* "_NOV1" twin of ITERATOR_INSN_PERMUTE_SLOT (attribute A_CVI_VP). */
184887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
185887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
186887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
187887d61b2STaylor Simpson
/*
 * Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded.
 * NOTE(review): the name is spelled "SLOTT" and it forwards to
 * ITERATOR_INSN_PERMUTE_SLOT rather than ITERATOR_INSN_PERMUTE_SLOT_NOV1;
 * both targets expand to the same text, so the result is unaffected.
 */
188887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
189887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
190887d61b2STaylor Simpson
/*
 * Define instruction V6_<TAG> with attribute A_CVI_VP ("DEPT_NOV1" flavour).
 *
 *   WIDTH  - iteration width handed to DO_FOR_EACH_CODE
 *   TAG    - pasted after V6_ to form the instruction tag
 *   SYNTAX - assembly syntax string
 *   DESCR  - description string
 *   CODE   - loop body handed to DO_FOR_EACH_CODE
 *
 * The expansion has to close the EXTINSN( call itself; the
 * DESCR / DO_FOR_EACH_CODE tail mirrors ITERATOR_INSN_PERMUTE_SLOT_NOV1.
 */
#define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))


/*
 * Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded.
 * Forwards to the "_DEPT_NOV1" helper defined directly above, which is the
 * only spelling of that helper this file provides.
 */
#define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
197887d61b2STaylor Simpson
/* "_NOV1" twin of ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC (attribute A_CVI_VP_VS). */
198887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
199887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
200887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
201887d61b2STaylor Simpson
/* Expands to exactly the same text as ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1. */
202887d61b2STaylor Simpson#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
203887d61b2STaylor SimpsonEXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
204887d61b2STaylor SimpsonDESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
205887d61b2STaylor Simpson
/* Two-syntax form: SYNTAX is discarded, only SYNTAX2 is forwarded. */
206887d61b2STaylor Simpson#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
207887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
208887d61b2STaylor Simpson
/*
 * Define a "vasr" narrowing-shift instruction through
 * ITERATOR_INSN_SHIFT_SLOT_NOV1.
 *
 *   ITERSIZE  - iteration width
 *   TAG       - instruction tag suffix
 *   DSTM      - macro invoked as DSTM(index, destination element, value)
 *   DSTTYPE   - destination type suffix; used only in the syntax string
 *   SRCTYPE   - element suffix used for Vu/Vv and for the VdV element
 *               handed to DSTM
 *   SYNOPTS   - stringified and appended to the syntax
 *   SATFUNC   - applied to the shifted value
 *   RNDFUNC   - applied as RNDFUNC(value, shamt) before the shift
 *   SHAMTMASK - mask applied to RtV to obtain the shift amount
 *
 * Index 0 is fed from Vv and index 1 from Vu.
 */
209887d61b2STaylor Simpson#define NARROWING_SHIFT_NOV1(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
210887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_NOV1(ITERSIZE,TAG, \
211887d61b2STaylor Simpson"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
212887d61b2STaylor Simpson"Vector shift right and shuffle", \
213887d61b2STaylor Simpson    fHIDE(int )shamt = RtV & SHAMTMASK; \
214887d61b2STaylor Simpson    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
215887d61b2STaylor Simpson    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
216887d61b2STaylor Simpson
/*
 * Define three instructions for one element type:
 *   vavg<TYPE>     - VdV.DEST[i] = fVAVGS(WIDTH, Vu, Vv)
 *   vavg<TYPE>rnd  - VdV.DEST[i] = fVAVGSRND(WIDTH, Vu, Vv)
 *   vnavg<TYPE>    - VdV.DEST[i] = fVNAVGS(WIDTH, Vu, Vv)
 * TYPE2 is spliced into the first syntax string; DEST and SRC are
 * stringified into the second one.
 * NOTE(review): the ":rnd" description reads "Vector Average % Round" and is
 * concatenated with DESCR without a separating space.
 */
217887d61b2STaylor Simpson#define MMVEC_AVGS_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
218887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",          "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",          "Vector Average "DESCR,                                      VdV.DEST[i]  = fVAVGS(       WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
219887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",      "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",      "Vector Average % Round"DESCR,                               VdV.DEST[i]  = fVAVGSRND(    WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
220887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vnavg##TYPE,                       "Vd32=vnavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")",         "Vector Negative Average "DESCR,                             VdV.DEST[i]  = fVNAVGS(      WIDTH,  VuV.SRC[i], VvV.SRC[i]))
221887d61b2STaylor Simpson
/*
 * Define two instructions for one element type:
 *   vavg<TYPE>     - VdV.DEST[i] = fVAVGU(WIDTH, Vu, Vv)
 *   vavg<TYPE>rnd  - VdV.DEST[i] = fVAVGURND(WIDTH, Vu, Vv)
 * Unlike MMVEC_AVGS_NOV1 there is no vnavg form here.
 */
222887d61b2STaylor Simpson  #define MMVEC_AVGU_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
223887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",        "Vector Average "DESCR,                                      VdV.DEST[i] = fVAVGU(   WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
224887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",     "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",    "Vector Average % Round"DESCR,                               VdV.DEST[i] = fVAVGURND(WIDTH,  VuV.SRC[i], VvV.SRC[i]))
225887d61b2STaylor Simpson
226887d61b2STaylor Simpson
227887d61b2STaylor Simpson
228887d61b2STaylor Simpson/******************************************************************************************
229887d61b2STaylor Simpson*
230887d61b2STaylor Simpson* MMVECTOR MEMORY OPERATIONS
231887d61b2STaylor Simpson*
232887d61b2STaylor Simpson*******************************************************************************************/
233887d61b2STaylor Simpson
/*
 * Emit the three addressing-mode forms of one vector memory instruction:
 *   V6_<TAG>_pi  - "(Rx32++#s3)": fEA_REG(RxV), BEH, then fPM_I(RxV, VEC_SCALE(siV))
 *   V6_<TAG>_ai  - "(Rt32+#s4)":  fEA_RI(RtV, VEC_SCALE(siV)), then BEH
 *   V6_<TAG>_ppu - "(Rx32++Mu2)": fEA_REG(RxV), BEH, then fPM_M(RxV, MuV)
 *
 *   TAG     - instruction tag stem
 *   DESCR   - description string
 *   ATTRIB  - attribute list passed straight to EXTINSN
 *   NT      - string literal placed after the address operand (may be empty)
 *   SYNTAXA - string literal placed before the address operand
 *   SYNTAXB - string literal placed after NT
 *   BEH     - statement(s) performing the access once EA is set up
 *
 * The final line deliberately carries no line-continuation backslash, so the
 * definition ends here regardless of what follows it in the file.
 */
#define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \
EXTINSN(V6_##TAG##_pi,      SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \
EXTINSN(V6_##TAG##_ai,      SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \
EXTINSN(V6_##TAG##_ppu,      SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); })
238887d61b2STaylor Simpson
239887d61b2STaylor Simpson
/*
 * Emit the three addressing-mode forms (_pred_pi, _pred_ai, _pred_ppu) of a
 * vector memory instruction guarded by "if (<SYNTAXP>4)".
 *
 * Each form runs its address computation and BEH only when
 * fLSBOLD(<SYNTAXP>V) holds; otherwise it executes CANCEL.
 *
 *   TAG     - instruction tag stem
 *   DESCR   - description string
 *   ATTRIB  - attribute list passed straight to EXTINSN
 *   NT      - string literal placed after the address operand (may be empty)
 *   SYNTAXA - string literal placed before the address operand
 *   SYNTAXB - string literal placed after NT
 *   SYNTAXP - predicate register name; stringified into the syntax and
 *             pasted with V to form the operand read in the semantics
 *   BEH     - statement(s) performing the access once EA is set up
 *
 * The final line deliberately carries no line-continuation backslash, so the
 * definition ends here regardless of what follows it in the file.
 */
#define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
EXTINSN(V6_##TAG##_pred_pi,      "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
EXTINSN(V6_##TAG##_pred_ai,      "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
EXTINSN(V6_##TAG##_pred_ppu,     "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}})
244887d61b2STaylor Simpson
/*
 * Negated-predicate counterpart of MMVEC_COND_EACH_EA_TRUE: emits the
 * _npred_pi, _npred_ai and _npred_ppu forms with syntax "if (!<SYNTAXP>4)".
 * Each form performs its address computation and BEH only when
 * fLSBOLDNOT(<SYNTAXP>V) holds; otherwise it executes CANCEL.
 */
245887d61b2STaylor Simpson#define MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
246887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_pi,     "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
247887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_ai,     "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
248887d61b2STaylor SimpsonEXTINSN(V6_##TAG##_npred_ppu,    "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}})
249887d61b2STaylor Simpson
/*
 * Emit both the predicate-true and predicate-false families (six
 * instructions in total) by forwarding every argument unchanged.
 */
250887d61b2STaylor Simpson#define MMVEC_COND_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
251887d61b2STaylor SimpsonMMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
252887d61b2STaylor SimpsonMMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH)
253887d61b2STaylor Simpson
254887d61b2STaylor Simpson
/*
 * Multiply X by fVECSIZE().
 * Both the argument and the whole expansion are parenthesized so that an
 * expression argument (e.g. a + b) or a surrounding operator cannot regroup
 * the multiplication.
 */
#define VEC_SCALE(X) ((X) * fVECSIZE())
256887d61b2STaylor Simpson
257887d61b2STaylor Simpson
/*
 * Vector-load shorthands built on MMVEC_EACH_EA (SYNTAXB is always empty):
 *   MMVEC_LD  - "Vd32=vmem",     fLOADMMV(EA,VdV)
 *   MMVEC_LDC - "Vd32.cur=vmem", tag suffixed with _cur
 *   MMVEC_LDT - "Vd32.tmp=vmem", tag suffixed with _tmp
 *   MMVEC_LDU - "Vd32=vmemu",    fLOADMMVU(EA,VdV)
 */
258887d61b2STaylor Simpson#define MMVEC_LD(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmem","",fLOADMMV(EA,VdV))
259887d61b2STaylor Simpson#define MMVEC_LDC(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_cur,DESCR,ATTRIB,NT,"Vd32.cur=vmem","",fLOADMMV(EA,VdV))
260887d61b2STaylor Simpson#define MMVEC_LDT(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_tmp,DESCR,ATTRIB,NT,"Vd32.tmp=vmem","",fLOADMMV(EA,VdV))
261887d61b2STaylor Simpson#define MMVEC_LDU(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmemu","",fLOADMMVU(EA,VdV))
262887d61b2STaylor Simpson
263887d61b2STaylor Simpson
/*
 * Emit the Qv-predicated store forms via MMVEC_EACH_EA:
 *   <TAG>_qpred  - "if (Qv4) vmem...=Vs32",  fSTOREMMVQ(EA,VsV,QvV)
 *   <TAG>_nqpred - "if (!Qv4) vmem...=Vs32", fSTOREMMVNQ(EA,VsV,QvV)
 */
264887d61b2STaylor Simpson#define MMVEC_STQ(TAG,DESCR,ATTRIB,NT) \
265887d61b2STaylor SimpsonMMVEC_EACH_EA(TAG##_qpred,DESCR,ATTRIB,NT,"if (Qv4) vmem","=Vs32",fSTOREMMVQ(EA,VsV,QvV)) \
266887d61b2STaylor SimpsonMMVEC_EACH_EA(TAG##_nqpred,DESCR,ATTRIB,NT,"if (!Qv4) vmem","=Vs32",fSTOREMMVNQ(EA,VsV,QvV))
267887d61b2STaylor Simpson
268887d61b2STaylor Simpson/****************************************************************
269887d61b2STaylor Simpson* MAPPING FOR VMEMs
270887d61b2STaylor Simpson****************************************************************/
271887d61b2STaylor Simpson
/*
 * Attribute lists shared by the vmem / vmemu definitions; they are meant to
 * be spliced into an ATTRIBS(...) argument.  ATTR_VMEMU is ATTR_VMEM plus
 * A_CVI_VP.
 */
272887d61b2STaylor Simpson#define ATTR_VMEM A_EXTENSION,A_CVI,A_CVI_VM
273887d61b2STaylor Simpson#define ATTR_VMEMU A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP
274887d61b2STaylor Simpson
275887d61b2STaylor Simpson
/* Aligned vector loads: plain, .cur and .tmp forms (empty NT argument). */
276887d61b2STaylor SimpsonMMVEC_LD(vL32b,  "Aligned Vector Load",        ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),)
277887d61b2STaylor SimpsonMMVEC_LDC(vL32b,  "Aligned Vector Load Cur",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),)
278887d61b2STaylor SimpsonMMVEC_LDT(vL32b,  "Aligned Vector Load Tmp",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),)
279887d61b2STaylor Simpson
/*
 * Pv-predicated aligned vector loads: plain, .cur and .tmp forms.  The NT and
 * SYNTAXB arguments are both left empty.
 */
280887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);)
281887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);)
282887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);)
283887d61b2STaylor Simpson
/* Aligned vector store, unconditional and Pv-predicated, via fSTOREMMV(EA,VsV). */
284887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV))
285887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
286887d61b2STaylor Simpson
287887d61b2STaylor Simpson
/* Qv-predicated aligned store forms (see MMVEC_STQ). */
288887d61b2STaylor SimpsonMMVEC_STQ(vS32b,  "Aligned Vector Store",      ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),)
289887d61b2STaylor Simpson
/* Unaligned ("vmemu") load and stores; these use ATTR_VMEMU and A_RESTRICT_NOSLOT1. */
290887d61b2STaylor SimpsonMMVEC_LDU(vL32Ub, "Unaligned Vector Load",     ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),)
291887d61b2STaylor Simpson
292887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV))
293887d61b2STaylor Simpson
294887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV))
295887d61b2STaylor Simpson
/* Store of a ".new" vector value: the stored data is fNEWVREG(OsN). */
296887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
297887d61b2STaylor Simpson
2986c67d98cSMichael Tokarev// V65 store release, zero byte store
299887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0))
300887d61b2STaylor Simpson
301887d61b2STaylor Simpson
302887d61b2STaylor Simpson
/* Pv-predicated forms of the ".new" vector store. */
303887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
304887d61b2STaylor Simpson
305887d61b2STaylor Simpson
306887d61b2STaylor Simpson/******************************************************************************************
307887d61b2STaylor Simpson*
308887d61b2STaylor Simpson* MMVECTOR MEMORY OPERATIONS - NON TEMPORAL
309887d61b2STaylor Simpson*
310887d61b2STaylor Simpson*******************************************************************************************/
311887d61b2STaylor Simpson
/* Attribute list for the ":nt" forms; its contents are the same as ATTR_VMEM. */
312887d61b2STaylor Simpson#define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM
313887d61b2STaylor Simpson
/* ":nt" stores: unconditional and Pv-predicated, for Vs32 and for Os8.new. */
314887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV))
315887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
316887d61b2STaylor Simpson
317887d61b2STaylor SimpsonMMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
318887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
319887d61b2STaylor Simpson
320887d61b2STaylor Simpson
/* ":nt" Qv-predicated store forms. */
321887d61b2STaylor SimpsonMMVEC_STQ(vS32b_nt,  "Aligned Vector Store - Non temporal",      ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt")
322887d61b2STaylor Simpson
/* ":nt" aligned loads: plain, .cur and .tmp forms. */
323887d61b2STaylor SimpsonMMVEC_LD(vL32b_nt,  "Aligned Vector Load - Non temporal",       ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt")
324887d61b2STaylor SimpsonMMVEC_LDC(vL32b_nt,  "Aligned Vector Load Cur - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt")
325887d61b2STaylor SimpsonMMVEC_LDT(vL32b_nt,  "Aligned Vector Load Tmp - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt")
326887d61b2STaylor Simpson
/*
 * Pv-predicated ":nt" aligned loads: plain, .cur and .tmp forms.  Here ":nt"
 * is passed as SYNTAXB with NT left empty; the resulting syntax string is
 * the same as passing it as NT.
 * NOTE(review): unlike the non-":nt" conditional loads, these attribute
 * lists do not include A_LOAD - confirm whether that is intentional.
 */
327887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
328887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
329887d61b2STaylor SimpsonMMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
330887d61b2STaylor Simpson
331887d61b2STaylor Simpson
/* VEC_SCALE is a local helper for the vmem definitions; drop it here. */
332887d61b2STaylor Simpson#undef VEC_SCALE
333887d61b2STaylor Simpson
334887d61b2STaylor Simpson
335887d61b2STaylor Simpson/***************************************************
336887d61b2STaylor Simpson * Vector Alignment
337887d61b2STaylor Simpson ************************************************/
338887d61b2STaylor Simpson
/*
 * Byte-wise align body shared by the valign/vlalign instructions.
 * For every byte index i below fVBYTES():
 *   - if i+SHIFT reaches fVBYTES(), take VuV.ub[i+SHIFT-fVBYTES()];
 *   - otherwise take VvV.ub[i+SHIFT].
 * The macro declares i itself (inside fHIDE) and is not wrapped in braces,
 * so it must be used where a declaration is allowed.
 */
339887d61b2STaylor Simpson#define VALIGNB(SHIFT)  \
340887d61b2STaylor Simpson    fHIDE(int i;) \
341887d61b2STaylor Simpson    for(i = 0; i < fVBYTES(); i++) {\
342887d61b2STaylor Simpson        VdV.ub[i] = (i+SHIFT>=fVBYTES()) ? VuV.ub[i+SHIFT-fVBYTES()] : VvV.ub[i+SHIFT];\
343887d61b2STaylor Simpson	}
344887d61b2STaylor Simpson
/* valign by register: shift is RtV masked to the range 0 .. fVBYTES()-1. */
345887d61b2STaylor SimpsonEXTINSN(V6_valignb,  "Vd32=valign(Vu32,Vv32,Rt8)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
346887d61b2STaylor Simpson{
347887d61b2STaylor Simpson	unsigned shift = RtV & (fVBYTES()-1);
348887d61b2STaylor Simpson	VALIGNB(shift)
349887d61b2STaylor Simpson})
/*
 * vlalign by register: shift is fVBYTES() minus the masked RtV, so a masked
 * value of 0 yields shift == fVBYTES() and every byte comes from Vu.
 */
350887d61b2STaylor SimpsonEXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
351887d61b2STaylor Simpson{
352887d61b2STaylor Simpson	unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1));
353887d61b2STaylor Simpson	VALIGNB(shift)
354887d61b2STaylor Simpson})
/* valign by immediate: the #u3 operand (uiV) is used directly as the shift. */
355887d61b2STaylor SimpsonEXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", 	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
356887d61b2STaylor Simpson{
357887d61b2STaylor Simpson	VALIGNB(uiV)
358887d61b2STaylor Simpson})
/* vlalign by immediate: shift is fVBYTES() minus the #u3 operand (uiV). */
359887d61b2STaylor SimpsonEXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
360887d61b2STaylor Simpson{
361887d61b2STaylor Simpson	unsigned shift = fVBYTES() - uiV;
362887d61b2STaylor Simpson	VALIGNB(shift)
363887d61b2STaylor Simpson})
364887d61b2STaylor Simpson
/*
 * Byte rotate of a single vector: destination byte k is read from
 * VuV.ub[(k+RtV) & (fVBYTES()-1)].
 * NOTE(review): the description string talks about aligning two vectors, but
 * only Vu is read here - looks like a copy of the valign description.
 */
365887d61b2STaylor SimpsonEXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
366887d61b2STaylor Simpson"Align Two vectors by Rt32 as control",
367887d61b2STaylor Simpson{
368887d61b2STaylor Simpson	fHIDE(int k;)
369887d61b2STaylor Simpson	for (k=0;k<fVBYTES();k++) {
370887d61b2STaylor Simpson		VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)];
371887d61b2STaylor Simpson	}
372887d61b2STaylor Simpson	})
373887d61b2STaylor Simpson
374887d61b2STaylor Simpson
375887d61b2STaylor Simpson
376887d61b2STaylor Simpson
377887d61b2STaylor Simpson
378887d61b2STaylor Simpson
379887d61b2STaylor Simpson
380887d61b2STaylor Simpson/**************************************************************
381887d61b2STaylor Simpson* Unpack elements with zero/sign extend and cross lane permute
382887d61b2STaylor Simpson***************************************************************/
383887d61b2STaylor Simpson
/*
 * Unpack instructions.  Each source element i of Vu is widened (fZE* for the
 * unsigned forms, fSE* for the signed ones) and written to element i of the
 * destination pair through fVARRAY_ELEMENT_ACCESS.
 */
384887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub,  "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend",     fVARRAY_ELEMENT_ACCESS(VddV, uh, i)  = fZE8_16( VuV.ub[i]))
385887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb,   "Vdd32=vunpackb(Vu32)",  "Vdd32.h=vunpack(Vu32.b)",   "Unpack bytes with sign-extend",    fVARRAY_ELEMENT_ACCESS(VddV, h,  i)  = fSE8_16( VuV.b[i] ))
386887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend",   fVARRAY_ELEMENT_ACCESS(VddV, uw, i)  = fZE16_32(VuV.uh[i]))
387887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh,  "Vdd32=vunpackh(Vu32)",  "Vdd32.w=vunpack(Vu32.h)",   "Unpack halves with sign-extend",   fVARRAY_ELEMENT_ACCESS(VddV, w,  i)  = fSE16_32(VuV.h[i] ))
388887d61b2STaylor Simpson
/*
 * "Odd" unpack forms: the zero-extended source element is shifted left by
 * its own width (8 or 16) and OR-ed into the existing Vxx element.
 */
389887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte to odd bytes ",       fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8)
390887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves",     fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16)
391887d61b2STaylor Simpson
392887d61b2STaylor Simpson
393887d61b2STaylor Simpson/**************************************************************
394887d61b2STaylor Simpson* Pack elements and cross lane permute
395887d61b2STaylor Simpson***************************************************************/
396887d61b2STaylor Simpson
/*
 * Pack even / odd sub-elements.  For each source element i, sub-element 0
 * (vpacke*) or 1 (vpacko*) is extracted with fGETUBYTE / fGETUHALF.  The
 * value taken from Vv goes to destination index i and the value taken from
 * Vu goes to index i + fVELEM(<source width>).
 */
397887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb,  "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)",
398887d61b2STaylor Simpson "Pack  bytes",
399887d61b2STaylor Simpson    VdV.ub[i]               = fGETUBYTE(0, VvV.uh[i]);
400887d61b2STaylor Simpson    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(0, VuV.uh[i]))
401887d61b2STaylor Simpson
402887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh,  "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)",
403887d61b2STaylor Simpson "Pack  halfwords",
404887d61b2STaylor Simpson    VdV.uh[i]               = fGETUHALF(0, VvV.uw[i]);
405887d61b2STaylor Simpson    VdV.uh[i+fVELEM(32)]    = fGETUHALF(0, VuV.uw[i]))
406887d61b2STaylor Simpson
407887d61b2STaylor Simpson  ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob,  "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)",
408887d61b2STaylor Simpson "Pack  bytes",
409887d61b2STaylor Simpson    VdV.ub[i]               = fGETUBYTE(1, VvV.uh[i]);
410887d61b2STaylor Simpson    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(1, VuV.uh[i]))
411887d61b2STaylor Simpson
412887d61b2STaylor Simpson ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh,  "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)",
413887d61b2STaylor Simpson "Pack  halfwords",
414887d61b2STaylor Simpson    VdV.uh[i]               = fGETUHALF(1, VvV.uw[i]);
415887d61b2STaylor Simpson    VdV.uh[i+fVELEM(32)]    = fGETUHALF(1, VuV.uw[i]))
416887d61b2STaylor Simpson
417887d61b2STaylor Simpson
418887d61b2STaylor Simpson
419887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat,  "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat",
420887d61b2STaylor Simpson "Pack ubytes with saturation",
421887d61b2STaylor Simpson    VdV.ub[i]               = fVSATUB(VvV.h[i]);
422887d61b2STaylor Simpson    VdV.ub[i+fVELEM(16)]    = fVSATUB(VuV.h[i]))
423887d61b2STaylor Simpson
424887d61b2STaylor Simpson
/*
 * vpackhb_sat: each signed halfword VvV.h[i] is passed through fVSATB and
 * stored to VdV.b[i]; each VuV.h[i] is passed through fVSATB and stored to
 * VdV.b[i+fVELEM(16)].  fVSATB is defined outside this chunk (by name, a
 * saturate-to-signed-byte helper).
 */
425887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat,  "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat",
426887d61b2STaylor Simpson "Pack bytes with saturation",
427887d61b2STaylor Simpson    VdV.b[i]               = fVSATB(VvV.h[i]);
428887d61b2STaylor Simpson    VdV.b[i+fVELEM(16)]    = fVSATB(VuV.h[i]))
429887d61b2STaylor Simpson
430887d61b2STaylor Simpson
/*
 * vpackwuh_sat: each signed word VvV.w[i] is passed through fVSATUH and
 * stored to the unsigned halfword VdV.uh[i]; each VuV.w[i] is passed through
 * fVSATUH and stored to VdV.uh[i+fVELEM(32)].  The destination elements are
 * halfwords, which is what the description string states.
 */
431887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat,  "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat",
432887d61b2STaylor Simpson "Pack uhalfwords with saturation",
433887d61b2STaylor Simpson    VdV.uh[i]               = fVSATUH(VvV.w[i]);
434887d61b2STaylor Simpson    VdV.uh[i+fVELEM(32)]    = fVSATUH(VuV.w[i]))
435887d61b2STaylor Simpson
/*
 * vpackwh_sat: each signed word VvV.w[i] is passed through fVSATH and stored
 * to the signed halfword VdV.h[i]; each VuV.w[i] is passed through fVSATH
 * and stored to VdV.h[i+fVELEM(32)].  The destination elements are
 * halfwords, which is what the description string states.
 */
436887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat,  "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat",
437887d61b2STaylor Simpson "Pack halfwords with saturation",
438887d61b2STaylor Simpson    VdV.h[i]               = fVSATH(VvV.w[i]);
439887d61b2STaylor Simpson    VdV.h[i+fVELEM(32)]    = fVSATH(VuV.w[i]))
440887d61b2STaylor Simpson
441887d61b2STaylor Simpson
442887d61b2STaylor Simpson
443887d61b2STaylor Simpson
444887d61b2STaylor Simpson
445887d61b2STaylor Simpson/**************************************************************
446887d61b2STaylor Simpson* Zero/Sign Extend with in-lane permute
447887d61b2STaylor Simpson***************************************************************/
448887d61b2STaylor Simpson
/*
 * vzb: for each index i (iterator WIDTH argument 16), byte 0 of VuV.uh[i] is
 * widened with fZE8_16 into VddV.v[0].uh[i] and byte 1 of the same halfword
 * is widened into VddV.v[1].uh[i]; i.e. the two bytes of each source
 * halfword are split across the two vectors of the destination pair.
 */
449887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)",
450887d61b2STaylor Simpson"Vector Zero Extend Bytes",
451887d61b2STaylor Simpson    VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i]));
452887d61b2STaylor Simpson    VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])))
453887d61b2STaylor Simpson
/*
 * vsb: signed counterpart of vzb.  Byte 0 of VuV.h[i] is widened with
 * fSE8_16 into VddV.v[0].h[i] and byte 1 is widened into VddV.v[1].h[i].
 */
454887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)",
455887d61b2STaylor Simpson"Vector Sign Extend Bytes",
456887d61b2STaylor Simpson    VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i]));
457887d61b2STaylor Simpson    VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i])))
458887d61b2STaylor Simpson
/*
 * vzh: for each index i (iterator WIDTH argument 32), halfword 0 of
 * VuV.uw[i] is widened with fZE16_32 into VddV.v[0].uw[i] and halfword 1 is
 * widened into VddV.v[1].uw[i].
 */
459887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)",
460887d61b2STaylor Simpson"Vector Zero Extend halfwords",
461887d61b2STaylor Simpson    VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i]));
462887d61b2STaylor Simpson    VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])))
463887d61b2STaylor Simpson
/*
 * vsh: signed counterpart of vzh.  Halfword 0 of VuV.w[i] is widened with
 * fSE16_32 into VddV.v[0].w[i] and halfword 1 is widened into
 * VddV.v[1].w[i].
 */
464887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)",
465887d61b2STaylor Simpson"Vector Sign Extend halfwords",
466887d61b2STaylor Simpson    VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i]));
467887d61b2STaylor Simpson    VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i])))
468887d61b2STaylor Simpson
469887d61b2STaylor Simpson
470887d61b2STaylor Simpson/**********************************************************************
471887d61b2STaylor Simpson*
472887d61b2STaylor Simpson*
473887d61b2STaylor Simpson*
474887d61b2STaylor Simpson*               MMVECTOR REDUCTION
475887d61b2STaylor Simpson*
476887d61b2STaylor Simpson*
477887d61b2STaylor Simpson*
478887d61b2STaylor Simpson**********************************************************************/
479887d61b2STaylor Simpson
480887d61b2STaylor Simpson/********************************************
481887d61b2STaylor Simpson*  2-WAY REDUCTION - UNSIGNED BYTE BY BYTE
482887d61b2STaylor Simpson********************************************/
483887d61b2STaylor Simpson
484887d61b2STaylor Simpson
/*
 * vdmpybus: for each index i (iterator WIDTH argument 16), VdV.h[i] is the
 * sum of two fMPY8US products: unsigned byte 0 of VuV.uh[i] times signed
 * byte (2*i)%4 of RtV, plus unsigned byte 1 of VuV.uh[i] times signed byte
 * (2*i+1)%4 of RtV.  The Rt byte pair is {0,1} for even i and {2,3} for odd
 * i.  No saturation helper is applied.
 */
485887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)",
486887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by bytes",
487887d61b2STaylor Simpson    VdV.h[i]   = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
488887d61b2STaylor Simpson    VdV.h[i]  += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
489887d61b2STaylor Simpson
/*
 * vdmpybus_acc: accumulating form of vdmpybus.  The same two fMPY8US
 * products (bytes 0 and 1 of VuV.uh[i] against Rt bytes (2*i)%4 and
 * (2*i+1)%4) are added onto the existing value of VxV.h[i] instead of
 * overwriting it.
 */
490887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)",
491887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate",
492887d61b2STaylor Simpson    VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
493887d61b2STaylor Simpson    VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
494887d61b2STaylor Simpson
495887d61b2STaylor Simpson
496887d61b2STaylor Simpson
/*
 * vdmpybus_dv: two outputs per index i, both using Rt bytes (2*i)%4 and
 * (2*i+1)%4 as coefficients.
 *   VddV.v[0].h[i]: bytes 0 and 1 of VuuV.v[0].uh[i].
 *   VddV.v[1].h[i]: byte 1 of VuuV.v[0].uh[i] and byte 0 of VuuV.v[1].uh[i],
 *                   i.e. the input pair shifted by one byte, crossing into
 *                   the second source vector.
 */
497887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)",
498887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
499887d61b2STaylor Simpson    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
500887d61b2STaylor Simpson    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
501887d61b2STaylor Simpson
502887d61b2STaylor Simpson    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
503887d61b2STaylor Simpson    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
504887d61b2STaylor Simpson
/*
 * vdmpybus_dv_acc: accumulating form of vdmpybus_dv.  The same four fMPY8US
 * products are added onto VxxV.v[0].h[i] (bytes 0,1 of VuuV.v[0].uh[i]) and
 * VxxV.v[1].h[i] (byte 1 of VuuV.v[0].uh[i], byte 0 of VuuV.v[1].uh[i])
 * rather than overwriting them.
 */
505887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)",
506887d61b2STaylor Simpson"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
507887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
508887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
509887d61b2STaylor Simpson
510887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
511887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
512887d61b2STaylor Simpson
513887d61b2STaylor Simpson
514887d61b2STaylor Simpson
515887d61b2STaylor Simpson/********************************************
516887d61b2STaylor Simpson*  2-WAY REDUCTION - HALF BY BYTE
517887d61b2STaylor Simpson********************************************/
/*
 * vdmpyhb: for each index i (iterator WIDTH argument 32), VdV.w[i] is the
 * sum of two fMPY16SS products: halfword 0 of VuV.w[i] times Rt byte
 * (2*i+0)%4, plus halfword 1 of VuV.w[i] times Rt byte (2*i+1)%4.  Only a
 * single source vector is read here.
 */
518887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)",
519887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
520887d61b2STaylor Simpson    VdV.w[i]  = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
521887d61b2STaylor Simpson    VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
522887d61b2STaylor Simpson
/*
 * vdmpyhb_acc: accumulating form of vdmpyhb.  The two fMPY16SS products
 * (halfwords 0 and 1 of VuV.w[i] against Rt bytes (2*i+0)%4 and (2*i+1)%4)
 * are added onto the existing VxV.w[i].
 */
523887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)",
524887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
525887d61b2STaylor Simpson    VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
526887d61b2STaylor Simpson    VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
527887d61b2STaylor Simpson
528887d61b2STaylor Simpson
529887d61b2STaylor Simpson
/*
 * vdmpyhb_dv: two outputs per index i, both using Rt bytes (2*i+0)%4 and
 * (2*i+1)%4 as coefficients.
 *   VddV.v[0].w[i]: halfwords 0 and 1 of VuuV.v[0].w[i].
 *   VddV.v[1].w[i]: halfword 1 of VuuV.v[0].w[i] and halfword 0 of
 *                   VuuV.v[1].w[i] (input pair shifted by one halfword).
 */
530887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)",
531887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
532887d61b2STaylor Simpson    VddV.v[0].w[i]  = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
533887d61b2STaylor Simpson    VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
534887d61b2STaylor Simpson
535887d61b2STaylor Simpson    VddV.v[1].w[i]  = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
536887d61b2STaylor Simpson    VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
537887d61b2STaylor Simpson
538887d61b2STaylor Simpson
/*
 * vdmpyhb_dv_acc: accumulating form of vdmpyhb_dv.  The same four fMPY16SS
 * products are added onto VxxV.v[0].w[i] (halfwords 0,1 of VuuV.v[0].w[i])
 * and VxxV.v[1].w[i] (halfword 1 of VuuV.v[0].w[i], halfword 0 of
 * VuuV.v[1].w[i]).
 */
539887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)",
540887d61b2STaylor Simpson"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
541887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
542887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
543887d61b2STaylor Simpson
544887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
545887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
546887d61b2STaylor Simpson
547887d61b2STaylor Simpson
548887d61b2STaylor Simpson
549887d61b2STaylor Simpson
550887d61b2STaylor Simpson
551887d61b2STaylor Simpson/********************************************
552887d61b2STaylor Simpson*  2-WAY REDUCTION - HALF BY HALF
553887d61b2STaylor Simpson********************************************/
554887d61b2STaylor Simpson
/*
 * vdmpyhvsat: vector-by-vector form.  For each index i, a size8s_t
 * temporary accumulates halfword 0 of VuV.w[i] times halfword 0 of VvV.w[i]
 * plus halfword 1 times halfword 1 (fMPY16SS); the sum is then passed
 * through fVSATW before being stored to VdV.w[i].
 */
555887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat",
556887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, sat to word",
557887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
558887d61b2STaylor Simpson    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
559887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
560887d61b2STaylor Simpson    VdV.w[i] = fVSATW(accum))
561887d61b2STaylor Simpson
/*
 * vdmpyhvsat_acc: accumulating form of vdmpyhvsat.  The two-product sum is
 * built in a size8s_t temporary, then VxV.w[i]+accum is passed through
 * fVSATW and written back to VxV.w[i]; the previous accumulator value is
 * added only at the final step.
 */
562887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat",
563887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, sat to word",
564887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
565887d61b2STaylor Simpson    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
566887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
567887d61b2STaylor Simpson    VxV.w[i] = fVSATW(VxV.w[i]+accum))
568887d61b2STaylor Simpson
569887d61b2STaylor Simpson
570887d61b2STaylor Simpson/* VDMPYH */
571887d61b2STaylor Simpson
/*
 * vdmpyhsat: vector-by-scalar form.  For each index i, a size8s_t temporary
 * accumulates halfword 0 of VuV.w[i] times halfword 0 of RtV plus halfword 1
 * of VuV.w[i] times halfword 1 of RtV (fMPY16SS); the sum is passed through
 * fVSATW and stored to VdV.w[i].
 */
572887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat",
573887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word",
574887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
575887d61b2STaylor Simpson    accum    = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
576887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
577887d61b2STaylor Simpson    VdV.w[i] = fVSATW(accum))
578887d61b2STaylor Simpson
/*
 * vdmpyhsat_acc: accumulating form of vdmpyhsat.  The size8s_t temporary is
 * seeded with the current VxV.w[i], both fMPY16SS products are added to it,
 * and the total is passed through fVSATW once before being written back to
 * VxV.w[i].
 */
579887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat",
580887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word",
581887d61b2STaylor Simpson    fHIDE(size8s_t) accum = VxV.w[i];
582887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
583887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
584887d61b2STaylor Simpson    VxV.w[i] = fVSATW(accum))
585887d61b2STaylor Simpson
586887d61b2STaylor Simpson
587887d61b2STaylor Simpson
588887d61b2STaylor Simpson
/*
 * vdmpyhisat: vector-pair-by-scalar form.  For each index i, a size8s_t
 * temporary accumulates halfword 1 of VuuV.v[0].w[i] times halfword 0 of RtV
 * plus halfword 0 of VuuV.v[1].w[i] times halfword 1 of RtV (fMPY16SS).  The
 * sum is passed through fVSATW and stored to the word element VdV.w[i].
 */
589887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat",
590887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Word with saturation",
591887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
592887d61b2STaylor Simpson    accum    = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
593887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
594887d61b2STaylor Simpson    VdV.w[i] = fVSATW(accum))
595887d61b2STaylor Simpson
/*
 * vdmpyhisat_acc: accumulating form of vdmpyhisat.  The size8s_t temporary
 * is seeded with the current VxV.w[i]; halfword 1 of VuuV.v[0].w[i] times
 * halfword 0 of RtV and halfword 0 of VuuV.v[1].w[i] times halfword 1 of RtV
 * are added (fMPY16SS), and the total is passed through fVSATW before being
 * written back to the word element VxV.w[i].
 */
596887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat",
597887d61b2STaylor Simpson"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Word with accumulation and saturation",
598887d61b2STaylor Simpson    fHIDE(size8s_t) accum = VxV.w[i];
599887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
600887d61b2STaylor Simpson    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
601887d61b2STaylor Simpson    VxV.w[i] = fVSATW(accum))
602887d61b2STaylor Simpson
603887d61b2STaylor Simpson
604887d61b2STaylor Simpson
605887d61b2STaylor Simpson
606887d61b2STaylor Simpson
607887d61b2STaylor Simpson
608887d61b2STaylor Simpson
609887d61b2STaylor Simpson/* VDMPYHSU */
/*
 * vdmpyhsusat: like vdmpyhsat but the scalar halfwords are read with
 * fGETUHALF and multiplied with fMPY16SU (signed vector halfword by unsigned
 * scalar halfword, per the helper names).  The two-product sum is held in a
 * size8s_t temporary and passed through fVSATW into VdV.w[i].
 */
610887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat",
611887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word",
612887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
613887d61b2STaylor Simpson    accum    = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
614887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
615887d61b2STaylor Simpson    VdV.w[i] = fVSATW(accum))
616887d61b2STaylor Simpson
/*
 * vdmpyhsusat_acc: accumulating form of vdmpyhsusat.  The size8s_t temporary
 * starts from the current VxV.w[i], both fMPY16SU products are added, and
 * the total is passed through fVSATW before being written back to VxV.w[i].
 */
617887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat",
618887d61b2STaylor Simpson"Vector halfword multiply, accumulate pairs, saturate to word",
619887d61b2STaylor Simpson    fHIDE(size8s_t) accum=VxV.w[i];
620887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
621887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
622887d61b2STaylor Simpson    VxV.w[i] = fVSATW(accum))
623887d61b2STaylor Simpson
624887d61b2STaylor Simpson
625887d61b2STaylor Simpson
/*
 * vdmpyhsuisat: vector-pair form of vdmpyhsusat.  For each index i, a
 * size8s_t temporary accumulates halfword 1 of VuuV.v[0].w[i] times unsigned
 * halfword 0 of RtV plus halfword 0 of VuuV.v[1].w[i] times unsigned
 * halfword 1 of RtV (fMPY16SU with fGETUHALF on the scalar).  The sum is
 * passed through fVSATW and stored to the word element VdV.w[i].
 */
626887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
627887d61b2STaylor Simpson"Dual Vector Signed Halfword by Unsigned Halfword 2-Way Reduction to Word with saturation",
628887d61b2STaylor Simpson    fHIDE(size8s_t accum;)
629887d61b2STaylor Simpson    accum    = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV));
630887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV));
631887d61b2STaylor Simpson    VdV.w[i] = fVSATW(accum))
632887d61b2STaylor Simpson
/*
 * vdmpyhsuisat_acc: accumulating form of vdmpyhsuisat.  The size8s_t
 * temporary is seeded with the current VxV.w[i]; halfword 1 of
 * VuuV.v[0].w[i] times unsigned halfword 0 of RtV and halfword 0 of
 * VuuV.v[1].w[i] times unsigned halfword 1 of RtV are added (fMPY16SU), and
 * the total is passed through fVSATW before being written back to the word
 * element VxV.w[i].
 */
633887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
634887d61b2STaylor Simpson"Dual Vector Signed Halfword by Unsigned Halfword 2-Way Reduction to Word with accumulation and saturation",
635887d61b2STaylor Simpson    fHIDE(size8s_t) accum=VxV.w[i];
636887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV));
637887d61b2STaylor Simpson    accum   += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV));
638887d61b2STaylor Simpson    VxV.w[i] = fVSATW(accum))
639887d61b2STaylor Simpson
640887d61b2STaylor Simpson
641887d61b2STaylor Simpson
642887d61b2STaylor Simpson/********************************************
643887d61b2STaylor Simpson*  3-WAY REDUCTION - UNSIGNED BYTE BY  BYTE
644887d61b2STaylor Simpson********************************************/
645887d61b2STaylor Simpson
/*
 * vtmpyb: three-term sums over signed bytes; two terms are fMPY8SS products
 * with Rt bytes (2*i)%4 and (2*i+1)%4, the third input byte is added
 * unscaled.
 *   VddV.v[0].h[i] = b0(VuuV.v[0].h[i])*Rt[(2*i)%4]
 *                  + b1(VuuV.v[0].h[i])*Rt[(2*i+1)%4] + b0(VuuV.v[1].h[i])
 *   VddV.v[1].h[i] = b1(VuuV.v[0].h[i])*Rt[(2*i)%4]
 *                  + b0(VuuV.v[1].h[i])*Rt[(2*i+1)%4] + b1(VuuV.v[1].h[i])
 * (bN(x) abbreviates fGETBYTE(N, x).)
 */
646887d61b2STaylor Simpson ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.b,Rt32.b)",
647887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
648887d61b2STaylor Simpson    VddV.v[0].h[i]  = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
649887d61b2STaylor Simpson    VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
650887d61b2STaylor Simpson    VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
651887d61b2STaylor Simpson
652887d61b2STaylor Simpson    VddV.v[1].h[i]  = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
653887d61b2STaylor Simpson    VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
654887d61b2STaylor Simpson    VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
655887d61b2STaylor Simpson
656887d61b2STaylor Simpson
/*
 * vtmpyb_acc: accumulating form of vtmpyb.  The same three terms per output
 * (two fMPY8SS products with Rt bytes (2*i)%4 and (2*i+1)%4, plus one
 * unscaled signed byte) are added onto VxxV.v[0].h[i] and VxxV.v[1].h[i]
 * instead of overwriting them.
 */
657887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)",
658887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
659887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
660887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
661887d61b2STaylor Simpson    VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
662887d61b2STaylor Simpson
663887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
664887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
665887d61b2STaylor Simpson    VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
666887d61b2STaylor Simpson
667887d61b2STaylor Simpson
668887d61b2STaylor Simpson
/*
 * vtmpybus: unsigned-byte-input variant of vtmpyb.  Vector bytes are read
 * with fGETUBYTE and multiplied by signed Rt bytes via fMPY8US.
 *   VddV.v[0].h[i]: bytes 0,1 of VuuV.v[0].uh[i] times Rt bytes (2*i)%4 and
 *                   (2*i+1)%4, plus byte 0 of VuuV.v[1].uh[i] unscaled.
 *   VddV.v[1].h[i]: byte 1 of VuuV.v[0].uh[i] and byte 0 of VuuV.v[1].uh[i]
 *                   times the same Rt bytes, plus byte 1 of VuuV.v[1].uh[i]
 *                   unscaled.
 */
669887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)",
670887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
671887d61b2STaylor Simpson    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
672887d61b2STaylor Simpson    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
673887d61b2STaylor Simpson    VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
674887d61b2STaylor Simpson
675887d61b2STaylor Simpson    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
676887d61b2STaylor Simpson    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
677887d61b2STaylor Simpson    VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
678887d61b2STaylor Simpson
/*
 * vtmpybus_acc: accumulating form of vtmpybus.  The same three terms per
 * output (two fMPY8US products with Rt bytes (2*i)%4 and (2*i+1)%4, plus one
 * unscaled unsigned byte) are added onto VxxV.v[0].h[i] and VxxV.v[1].h[i].
 */
679887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)",
680887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
681887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
682887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
683887d61b2STaylor Simpson    VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
684887d61b2STaylor Simpson
685887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
686887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
687887d61b2STaylor Simpson    VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
688887d61b2STaylor Simpson
689887d61b2STaylor Simpson
/*
 * vtmpyhb: halfword-input variant of the three-term reduction.  Rt bytes
 * (2*i+0)%4 and (2*i+1)%4 are widened with fSE8_16 and multiplied by vector
 * halfwords via fMPY16SS; the third halfword is added unscaled.
 *   VddV.v[0].w[i]: halfwords 0,1 of VuuV.v[0].w[i] as multiplicands, plus
 *                   halfword 0 of VuuV.v[1].w[i].
 *   VddV.v[1].w[i]: halfword 1 of VuuV.v[0].w[i] and halfword 0 of
 *                   VuuV.v[1].w[i] as multiplicands, plus halfword 1 of
 *                   VuuV.v[1].w[i].
 */
690887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)",
691887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
692887d61b2STaylor Simpson    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
693887d61b2STaylor Simpson    VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
694887d61b2STaylor Simpson    VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
695887d61b2STaylor Simpson
696887d61b2STaylor Simpson    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
697887d61b2STaylor Simpson    VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
698887d61b2STaylor Simpson    VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
699887d61b2STaylor Simpson
/*
 * vtmpyhb_acc: accumulating form of vtmpyhb.  The same three terms per
 * output (two fMPY16SS products against fSE8_16-widened Rt bytes, plus one
 * unscaled halfword) are added onto VxxV.v[0].w[i] and VxxV.v[1].w[i].
 */
700887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)",
701887d61b2STaylor Simpson"Dual Vector 3x1 Reduction",
702887d61b2STaylor Simpson    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
703887d61b2STaylor Simpson    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
704887d61b2STaylor Simpson    VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
705887d61b2STaylor Simpson
706887d61b2STaylor Simpson    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
707887d61b2STaylor Simpson    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
708887d61b2STaylor Simpson    VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
709887d61b2STaylor Simpson
710887d61b2STaylor Simpson
711887d61b2STaylor Simpson/********************************************
712887d61b2STaylor Simpson*  4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE
713887d61b2STaylor Simpson********************************************/
714887d61b2STaylor Simpson
715887d61b2STaylor Simpson
716887d61b2STaylor Simpson
/*
 * vrmpyub: for each index i (iterator WIDTH argument 32), VdV.uw[i] is the
 * sum over k = 0..3 of fMPY8UU(unsigned byte k of VuV.uw[i], unsigned byte k
 * of RtV).  All four Rt bytes are used for every element.
 */
717887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)",
718887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
719887d61b2STaylor Simpson    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
720887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
721887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
722887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
723887d61b2STaylor Simpson
/*
 * vrmpyub_acc: accumulating form of vrmpyub.  The four fMPY8UU products
 * (byte k of VuV.uw[i] times byte k of RtV, k = 0..3) are added onto the
 * existing VxV.uw[i].
 */
724887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)",
725887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
726887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
727887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
728887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
729887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
730887d61b2STaylor Simpson
731887d61b2STaylor Simpson
/*
 * vrmpyubv: vector-by-vector form of vrmpyub.  VdV.uw[i] is the sum over
 * k = 0..3 of fMPY8UU(unsigned byte k of VuV.uw[i], unsigned byte k of
 * VvV.uw[i]).
 */
732887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)",
733887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
734887d61b2STaylor Simpson    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
735887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
736887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
737887d61b2STaylor Simpson    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
738887d61b2STaylor Simpson
/*
 * vrmpyubv_acc: accumulating form of vrmpyubv.  The four fMPY8UU products of
 * matching unsigned bytes from VuV.uw[i] and VvV.uw[i] are added onto the
 * existing VxV.uw[i].
 */
739887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)",
740887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
741887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
742887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
743887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
744887d61b2STaylor Simpson    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
745887d61b2STaylor Simpson
/*
 * vrmpybv: signed vector-by-vector form.  VdV.w[i] is the sum over k = 0..3
 * of fMPY8SS(signed byte k of VuV.w[i], signed byte k of VvV.w[i]).
 */
746887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)",
747887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
748887d61b2STaylor Simpson    VdV.w[i]  = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
749887d61b2STaylor Simpson    VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
750887d61b2STaylor Simpson    VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
751887d61b2STaylor Simpson    VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
752887d61b2STaylor Simpson
753887d61b2STaylor Simpson
/*
 * vrmpybv_acc: accumulating form of vrmpybv.  The four fMPY8SS products of
 * matching signed bytes from VuV.w[i] and VvV.w[i] are added onto the
 * existing VxV.w[i].
 */
754887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)",
755887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
756887d61b2STaylor Simpson    VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
757887d61b2STaylor Simpson    VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
758887d61b2STaylor Simpson    VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
759887d61b2STaylor Simpson    VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
760887d61b2STaylor Simpson
761887d61b2STaylor Simpson
/*
 * vrmpyubi: four-term unsigned-by-unsigned byte sums (fMPY8UU, with
 * fGETUBYTE on both the vector and RtV) producing two word outputs per
 * index i.  uiV (the #u1 operand in the syntax string) does two things:
 *   - it offsets the Rt byte index, which is computed as (k - uiV) & 0x3;
 *   - via VuuV.v[uiV ? 1:0] it selects whether the byte-0 term of output 0
 *     and the byte-2 term of output 1 read VuuV.v[1] instead of VuuV.v[0].
 * Output 0 takes bytes 0..3 with Rt indices (0..3 - uiV) & 3; output 1 takes
 * bytes 0,1 from VuuV.v[1] with Rt indices (2,3 - uiV) & 3 and bytes 2,3
 * with Rt indices (0,1 - uiV) & 3.
 */
762887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
763887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Unsigned Byte 4-way Reduction to Word",
764887d61b2STaylor Simpson    VddV.v[0].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
765887d61b2STaylor Simpson    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
766887d61b2STaylor Simpson    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
767887d61b2STaylor Simpson    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
768887d61b2STaylor Simpson
769887d61b2STaylor Simpson    VddV.v[1].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
770887d61b2STaylor Simpson    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
771887d61b2STaylor Simpson    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
772887d61b2STaylor Simpson    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
773887d61b2STaylor Simpson
774887d61b2STaylor Simpson
/*
 * vrmpyubi_acc: accumulating form of vrmpyubi.  The same eight fMPY8UU
 * products (unsigned vector byte times unsigned Rt byte, Rt index computed
 * as (k - uiV) & 0x3, source vector chosen by VuuV.v[uiV ? 1:0] for the
 * byte-0 term of output 0 and the byte-2 term of output 1) are added onto
 * VxxV.v[0].uw[i] and VxxV.v[1].uw[i] with plain +=; no saturation helper is
 * applied in this body.
 */
775887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
776887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Unsigned Byte 4-way Reduction with accumulate to Word",
777887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
778887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
779887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
780887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
781887d61b2STaylor Simpson
782887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
783887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
784887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
785887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
786887d61b2STaylor Simpson
787887d61b2STaylor Simpson
788887d61b2STaylor Simpson
789887d61b2STaylor Simpson
790887d61b2STaylor Simpson/********************************************
791887d61b2STaylor Simpson*  4-WAY REDUCTION - UNSIGNED BYTE BY  BYTE
792887d61b2STaylor Simpson********************************************/
793887d61b2STaylor Simpson
/*
 * vrmpybus: for each index i (iterator WIDTH argument 32), VdV.w[i] is the
 * sum over k = 0..3 of fMPY8US(unsigned byte k of VuV.uw[i], signed byte k
 * of RtV).
 */
794887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)",
795887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
796887d61b2STaylor Simpson    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
797887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
798887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
799887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
800887d61b2STaylor Simpson
801887d61b2STaylor Simpson
/*
 * vrmpybus_acc: accumulating form of vrmpybus.  The four fMPY8US products
 * (unsigned byte k of VuV.uw[i] times signed byte k of RtV, k = 0..3) are
 * added onto the existing VxV.w[i].
 */
802887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)",
803887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
804887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
805887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
806887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
807887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
808887d61b2STaylor Simpson
809887d61b2STaylor Simpson
/*
 * vrmpybusi: four-term unsigned-vector-byte by signed-Rt-byte sums (fMPY8US)
 * producing two word outputs per index i.  uiV (the #u1 operand in the
 * syntax string) offsets the Rt byte index, computed as (k - uiV) & 0x3, and
 * via VuuV.v[uiV ? 1:0] selects whether the byte-0 term of output 0 and the
 * byte-2 term of output 1 read VuuV.v[1] instead of VuuV.v[0].
 * Output 0 takes bytes 0..3 with Rt indices (0..3 - uiV) & 3; output 1 takes
 * bytes 0,1 from VuuV.v[1] with Rt indices (2,3 - uiV) & 3 and bytes 2,3
 * with Rt indices (0,1 - uiV) & 3.
 */
810887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)",
811887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word",
812887d61b2STaylor Simpson    VddV.v[0].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
813887d61b2STaylor Simpson    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
814887d61b2STaylor Simpson    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
815887d61b2STaylor Simpson    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
816887d61b2STaylor Simpson
817887d61b2STaylor Simpson    VddV.v[1].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
818887d61b2STaylor Simpson    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
819887d61b2STaylor Simpson    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
820887d61b2STaylor Simpson    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
821887d61b2STaylor Simpson
822887d61b2STaylor Simpson
/*
 * vrmpybusi_acc: accumulating form of vrmpybusi.  The same eight fMPY8US
 * products (unsigned vector byte times signed Rt byte, Rt index computed as
 * (k - uiV) & 0x3, source vector chosen by VuuV.v[uiV ? 1:0] for the byte-0
 * term of output 0 and the byte-2 term of output 1) are added onto
 * VxxV.v[0].w[i] and VxxV.v[1].w[i] with plain +=; no saturation helper is
 * applied in this body.
 */
823887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)",
824887d61b2STaylor Simpson"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate to Word",
825887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
826887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
827887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
828887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
829887d61b2STaylor Simpson
830887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
831887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
832887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
833887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
834887d61b2STaylor Simpson
835887d61b2STaylor Simpson
836887d61b2STaylor Simpson
837887d61b2STaylor Simpson
/*
 * vrmpybusv: Vd32.w = vrmpy(Vu32.ub, Vv32.b)
 *
 * VdV.w[i] is set to the sum of four fMPY8US products; product k pairs
 * byte k of VuV.uw[i] (fGETUBYTE) with byte k of VvV.w[i] (fGETBYTE),
 * for k = 0..3.  The first statement assigns, the remaining three add.
 */
838887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)",
839887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
840887d61b2STaylor Simpson    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
841887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
842887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
843887d61b2STaylor Simpson    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
844887d61b2STaylor Simpson
845887d61b2STaylor Simpson
/*
 * vrmpybusv_acc: Vx32.w += vrmpy(Vu32.ub, Vv32.b)
 *
 * Accumulating form of vrmpybusv: the same four byte-wise fMPY8US products
 * are all added into the existing value of VxV.w[i].
 *
 * NOTE(review): this is declared through the _DOUBLE_VEC iterator variant
 * although every operand is a single vector -- confirm that is intended.
 */
846887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)",
847887d61b2STaylor Simpson"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
848887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
849887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
850887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
851887d61b2STaylor Simpson    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
852887d61b2STaylor Simpson
853887d61b2STaylor Simpson
854887d61b2STaylor Simpson
855887d61b2STaylor Simpson
856887d61b2STaylor Simpson
857887d61b2STaylor Simpson
858887d61b2STaylor Simpson
859887d61b2STaylor Simpson
860887d61b2STaylor Simpson
861887d61b2STaylor Simpson
862887d61b2STaylor Simpson
863887d61b2STaylor Simpson/********************************************
864887d61b2STaylor Simpson*  2-WAY REDUCTION - SAD
865887d61b2STaylor Simpson********************************************/
866887d61b2STaylor Simpson
/*
 * vdsaduh: Vdd32.uw = vdsad(Vuu32.uh, Rt32.uh)
 *
 * Two-term sums of fABS differences against the two halfwords of RtV
 * (uhN = fGETUHALF(N, ...)):
 *   v[0].uw[i] = fABS(uh0(Vuu.v[0]) - uh0(Rt)) + fABS(uh1(Vuu.v[0]) - uh1(Rt))
 *   v[1].uw[i] = fABS(uh1(Vuu.v[0]) - uh0(Rt)) + fABS(uh0(Vuu.v[1]) - uh1(Rt))
 *
 * NOTE(review): the description string says "by Byte 4-Way", while the
 * semantics below use halfwords and two terms per result.
 */
867887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)",
868887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word",
869887d61b2STaylor Simpson    VddV.v[0].uw[i]  = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
870887d61b2STaylor Simpson    VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
871887d61b2STaylor Simpson    VddV.v[1].uw[i]  = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
872887d61b2STaylor Simpson    VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
873887d61b2STaylor Simpson
/*
 * vdsaduh_acc: Vxx32.uw += vdsad(Vuu32.uh, Rt32.uh)
 *
 * Accumulating form of vdsaduh: the same four fABS halfword differences
 * are added into VxxV.v[0].uw[i] (first two) and VxxV.v[1].uw[i] (last two)
 * instead of overwriting the destination.
 */
874887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)",
875887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word",
876887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
877887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
878887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
879887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
880887d61b2STaylor Simpson
881887d61b2STaylor Simpson
882887d61b2STaylor Simpson
883887d61b2STaylor Simpson
884887d61b2STaylor Simpson/********************************************
885887d61b2STaylor Simpson*  4-WAY REDUCTION - SAD
886887d61b2STaylor Simpson********************************************/
887887d61b2STaylor Simpson
888887d61b2STaylor Simpson
889887d61b2STaylor Simpson
/*
 * vrsadubi: Vdd32.uw = vrsad(Vuu32.ub, Rt32.ub, #u1)
 *
 * Each destination word is the sum of four fABS differences between a
 * byte of a Vuu word and a byte of RtV, both widened with fZE8_16.
 * The immediate uiV rotates the Rt byte index, (k - uiV) & 3, and selects
 * whether byte 0 (for v[0]) and byte 2 (for v[1]) are read from VuuV.v[1]
 * or from VuuV.v[0].  The first statement of each group assigns, the
 * following three add.
 */
890887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)",
891887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word",
892887d61b2STaylor Simpson    VddV.v[0].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
893887d61b2STaylor Simpson    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
894887d61b2STaylor Simpson    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
895887d61b2STaylor Simpson    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
896887d61b2STaylor Simpson
897887d61b2STaylor Simpson    VddV.v[1].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
898887d61b2STaylor Simpson    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
899887d61b2STaylor Simpson    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
900887d61b2STaylor Simpson    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
901887d61b2STaylor Simpson
/*
 * vrsadubi_acc: Vxx32.uw += vrsad(Vuu32.ub, Rt32.ub, #u1)
 *
 * Accumulating form of vrsadubi: the same eight fABS byte differences
 * (byte selection and Rt rotation controlled by uiV) are added into the
 * existing VxxV.v[0].uw[i] / VxxV.v[1].uw[i] values.
 */
902887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)",
903887d61b2STaylor Simpson"Dual Vector Halfword by Byte 4-Way Reduction to Word",
904887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
905887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
906887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
907887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
908887d61b2STaylor Simpson
909887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
910887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
911887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
912887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
913887d61b2STaylor Simpson
914887d61b2STaylor Simpson
915887d61b2STaylor Simpson
916887d61b2STaylor Simpson
917887d61b2STaylor Simpson
918887d61b2STaylor Simpson
919887d61b2STaylor Simpson
920887d61b2STaylor Simpson
921887d61b2STaylor Simpson
922887d61b2STaylor Simpson
923887d61b2STaylor Simpson/*********************************************************************
924887d61b2STaylor Simpson * MMVECTOR SHIFTING
925887d61b2STaylor Simpson * ******************************************************************/
926887d61b2STaylor Simpson// Macro to shift arithmetically left/right and by either RT or Vv
927887d61b2STaylor Simpson
/*
 * V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE) emits six shift instructions
 * for element type TYPE (element width SIZE bits):
 *
 *   vasr<TYPE>, vasl<TYPE>, vlsr<TYPE>
 *       every element is shifted by the same count, RtV & (SIZE-1).
 *   vasr<TYPE>v, vasl<TYPE>v, vlsr<TYPE>v
 *       the count comes from the matching element VvV.TYPE[i], passed
 *       through fSXTN((LOGSIZE+1), SIZE, ...) and handed to the fBIDIR_*
 *       helpers together with CASTTYPE.
 *
 * The vlsr forms read and write the unsigned view (u<TYPE>) of the
 * elements; the arithmetic forms use the signed view.
 * The macro body ends on the empty continuation line after the last entry,
 * so nothing may be inserted there.
 */
928887d61b2STaylor Simpson#define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE)   \
929887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE,   "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    >> (RtV & (SIZE-1)))) \
930887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE,   "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    << (RtV & (SIZE-1)))) \
931887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE,   "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)",       "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \
932887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTR(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
933887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTL(VuV.TYPE[i],  fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
934887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
935887d61b2STaylor Simpson
/* Instantiate the shift family for 32-bit words and 16-bit halfwords. */
936887d61b2STaylor SimpsonV_SHIFT(w, "word",   32,5,4_4)
937887d61b2STaylor SimpsonV_SHIFT(h, "halfword", 16,4,2_2)
938887d61b2STaylor Simpson
/*
 * vlsrb: Vd32.ub = vlsr(Vu32.ub, Rt32)
 * Each unsigned byte VuV.ub[i] is shifted right by RtV & 0x7; the result is
 * stored through the signed byte view VdV.b[i].
 */
939887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7))
940887d61b2STaylor Simpson
/*
 * vrotr: Vd32.uw = vrotr(Vu32.uw, Vv32.uw)
 * Rotates each word VuV.uw[i] right by VvV.uw[i] & 0x1f.
 *
 * The complementary left-shift count is reduced modulo 32 (& 0x1f).
 * Without the mask a rotate count of 0 gives a left shift by 32, i.e. by
 * the full width of a 32-bit element, which is undefined behaviour in C.
 * With the mask a zero count evaluates to x | x == x; counts 1..31 are
 * unchanged.
 */
941887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << ((32 - (VvV.uw[i] & 0x1f)) & 0x1f))))
942887d61b2STaylor Simpson
943887d61b2STaylor Simpson/*********************************************************************
944887d61b2STaylor Simpson * MMVECTOR SHIFT AND PERMUTE
945887d61b2STaylor Simpson * ******************************************************************/
946887d61b2STaylor Simpson
/*
 * vasr_into: Vxx32.w = vasrinto(Vu32.w, Vv32.w)
 *
 * Per element i, working in 64 bits:
 *   shift  - VuV.w[i] sign-extended and moved into the upper 32 bits.
 *   mask   - VxxV.v[0].w[i] placed in both halves (sign-extended copy
 *            shifted up by 32, ORed with the zero-extended copy).
 *   lomask - (1 << 32) - 1, i.e. the low 32 bits set.
 *   count  - low 7 bits of VvV.w[i] read as a signed amount:
 *            bit 6 contributes -0x40, bits 5..0 contribute 0..0x3f.
 *
 * count == -0x40 yields 0.  A negative count shifts shift/lomask left by
 * -count, otherwise both are shifted right by count; the shifted value is
 * ORed with the bits of mask selected by the shifted lomask.
 * The upper 32 bits of the result are written to v[1].w[i], the lower 32
 * bits to v[0].w[i].
 */
947887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements",
948887d61b2STaylor Simpson    fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32);
949887d61b2STaylor Simpson    fHIDE(int64_t ) mask  = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i]));
950887d61b2STaylor Simpson    fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1);
951887d61b2STaylor Simpson    fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f);
952887d61b2STaylor Simpson    fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count)))));
953887d61b2STaylor Simpson    VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff);
954887d61b2STaylor Simpson    VxxV.v[0].w[i] = (result & 0xffffffff))
955887d61b2STaylor Simpson
956887d61b2STaylor Simpson#define NEW_NARROWING_SHIFT 1
957887d61b2STaylor Simpson
958887d61b2STaylor Simpson#if NEW_NARROWING_SHIFT
/*
 * NARROWING_SHIFT(ITERSIZE, TAG, DSTM, DSTTYPE, SRCTYPE, SYNOPTS, SATFUNC,
 *                 RNDFUNC, SHAMTMASK)
 *
 * Emits instruction TAG with syntax
 *   Vd32.<DSTTYPE> = vasr(Vu32.<SRCTYPE>, Vv32.<SRCTYPE>, Rt8)<SYNOPTS>
 *
 * shamt is RtV & SHAMTMASK.  Each source element is passed through
 * RNDFUNC(x, shamt), shifted right by shamt and then through SATFUNC.
 * DSTM(0, ...) stores the value derived from VvV and DSTM(1, ...) the value
 * derived from VuV, both into VdV.SRCTYPE[i].
 */
959887d61b2STaylor Simpson#define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
960887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \
961887d61b2STaylor Simpson"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
962887d61b2STaylor Simpson"Vector shift right and shuffle", \
963887d61b2STaylor Simpson    fHIDE(int )shamt = RtV & SHAMTMASK; \
964887d61b2STaylor Simpson    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
965887d61b2STaylor Simpson    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
966887d61b2STaylor Simpson
967887d61b2STaylor Simpson
968887d61b2STaylor Simpson
969887d61b2STaylor Simpson
970887d61b2STaylor Simpson
971887d61b2STaylor Simpson/* WORD TO HALF*/
972887d61b2STaylor Simpson
/*
 * Word -> halfword variants: 32-bit iteration, results stored with
 * fSETHALF, shift amount masked with 0xF.  The variants differ in source
 * signedness (w / uw), saturation macro and rounding macro.
 * NARROWING_SHIFT_NOV1 is not defined in this part of the file.
 */
973887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF)
974887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF)
975887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF)
976887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF)
977887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF)
978887d61b2STaylor SimpsonNARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF)
979887d61b2STaylor Simpson
980887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF)
/*
 * Halfword -> byte variants: 16-bit iteration, results stored with
 * fSETBYTE, shift amount masked with 0x7.
 */
981887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7)
982887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7)
983887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7)
984887d61b2STaylor SimpsonNARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7)
985887d61b2STaylor Simpson
/*
 * NARROWING_VECTOR_SHIFT(ITERSIZE, TAG, DSTM, DSTTYPE, SRCTYPE, SRCTYPE2,
 *                        SYNOPTS, SATFUNC, RNDFUNC, SHAMTMASK)
 *
 * Emits instruction TAG with syntax
 *   Vd32.<DSTTYPE> = vasr(Vuu32.<SRCTYPE>, Vv32.<SRCTYPE2>)<SYNOPTS>
 *
 * Like NARROWING_SHIFT, but the shift amount is taken per result from the
 * vector VvV instead of a scalar: VvV.SRCTYPE2[2*i+0] & SHAMTMASK is used
 * for the VuuV.v[0] element stored by DSTM(0, ...), and
 * VvV.SRCTYPE2[2*i+1] & SHAMTMASK for the VuuV.v[1] element stored by
 * DSTM(1, ...).
 */
986b2f20c2cSTaylor Simpson#define NARROWING_VECTOR_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SRCTYPE2,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
987b2f20c2cSTaylor SimpsonITERATOR_INSN_SHIFT3_SLOT(ITERSIZE,TAG, \
988b2f20c2cSTaylor Simpson"Vd32." #DSTTYPE "=vasr(Vuu32." #SRCTYPE ",Vv32." #SRCTYPE2 ")" #SYNOPTS, \
989b2f20c2cSTaylor Simpson"Vector shift by vector right and shuffle", \
990b2f20c2cSTaylor Simpson    fHIDE(int )shamt = VvV.SRCTYPE2[2*i+0] & SHAMTMASK; \
991b2f20c2cSTaylor Simpson    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[0].SRCTYPE[i],shamt) >> shamt)); \
992b2f20c2cSTaylor Simpson    shamt = VvV.SRCTYPE2[2*i+1] & SHAMTMASK; \
993b2f20c2cSTaylor Simpson    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuuV.v[1].SRCTYPE[i],shamt) >> shamt)))
994b2f20c2cSTaylor Simpson
995b2f20c2cSTaylor Simpson/* WORD TO HALF*/
/*
 * Vector-shift-amount narrowing instantiations.  Word sources use shift
 * mask 0xF and fSETHALF; halfword sources use shift mask 0x7 and fSETBYTE.
 * The two trailing NARROWING_SHIFT_NOV1 entries are scalar-shift
 * (Rt8) halfword -> unsigned byte variants; that macro is not defined in
 * this part of the file.
 */
996b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(32,vasrvwuhsat,fSETHALF,uh,w,uh,:sat,fVSATUH,fVNOROUND,0xF)
997b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(32,vasrvwuhrndsat,fSETHALF,uh,w,uh,:rnd:sat,fVSATUH,fVROUND,0xF)
998b2f20c2cSTaylor Simpson/* HALF TO BYTE*/
999b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(16,vasrvuhubsat,fSETBYTE,ub,uh,ub,:sat,fVSATUB,fVNOROUND,0x7)
1000b2f20c2cSTaylor SimpsonNARROWING_VECTOR_SHIFT(16,vasrvuhubrndsat,fSETBYTE,ub,uh,ub,:rnd:sat,fVSATUB,fVROUND,0x7)
1001b2f20c2cSTaylor Simpson
1002887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7)
1003887d61b2STaylor SimpsonNARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7)
1004887d61b2STaylor Simpson
1005887d61b2STaylor Simpson#else
/*
 * Hand-written word -> halfword narrowing shifts.  This branch is selected
 * only when NEW_NARROWING_SHIFT is 0; the file defines it as 1, so the
 * preprocessor currently discards everything up to the matching #endif.
 *
 * Every variant shifts VvV.w[i] / VuV.w[i] right by RtV & 0xF and stores the
 * VvV result in half 0 and the VuV result in half 1 of the destination
 * word.  The :sat forms wrap the value in fVSATH / fVSATUH; the :rnd forms
 * first add fBIDIR_ASHIFTL(1, shamt-1, 4_8).
 */
1006887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)",
1007887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1008887d61b2STaylor Simpson    fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF)));
1009887d61b2STaylor Simpson    fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF))))
1010887d61b2STaylor Simpson
1011887d61b2STaylor Simpson
1012887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat",
1013887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1014887d61b2STaylor Simpson    fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF)));
1015887d61b2STaylor Simpson    fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF))))
1016887d61b2STaylor Simpson
1017887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
1018887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1019887d61b2STaylor Simpson    fHIDE(int ) shamt = RtV & 0xF;
1020887d61b2STaylor Simpson    fSETHALF(0,VdV.w[i], fVSATH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1021887d61b2STaylor Simpson    fSETHALF(1,VdV.w[i], fVSATH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1022887d61b2STaylor Simpson
1023887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
1024887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1025887d61b2STaylor Simpson    fHIDE(int ) shamt = RtV & 0xF;
1026887d61b2STaylor Simpson    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1027887d61b2STaylor Simpson    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1028887d61b2STaylor Simpson
1029887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat",
1030887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1031887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF)));
1032887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF))))
1033887d61b2STaylor Simpson
1034887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat",
1035887d61b2STaylor Simpson"Vector arithmetic shift right words, shuffle even halfwords",
1036887d61b2STaylor Simpson    fHIDE(int ) shamt = RtV & 0xF;
1037887d61b2STaylor Simpson    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1038887d61b2STaylor Simpson    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1039887d61b2STaylor Simpson#endif
1040887d61b2STaylor Simpson
1041887d61b2STaylor Simpson
1042887d61b2STaylor Simpson
/*
 * vroundwh / vroundwuh / vrounduwuh: round words to halfwords and pack.
 *
 * Each source word has fCONSTLL(0x8000) added, is shifted right by 16 and
 * is then passed through a saturation macro (fVSATH for the signed
 * destination, fVSATUH for the unsigned ones).  The VvV result is stored in
 * half 0 and the VuV result in half 1 of VdV.uw[i].
 * vroundwh and vroundwuh read the signed view (.w) of the sources,
 * vrounduwuh reads the unsigned view (.uw).
 */
1043887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat",
1044887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1045887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1046887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1047887d61b2STaylor Simpson
1048887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat",
1049887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1050887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1051887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1052887d61b2STaylor Simpson
1053887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat",
1054887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1055887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16));
1056887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16)))
1057887d61b2STaylor Simpson
1058887d61b2STaylor Simpson
1059887d61b2STaylor Simpson
1060887d61b2STaylor Simpson
1061887d61b2STaylor Simpson
1062887d61b2STaylor Simpson/* HALF TO BYTE*/
1063887d61b2STaylor Simpson
/*
 * vroundhb / vroundhub / vrounduhub: round halfwords to bytes and pack.
 *
 * Each source halfword has 0x80 added, is shifted right by 8 and is then
 * passed through fVSATB (signed destination) or fVSATUB (unsigned
 * destination).  The VvV result is stored in byte 0 and the VuV result in
 * byte 1 of VdV.uh[i].  vrounduhub reads the unsigned view (.uh) of the
 * sources, the other two the signed view (.h).
 *
 * NOTE(review): the description strings say "words to halves" although
 * these semantics narrow halfwords to bytes.
 */
1064887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat",
1065887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1066887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8));
1067887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8)))
1068887d61b2STaylor Simpson
1069887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat",
1070887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1071887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8));
1072887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8)))
1073887d61b2STaylor Simpson
1074887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat",
1075887d61b2STaylor Simpson"Vector round words to halves, shuffle resultant halfwords",
1076887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8));
1077887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8)))
1078887d61b2STaylor Simpson
1079887d61b2STaylor Simpson
/*
 * Shift-and-accumulate: vaslw_acc, vasrw_acc, vaslh_acc, vasrh_acc.
 *
 * Each element of VuV is shifted left (vasl*) or right (vasr*) by RtV
 * masked to the element width minus one (31 for words, 15 for halfwords)
 * and the result is added to the matching element of VxV.
 * The halfword forms use the _NOV1 iterator variant, which is not defined
 * in this part of the file.
 */
1080887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)",
1081887d61b2STaylor Simpson"Vector shift add word",
1082887d61b2STaylor Simpson    VxV.w[i]  +=  (VuV.w[i] << (RtV & (32-1))))
1083887d61b2STaylor Simpson
1084887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)",
1085887d61b2STaylor Simpson"Vector shift add word",
1086887d61b2STaylor Simpson    VxV.w[i]  +=  (VuV.w[i] >> (RtV & (32-1))))
1087887d61b2STaylor Simpson
1088887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)",
1089887d61b2STaylor Simpson"Vector shift add halfword",
1090887d61b2STaylor Simpson    VxV.h[i]  +=  (VuV.h[i] << (RtV & (16-1))))
1091887d61b2STaylor Simpson
1092887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)",
1093887d61b2STaylor Simpson"Vector shift add halfword",
1094887d61b2STaylor Simpson    VxV.h[i]  +=  (VuV.h[i] >> (RtV & (16-1))))
1095887d61b2STaylor Simpson
1096887d61b2STaylor Simpson/**************************************************************************
1097887d61b2STaylor Simpson*
1098887d61b2STaylor Simpson* MMVECTOR ELEMENT-WISE ARITHMETIC
1099887d61b2STaylor Simpson*
1100887d61b2STaylor Simpson**************************************************************************/
1101887d61b2STaylor Simpson
1102887d61b2STaylor Simpson/**************************************************************************
1103887d61b2STaylor Simpson* MACROS GO IN MACROS.DEF NOT HERE!!!
1104887d61b2STaylor Simpson**************************************************************************/
1105887d61b2STaylor Simpson
1106887d61b2STaylor Simpson
/*
 * MMVEC_ABSDIFF(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits vabsdiff<TYPE>.
 * The result element is the larger source element minus the smaller one:
 * VuV.SRC[i] - VvV.SRC[i] when VuV.SRC[i] > VvV.SRC[i], otherwise
 * VvV.SRC[i] - VuV.SRC[i].  Sources are read through the SRC view and the
 * result is written through the DEST view.
 */
1107887d61b2STaylor Simpson#define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1108887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE,                   "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" ,     "Vector Absolute of Difference "DESCR,   VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i]))
1109887d61b2STaylor Simpson
/*
 * MMVEC_ADDU_SAT(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits four
 * instructions built on fVUADDSAT / fVUSUBSAT:
 *   vadd<TYPE>sat, vsub<TYPE>sat        - single vector
 *   vadd<TYPE>sat_dv, vsub<TYPE>sat_dv  - vector pair, v[0] and v[1]
 *                                         processed independently
 * NOTE(review): the vsub entries reuse the "Add & Saturate" description
 * strings.
 * The macro body ends on the empty continuation line after the last entry.
 */
1110887d61b2STaylor Simpson#define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1111887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVUADDSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
1112887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\
1113887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVUSUBSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
1114887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\
1115887d61b2STaylor Simpson
/*
 * MMVEC_ADDS_SAT(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits the same four
 * instruction tags as MMVEC_ADDU_SAT (vadd/vsub <TYPE>sat and their _dv
 * vector-pair forms) but built on fVSADDSAT / fVSSUBSAT.
 * NOTE(review): the vsub entries reuse the "Add & Saturate" description
 * strings.
 * The macro body ends on the empty continuation line after the last entry.
 */
1116887d61b2STaylor Simpson#define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\
1117887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVSADDSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
1118887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\
1119887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVSSUBSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
1120887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\
1121887d61b2STaylor Simpson
/*
 * MMVEC_AVGU(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits two instructions:
 *   vavg<TYPE>     - VdV.DEST[i] = fVAVGU(WIDTH, Vu, Vv)
 *   vavg<TYPE>rnd  - VdV.DEST[i] = fVAVGURND(WIDTH, Vu, Vv)
 * NOTE(review): the ":rnd" description string reads "Average % Round"
 * with no space before DESCR; "%" is presumably meant to be "&".
 */
1122887d61b2STaylor Simpson#define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1123887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",        "Vector Average "DESCR,                                      VdV.DEST[i] = fVAVGU(   WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1124887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",     "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",    "Vector Average % Round"DESCR,                               VdV.DEST[i] = fVAVGURND(WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1125887d61b2STaylor Simpson
1126887d61b2STaylor Simpson
1127887d61b2STaylor Simpson
/*
 * MMVEC_AVGS(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits three
 * instructions:
 *   vavg<TYPE>     - VdV.DEST[i] = fVAVGS(WIDTH, Vu, Vv)
 *   vavg<TYPE>rnd  - VdV.DEST[i] = fVAVGSRND(WIDTH, Vu, Vv)
 *   vnavg<TYPE>    - VdV.DEST[i] = fVNAVGS(WIDTH, Vu, Vv)
 */
1128887d61b2STaylor Simpson#define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1129887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",          "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",          "Vector Average "DESCR,                                      VdV.DEST[i]  = fVAVGS(       WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1130887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",      "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",      "Vector Average % Round"DESCR,                               VdV.DEST[i]  = fVAVGSRND(    WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1131887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE,                       "Vd32=vnavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")",         "Vector Negative Average "DESCR,                             VdV.DEST[i]  = fVNAVGS(      WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1132887d61b2STaylor Simpson
1133887d61b2STaylor Simpson
1134887d61b2STaylor Simpson
1135887d61b2STaylor Simpson
1136887d61b2STaylor Simpson
1137887d61b2STaylor Simpson
1138887d61b2STaylor Simpson
/*
 * MMVEC_ADDWRAP(TYPE, TYPE2, DESCR, WIDTH, DEST, SRC) emits four
 * instructions using plain + and - on the elements (no saturation macro):
 *   vadd<TYPE>, vsub<TYPE>        - single vector
 *   vadd<TYPE>_dv, vsub<TYPE>_dv  - vector pair, v[0] and v[1] processed
 *                                   independently
 * The macro body ends on the empty continuation line after the last entry.
 */
1139887d61b2STaylor Simpson#define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\
1140887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE,                  "Vd32=vadd"TYPE2"(Vu32,Vv32)" ,     "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")",    "Vector Add "DESCR,          VdV.DEST[i] =  VuV.SRC[i] +  VvV.SRC[i])\
1141887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE,                  "Vd32=vsub"TYPE2"(Vu32,Vv32)" ,     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")",    "Vector Sub "DESCR,          VdV.DEST[i] =  VuV.SRC[i] -  VvV.SRC[i])\
1142887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv,  "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" ,  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Add "DESCR,   VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i])\
1143887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv,  "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" ,  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Sub "DESCR,   VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - VvvV.v[1].SRC[i]) \
1144887d61b2STaylor Simpson
1145887d61b2STaylor Simpson
1146887d61b2STaylor Simpson
1147887d61b2STaylor Simpson
1148887d61b2STaylor Simpson
1149887d61b2STaylor Simpson/* Wrapping Adds */
/*
 * Instantiations of the element-wise families.  Argument order is
 * (TYPE, TYPE2, DESCR, WIDTH, DEST, SRC): TYPE forms the instruction tag,
 * TYPE2 and DESCR go into the syntax/description strings, WIDTH is the
 * element width in bits, DEST/SRC name the vector views written and read.
 * MMVEC_AVGU_NOV1 and MMVEC_AVGS_NOV1 are not defined in this part of the
 * file.
 */
1150887d61b2STaylor SimpsonMMVEC_ADDWRAP(b,    "b",    "Byte",         8,   b, b)
1151887d61b2STaylor SimpsonMMVEC_ADDWRAP(h,    "h",    "Halfword",     16,  h, h)
1152887d61b2STaylor SimpsonMMVEC_ADDWRAP(w,    "w",    "Word",         32,   w,    w)
1153887d61b2STaylor Simpson
1154887d61b2STaylor Simpson/* Saturating Adds */
1155887d61b2STaylor SimpsonMMVEC_ADDU_SAT(ub, "ub",    "Unsigned Byte",        8,   ub,    ub)
1156887d61b2STaylor SimpsonMMVEC_ADDU_SAT(uh, "uh",    "Unsigned Halfword",    16,  uh,    uh)
1157887d61b2STaylor SimpsonMMVEC_ADDU_SAT(uw, "uw",    "Unsigned word",    32,  uw,    uw)
1158887d61b2STaylor SimpsonMMVEC_ADDS_SAT(b,  "b",     "byte",             8,  b,     b)
1159887d61b2STaylor SimpsonMMVEC_ADDS_SAT(h,  "h",     "Halfword",             16,  h,     h)
1160887d61b2STaylor SimpsonMMVEC_ADDS_SAT(w,  "w",     "Word",                 32,  w,     w)
1161887d61b2STaylor Simpson
1162887d61b2STaylor Simpson
1163887d61b2STaylor Simpson/* Averaging Instructions */
1164887d61b2STaylor SimpsonMMVEC_AVGU(ub,"ub",     "Unsigned Byte",     8,   ub,   ub)
1165887d61b2STaylor SimpsonMMVEC_AVGU(uh,"uh",     "Unsigned Halfword", 16,  uh,   uh)
1166887d61b2STaylor SimpsonMMVEC_AVGU_NOV1(uw,"uw",     "Unsigned Word",     32,  uw,   uw)
1167887d61b2STaylor SimpsonMMVEC_AVGS_NOV1(b,   "b",    "Byte",               8,   b,   b)
1168887d61b2STaylor SimpsonMMVEC_AVGS(h,   "h",    "Halfword",          16,   h,   h)
1169887d61b2STaylor SimpsonMMVEC_AVGS(w,   "w",    "Word",              32,   w,   w)
1170887d61b2STaylor Simpson
1171887d61b2STaylor Simpson
1172887d61b2STaylor Simpson/* Absolute Difference */
/* The signed h/w forms read signed sources but write the unsigned view. */
1173887d61b2STaylor SimpsonMMVEC_ABSDIFF(ub,"ub",  "Unsigned Byte",        8,   ub,    ub)
1174887d61b2STaylor SimpsonMMVEC_ABSDIFF(uh,"uh",  "Unsigned Halfword",    16,  uh,    uh)
1175887d61b2STaylor SimpsonMMVEC_ABSDIFF(h,"h",        "Halfword",             16,  uh,    h)
1176887d61b2STaylor SimpsonMMVEC_ABSDIFF(w,"w",        "Word",                 32,  uw,    w)
1177887d61b2STaylor Simpson
/*
 * vnavgub: Vd32.b = vnavg(Vu32.ub, Vv32.ub)
 * VdV.b[i] = fVNAVGU(8, VuV.ub[i], VvV.ub[i]): unsigned byte sources,
 * result written through the signed byte view.
 */
1178887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(8,vnavgub, "Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)",
1179887d61b2STaylor Simpson"Vector Negative Average Unsigned Byte", VdV.b[i]   = fVNAVGU(8, VuV.ub[i], VvV.ub[i]))
1180887d61b2STaylor Simpson
/*
 * vaddcarrysat: Vd32.w = vadd(Vu32.w, Vv32.w, Qs4):carry:sat
 * VdV.w[i] = fVSATW(VuV.w[i] + VvV.w[i] + carry-in), where the carry-in
 * for word i is fGETQBIT(QsV, i*4).  QsV is only read.
 * NOTE(review): whether the sum is evaluated in more than 32 bits before
 * saturation depends on how fVSATW expands -- confirm in its definition.
 */
1181887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate",
1182887d61b2STaylor SimpsonVdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4)))
1183887d61b2STaylor Simpson
/*
 * vaddcarry: Vd32.w = vadd(Vu32.w, Vv32.w, Qx4):carry
 * VdV.w[i] = VuV.w[i] + VvV.w[i] + carry-in, with the carry-in for word i
 * read from fGETQBIT(QxV, i*4).
 * QxV is then updated through fSETQBITS(QxV, 4, 0xF, 4*i, ...) with the
 * negated result of fCARRY_FROM_ADD32 for the same three operands.
 * Presumably this sets the four predicate bits of word i to all ones on
 * carry-out and to zero otherwise -- confirm against the macro definitions.
 */
1184887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1185887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4);
1186887d61b2STaylor SimpsonfSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4))))
1187887d61b2STaylor Simpson
/*
 * vsubcarry: Vd32.w = vsub(Vu32.w, Vv32.w, Qx4):carry
 * Subtraction expressed as an addition of the complement:
 * VdV.w[i] = VuV.w[i] + ~VvV.w[i] + carry-in, with the carry-in for word i
 * read from fGETQBIT(QxV, i*4).  QxV is then updated through fSETQBITS
 * with the negated fCARRY_FROM_ADD32 of the same three operands.
 * NOTE(review): the description string says "add w/carry" although this is
 * the subtract form.
 */
1188887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1189887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4);
1190887d61b2STaylor SimpsonfSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4))))
1191887d61b2STaylor Simpson
/*
 * vaddcarryo: Vd32.w, Qe4 = vadd(Vu32.w, Vv32.w):carry
 * Carry-out-only add: VdV.w[i] = VuV.w[i] + VvV.w[i] with no carry-in.
 * QeV is written through fSETQBITS(QeV, 4, 0xF, 4*i, ...) with the negated
 * fCARRY_FROM_ADD32(Vu, Vv, 0).
 */
1192887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only",
1193887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+VvV.w[i];
1194887d61b2STaylor SimpsonfSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0)))
1195887d61b2STaylor Simpson
1196887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only",
1197887d61b2STaylor SimpsonVdV.w[i] = VuV.w[i]+~VvV.w[i]+1;
1198887d61b2STaylor SimpsonfSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1)))
1199887d61b2STaylor Simpson
1200887d61b2STaylor Simpson
/*
 * vsatdw: for each word element i, Vd.w[i] = fVSATDW(Vu.w[i], Vv.w[i]).
 * Per the description string, Vu supplies the upper 32 bits of the 64-bit
 * value being saturated to 32 bits.
 */
1201887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i]))
1202887d61b2STaylor Simpson
1203887d61b2STaylor Simpson
/*
 * MMVEC_ADDSAT_MIX(TAGEND, SATF, WIDTH, DEST, SRC1, SRC2)
 *
 * Emits a saturating add/sub instruction pair whose two sources use different
 * element types:
 *   vadd<TAGEND>: Vd.DEST[i] = SATF(Vu.SRC1[i] + Vv.SRC2[i])
 *   vsub<TAGEND>: Vd.DEST[i] = SATF(Vu.SRC1[i] - Vv.SRC2[i])
 * TAGEND is pasted onto the instruction tag, SATF is the saturation helper
 * applied to the result, WIDTH is the element width passed to the iterator
 * macro, and DEST/SRC1/SRC2 are the element-type suffixes used both in the
 * syntax string and in the vector field accesses.
 *
 * The single instantiation below creates vaddububb_sat / vsubububb_sat:
 * unsigned byte +/- signed byte, result passed through fVSATUB.
 */
1204887d61b2STaylor Simpson#define MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\
1205887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Add mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] +  VvV.SRC2[i]))\
1206887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Sub mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] -  VvV.SRC2[i]))\
1207887d61b2STaylor Simpson
1208887d61b2STaylor SimpsonMMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b)
1209887d61b2STaylor Simpson
1210887d61b2STaylor Simpson/****************************
1211887d61b2STaylor Simpson*   WIDENING
1212887d61b2STaylor Simpson****************************/
1213887d61b2STaylor Simpson
1214887d61b2STaylor Simpson
1215887d61b2STaylor Simpson
1216887d61b2STaylor Simpson
/*
 * vaddubh: widening add of unsigned bytes into halfwords. Iterating over
 * halfword slots i, byte 0 of Vu.uh[i] and Vv.uh[i] are zero-extended
 * (fZE8_16) and summed into Vdd.v[0].h[i]; byte 1 of each goes to
 * Vdd.v[1].h[i]. Even source bytes therefore land in the first vector of the
 * pair and odd source bytes in the second.
 */
1217887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)",
1218887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1219887d61b2STaylor Simpson    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1220887d61b2STaylor Simpson    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1221887d61b2STaylor Simpson
/*
 * vsububh: same layout as vaddubh, but the zero-extended bytes are
 * subtracted (Vu - Vv) instead of added.
 */
1222887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)",
1223887d61b2STaylor Simpson"Vector subtraction with widen into two vectors",
1224887d61b2STaylor Simpson    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1225887d61b2STaylor Simpson    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1226887d61b2STaylor Simpson
1227887d61b2STaylor Simpson
1228887d61b2STaylor Simpson
/*
 * vaddhw: widening add of halfwords into words. Iterating over word slots i,
 * halfword 0 of Vu.w[i] and Vv.w[i] (read with fGETHALF) are summed into
 * Vdd.v[0].w[i]; halfword 1 of each goes to Vdd.v[1].w[i].
 */
1229887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)",
1230887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1231887d61b2STaylor Simpson    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1232887d61b2STaylor Simpson    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1233887d61b2STaylor Simpson
/*
 * vsubhw: same layout as vaddhw, but the halfwords are subtracted (Vu - Vv).
 */
1234887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)",
1235887d61b2STaylor Simpson"Vector subtraction with widen into two vectors",
1236887d61b2STaylor Simpson    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]);
1237887d61b2STaylor Simpson    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i]))
1238887d61b2STaylor Simpson
1239887d61b2STaylor Simpson
/*
 * vadduhw: widening add of unsigned halfwords into words. Iterating over
 * word slots i, unsigned halfword 0 of Vu.uw[i] and Vv.uw[i] are
 * zero-extended (fZE16_32) and summed into Vdd.v[0].w[i]; unsigned halfword 1
 * of each goes to Vdd.v[1].w[i].
 */
1240887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)",
1241887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1242887d61b2STaylor Simpson    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i]));
1243887d61b2STaylor Simpson    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i])))
1244887d61b2STaylor Simpson
/*
 * vsubuhw: same layout as vadduhw, but the zero-extended halfwords are
 * subtracted (Vu - Vv).
 */
1245887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)",
1246887d61b2STaylor Simpson"Vector subtraction with widen into two vectors",
1247887d61b2STaylor Simpson    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i]));
1248887d61b2STaylor Simpson    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i])))
1249887d61b2STaylor Simpson
1250887d61b2STaylor Simpson
1251887d61b2STaylor Simpson
/*
 * Accumulating forms of the widening adds. Each one adds the widened sum to
 * the existing contents of the Vxx register pair (+=) instead of overwriting
 * a fresh destination.
 *
 * vaddhw_acc: Vxx.v[k].w[i] += halfword k of Vu.w[i] + halfword k of Vv.w[i]
 * (read with fGETHALF), for k = 0, 1.
 */
1252887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)",
1253887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1254887d61b2STaylor Simpson    VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1255887d61b2STaylor Simpson    VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1256887d61b2STaylor Simpson
/*
 * vadduhw_acc: as vaddhw_acc, but the halfwords are read with fGETUHALF
 * (unsigned), matching the .uh operands in the syntax string.
 */
1257887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)",
1258887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1259887d61b2STaylor Simpson    VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]);
1260887d61b2STaylor Simpson    VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i]))
1261887d61b2STaylor Simpson
/*
 * vaddubh_acc: Vxx.v[k].h[i] += unsigned byte k of Vu.h[i] + unsigned byte k
 * of Vv.h[i] (read with fGETUBYTE), for k = 0, 1.
 */
1262887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)",
1263887d61b2STaylor Simpson"Vector addition with widen into two vectors",
1264887d61b2STaylor Simpson    VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]);
1265887d61b2STaylor Simpson    VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i]))
1266887d61b2STaylor Simpson
1267887d61b2STaylor Simpson
1268887d61b2STaylor Simpson/****************************
1269887d61b2STaylor Simpson*   Conditional
1270887d61b2STaylor Simpson****************************/
1271887d61b2STaylor Simpson
/*
 * CONDADDSUB(WIDTH, TAGEND, LHSYN, RHSYN, DESCR, LHBEH, RHBEH)
 *
 * Emits four predicated in-place instructions per element type:
 *   vadd<TAGEND>q  : if (Qv)  LHS += RHS
 *   vsub<TAGEND>q  : if (Qv)  LHS -= RHS
 *   vadd<TAGEND>nq : if (!Qv) LHS += RHS
 *   vsub<TAGEND>nq : if (!Qv) LHS -= RHS
 * LHSYN/RHSYN are the operand spellings used in the syntax strings;
 * LHBEH/RHBEH are the matching element accesses used in the semantics.
 * The choice between the updated and the unchanged value is made by
 * fCONDMASK<WIDTH>(QvV, i, a, b); the "nq" forms simply swap the last two
 * arguments, so presumably the helper yields `a` when the predicate for
 * element i is set and `b` otherwise -- confirm against its definition.
 *
 * Note that the byte instantiation operates on the unsigned view (.ub) even
 * though its syntax strings say ".b".
 */
1272887d61b2STaylor Simpson#define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \
1273887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \
1274887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \
1275887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \
1276887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \
1277887d61b2STaylor Simpson
1278887d61b2STaylor SimpsonCONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i])
1279887d61b2STaylor SimpsonCONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i])
1280887d61b2STaylor SimpsonCONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i])
1281887d61b2STaylor Simpson
1282887d61b2STaylor Simpson/*****************************************************
1283887d61b2STaylor Simpson ABSOLUTE VALUES
1284887d61b2STaylor Simpson*****************************************************/
1285887d61b2STaylor Simpson// V65
/*
 * Byte absolute value (uses the _NOV1 iterator variant).
 *   vabsb     : Vd.b[i] = fABS(Vu.b[i])
 *   vabsb_sat : the input is first sign-extended to 16 bits (fSE8_16) so the
 *               absolute value is computed in a wider type, then passed
 *               through fVSATB before being stored.
 */
1286887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb,        "Vd32=vabsb(Vu32)",     "Vd32.b=vabs(Vu32.b)",     "Vector absolute value of bytes",    VdV.b[i]  =  fABS(VuV.b[i]))
1287887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat,    "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes",    VdV.b[i]  =  fVSATB(fABS(fSE8_16(VuV.b[i]))))
1288887d61b2STaylor Simpson
1289887d61b2STaylor Simpson
/*
 * Halfword and word absolute value. The plain forms apply fABS directly; the
 * :sat forms sign-extend to the next wider type (fSE16_32 / fSE32_64) before
 * fABS and then apply fVSATH / fVSATW.
 */
1290887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vabsh,        "Vd32=vabsh(Vu32)",     "Vd32.h=vabs(Vu32.h)",     "Vector absolute value of halfwords",    VdV.h[i]  =  fABS(VuV.h[i]))
1291887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vabsh_sat,    "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords",    VdV.h[i]  =  fVSATH(fABS(fSE16_32(VuV.h[i]))))
1292887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vabsw,        "Vd32=vabsw(Vu32)",     "Vd32.w=vabs(Vu32.w)",     "Vector absolute value of words",        VdV.w[i]  =  fABS(VuV.w[i]))
1293887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vabsw_sat,    "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words",        VdV.w[i]  =  fVSATW(fABS(fSE32_64(VuV.w[i]))))
1294887d61b2STaylor Simpson
1295887d61b2STaylor Simpson
1296887d61b2STaylor Simpson/**************************************************************************
1297887d61b2STaylor Simpson * MMVECTOR MULTIPLICATIONS
1298887d61b2STaylor Simpson * ************************************************************************/
1299887d61b2STaylor Simpson
1300887d61b2STaylor Simpson
1301887d61b2STaylor Simpson/* Byte by Byte */
/*
 * vmpybv: widening signed byte multiply, vector by vector. Iterating over
 * halfword slots i, signed byte k of Vu.h[i] is multiplied (fMPY8SS) by
 * signed byte k of Vv.h[i] and the product stored in Vdd.v[k].h[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1302887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)",
1303887d61b2STaylor Simpson"Vector by Vector Byte Multiply",
1304887d61b2STaylor Simpson    VddV.v[0].h[i] =  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1305887d61b2STaylor Simpson    VddV.v[1].h[i] =  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1306887d61b2STaylor Simpson
/*
 * vmpybv_acc: same products as vmpybv, accumulated (+=) into the Vxx pair.
 */
1307887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)",
1308887d61b2STaylor Simpson"Vector by Vector Byte Multiply and accumulate",
1309887d61b2STaylor Simpson    VxxV.v[0].h[i] +=  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1310887d61b2STaylor Simpson    VxxV.v[1].h[i] +=  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1311887d61b2STaylor Simpson
1312887d61b2STaylor Simpson
/*
 * vmpyubv: widening unsigned byte multiply, vector by vector. Iterating over
 * halfword slots i, unsigned byte k of Vu.uh[i] is multiplied (fMPY8UU) by
 * unsigned byte k of Vv.uh[i] and stored in Vdd.v[k].uh[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1313887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)",
1314887d61b2STaylor Simpson"Vector by Vector Unsigned Byte Multiply",
1315887d61b2STaylor Simpson    VddV.v[0].uh[i] =  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1316887d61b2STaylor Simpson    VddV.v[1].uh[i] =  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1317887d61b2STaylor Simpson
/*
 * vmpyubv_acc: same products as vmpyubv, accumulated (+=) into the Vxx pair.
 */
1318887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)",
1319887d61b2STaylor Simpson"Vector by Vector Unsigned Byte Multiply and accumulate",
1320887d61b2STaylor Simpson    VxxV.v[0].uh[i] +=  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1321887d61b2STaylor Simpson    VxxV.v[1].uh[i] +=  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1322887d61b2STaylor Simpson
1323887d61b2STaylor Simpson
1324887d61b2STaylor Simpson
1325887d61b2STaylor Simpson
1326887d61b2STaylor Simpson
1327887d61b2STaylor Simpson
/*
 * vmpybusv: widening mixed-sign byte multiply, vector by vector. Iterating
 * over halfword slots i, unsigned byte k of Vu.uh[i] is multiplied (fMPY8US)
 * by signed byte k of Vv.h[i] and stored in Vdd.v[k].h[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1328887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)",
1329887d61b2STaylor Simpson"Vector by Vector Unsigned Byte by Signed Byte Multiply",
1330887d61b2STaylor Simpson    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1331887d61b2STaylor Simpson    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1332887d61b2STaylor Simpson
/*
 * vmpybusv_acc: same products as vmpybusv, accumulated (+=) into the Vxx
 * pair.
 */
1333887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)",
1334887d61b2STaylor Simpson"Vector by Vector Unsigned Byte by Signed Byte Multiply and accumulate",
1335887d61b2STaylor Simpson    VxxV.v[0].h[i]  += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1336887d61b2STaylor Simpson    VxxV.v[1].h[i]  += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1337887d61b2STaylor Simpson
1338887d61b2STaylor Simpson
1339887d61b2STaylor Simpson
1340887d61b2STaylor Simpson
/*
 * vmpabusv: multiply-add across two vector pairs. Iterating over halfword
 * slots i, for k = 0, 1:
 *   Vdd.v[k].h[i] = ubyte k of Vuu.v[0].uh[i] * byte k of Vvv.v[0].uh[i]
 *                 + ubyte k of Vuu.v[1].uh[i] * byte k of Vvv.v[1].uh[i]
 * The Vuu operands are read as unsigned bytes and the Vvv operands as signed
 * bytes (fGETBYTE), multiplied with fMPY8US.
 */
1341887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)",
1342887d61b2STaylor Simpson"Vertical Byte Multiply",
1343887d61b2STaylor Simpson    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i]));
1344887d61b2STaylor Simpson    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i])))
1345887d61b2STaylor Simpson
/*
 * vmpabuuv: as vmpabusv, but both operands are read as unsigned bytes and
 * multiplied with fMPY8UU.
 */
1346887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)",
1347887d61b2STaylor Simpson"Vertical Byte Multiply",
1348887d61b2STaylor Simpson    VddV.v[0].h[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i]));
1349887d61b2STaylor Simpson    VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i])))
1350887d61b2STaylor Simpson
1351887d61b2STaylor Simpson
1352887d61b2STaylor Simpson
1353887d61b2STaylor Simpson
1354887d61b2STaylor Simpson
1355887d61b2STaylor Simpson
1356887d61b2STaylor Simpson
/*
 * vmpyhv: widening signed halfword multiply, vector by vector. Iterating
 * over word slots i, halfword k of Vu.w[i] times halfword k of Vv.w[i]
 * (fMPY16SS) is stored in Vdd.v[k].w[i], k = 0, 1.
 */
1357887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)",
1358887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1359887d61b2STaylor Simpson    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1360887d61b2STaylor Simpson    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1361887d61b2STaylor Simpson
/*
 * vmpyhv_acc: same products as vmpyhv, accumulated (+=) into the Vxx pair.
 */
1362887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)",
1363887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1364887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1365887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1366887d61b2STaylor Simpson
/*
 * vmpyuhv: widening unsigned halfword multiply, vector by vector. Iterating
 * over word slots i, unsigned halfword k of Vu.uw[i] times unsigned halfword
 * k of Vv.uw[i] (fMPY16UU) is stored in Vdd.v[k].uw[i], k = 0, 1.
 */
1367887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)",
1368887d61b2STaylor Simpson"Vector by Vector Unsigned Halfword Multiply",
1369887d61b2STaylor Simpson    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1370887d61b2STaylor Simpson    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1371887d61b2STaylor Simpson
/*
 * vmpyuhv_acc: same products as vmpyuhv, accumulated (+=) into the Vxx pair.
 */
1372887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)",
1373887d61b2STaylor Simpson"Vector by Vector Unsigned Halfword Multiply",
1374887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1375887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1376887d61b2STaylor Simpson
1377887d61b2STaylor Simpson
1378887d61b2STaylor Simpson
1379887d61b2STaylor Simpson/* Vector by Vector */
/*
 * vmpyhvsrs: halfword multiply producing a single-vector halfword result.
 * For each halfword element i the signed product Vu.h[i] * Vv.h[i]
 * (fMPY16SS) is shifted left by 1, passed through fROUND and fVSAT, its
 * halfword 1 is extracted (fGETHALF(1, ...)), and that value goes through
 * fVSATH before being stored in Vd.h[i].
 * Although the destination is a single vector, the instruction is declared
 * with the *_DOUBLE_VEC iterator macro.
 */
1380887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat",
1381887d61b2STaylor Simpson"Vector halfword multiply with round, shift, and sat16",
1382887d61b2STaylor Simpson    VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i]    )<<1))))))
1383887d61b2STaylor Simpson
1384887d61b2STaylor Simpson
1385887d61b2STaylor Simpson
/*
 * vmpyuhvs: for each unsigned halfword element i, multiply Vu.uh[i] by
 * Vv.uh[i] (fMPY16UU) and keep unsigned halfword 1 of the product, which the
 * syntax string expresses as ":>>16".
 */
1386b2f20c2cSTaylor SimpsonITERATOR_INSN_MPY_SLOT(16,vmpyuhvs, "Vd32.uh=vmpy(Vu32.uh,Vv32.uh):>>16",
1387b2f20c2cSTaylor Simpson"Vector by Vector Unsigned Halfword Multiply with 16 bit rightshift",
1388b2f20c2cSTaylor Simpson    VdV.uh[i] = fGETUHALF(1,fMPY16UU(VuV.uh[i],VvV.uh[i])))
1389887d61b2STaylor Simpson
1390887d61b2STaylor Simpson
/*
 * vmpyhus: widening mixed-sign halfword multiply, vector by vector.
 * Iterating over word slots i, signed halfword k of Vu.w[i] times unsigned
 * halfword k of Vv.uw[i] (fMPY16SU) is stored in Vdd.v[k].w[i], k = 0, 1.
 */
1391887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)",
1392887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1393887d61b2STaylor Simpson    VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1394887d61b2STaylor Simpson    VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1395887d61b2STaylor Simpson
1396887d61b2STaylor Simpson
/*
 * vmpyhus_acc: same products as vmpyhus, accumulated (+=) into the Vxx pair.
 */
1397887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)",
1398887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1399887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1400887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1401887d61b2STaylor Simpson
1402887d61b2STaylor Simpson
1403887d61b2STaylor Simpson
1404887d61b2STaylor Simpson
/*
 * vmpyih: non-widening halfword multiply. For each halfword element i,
 * Vd.h[i] receives the fMPY16SS product of Vu.h[i] and Vv.h[i]; the result is
 * stored into a halfword, so only the bits that fit the element are kept.
 */
1405887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)",
1406887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1407887d61b2STaylor Simpson    VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i]))
1408887d61b2STaylor Simpson
/*
 * vmpyih_acc: same product as vmpyih, accumulated (+=) into Vx.h[i].
 */
1409887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)",
1410887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1411887d61b2STaylor Simpson    VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i]))
1412887d61b2STaylor Simpson
1413887d61b2STaylor Simpson
1414887d61b2STaylor Simpson
1415887d61b2STaylor Simpson/* 32x32 high half / frac */
1416887d61b2STaylor Simpson
1417887d61b2STaylor Simpson
/*
 * vmpyewuh: word by even unsigned halfword. For each word element i,
 * Vu.w[i] is multiplied (fMPY3216SU) by unsigned halfword 0 of Vv.w[i] and
 * the product shifted right by 16 before being stored in Vd.w[i].
 */
1418887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)",
1419887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1420887d61b2STaylor SimpsonVdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16)
1421887d61b2STaylor Simpson
/*
 * vmpyowh: word by odd signed halfword. For each word element i, Vu.w[i] is
 * multiplied (fMPY3216SS) by halfword 1 of Vv.w[i]; the product is shifted
 * right by 14, 0 is added (no rounding), shifted right by 1 more, and the
 * result passed through fVSATW.
 */
1422887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat",
1423887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1424887d61b2STaylor SimpsonVdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1)))
1425887d61b2STaylor Simpson
/*
 * vmpyowh_rnd: identical to vmpyowh except that 1 is added between the two
 * right shifts, which rounds the value before the final bit is dropped.
 */
1426887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat",
1427887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1428887d61b2STaylor SimpsonVdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1)))
1429887d61b2STaylor Simpson
/*
 * vmpyewuh_64: word by even unsigned halfword with the result spread over a
 * vector pair. For each word element i a 64-bit signed temporary `prod`
 * holds fMPY32SU(Vu.w[i], unsigned halfword 0 of Vv.w[i]); Vdd.v[1].w[i]
 * receives prod >> 16 and Vdd.v[0].w[i] receives prod << 16.
 */
1430887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)",
1431887d61b2STaylor Simpson"Word times Halfword Multiply, 64-bit result",
1432887d61b2STaylor Simpson	fHIDE(size8s_t prod;)
1433887d61b2STaylor Simpson	prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i]));
1434887d61b2STaylor Simpson	VddV.v[1].w[i] = prod >> 16;
1435887d61b2STaylor Simpson	VddV.v[0].w[i] = prod << 16)
1436887d61b2STaylor Simpson
/*
 * vmpyowh_64_acc: accumulating companion of vmpyewuh_64. For each word
 * element i, `prod` is fMPY32SS(Vu.w[i], halfword 1 of Vv.w[i]) plus the
 * sign-extended current value of Vxx.v[1].w[i]. Then:
 *   - Vxx.v[1].w[i] is replaced by prod >> 16;
 *   - halfword 0 of Vxx.v[0].w[i] is set to the old Vxx.v[0].w[i] >> 16;
 *   - halfword 1 of Vxx.v[0].w[i] is set to the low 16 bits of prod.
 * The two fSETHALF statements are order-dependent: the first reads
 * Vxx.v[0].w[i] before the second overwrites its upper halfword.
 */
1437887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)",
1438887d61b2STaylor Simpson"Word times Halfword Multiply, 64-bit result",
1439887d61b2STaylor Simpson	fHIDE(size8s_t prod;)
1440887d61b2STaylor Simpson	prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i]))  + fSE32_64(VxxV.v[1].w[i]);
1441887d61b2STaylor Simpson	VxxV.v[1].w[i] = prod >> 16;
1442887d61b2STaylor Simpson	fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16);
1443887d61b2STaylor Simpson	fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff))
1444887d61b2STaylor Simpson
/*
 * vmpyowh_sacc: accumulating form of vmpyowh. For each word element i the
 * current Vx.w[i] is added to fMPY3216SS(Vu.w[i], halfword 1 of Vv.w[i])
 * before the >> 14, + 0, >> 1 sequence and fVSATW; the result overwrites
 * Vx.w[i]. The statement is prefixed with the IV1DEAD() marker macro.
 */
1445887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift",
1446887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1447887d61b2STaylor SimpsonIV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1)))
1448887d61b2STaylor Simpson
/*
 * vmpyowh_rnd_sacc: as vmpyowh_sacc, but 1 is added between the two right
 * shifts to round the result.
 */
1449887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift",
1450887d61b2STaylor Simpson"Vector by Vector Halfword Multiply",
1451887d61b2STaylor SimpsonIV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1)))
1452887d61b2STaylor Simpson
1453887d61b2STaylor Simpson/* For 32x32 integer / low half */
1454887d61b2STaylor Simpson
/*
 * vmpyieoh: for each word element i, multiply halfword 0 (even) of Vu.w[i]
 * by halfword 1 (odd) of Vv.w[i] and shift the product left by 16 before
 * storing it in Vd.w[i].
 */
1455887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even multiply for 32x32 low half",
1456887d61b2STaylor Simpson	VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16)
1457887d61b2STaylor Simpson
/*
 * vmpyiewuh: for each word element i, Vd.w[i] = fMPY3216SU(Vu.w[i], unsigned
 * halfword 0 of Vv.w[i]); the product is stored into a 32-bit element with
 * no shift.
 */
1458887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)",
1459887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply",
1460887d61b2STaylor SimpsonIV1DEAD()    VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1461887d61b2STaylor Simpson
/*
 * vmpyiowh: for each word element i, Vd.w[i] = fMPY3216SS(Vu.w[i], signed
 * halfword 1 of Vv.w[i]); the product is stored into a 32-bit element with
 * no shift.
 */
1462887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)",
1463887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply",
1464887d61b2STaylor SimpsonIV1DEAD()    VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) )
1465887d61b2STaylor Simpson
1466887d61b2STaylor Simpson/* Add back these... */
1467887d61b2STaylor Simpson
/*
 * vmpyiewh_acc: for each word element i, add fMPY3216SS(Vu.w[i], signed
 * halfword 0 of Vv.w[i]) to Vx.w[i] in place.
 */
1468887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)",
1469887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply",
1470887d61b2STaylor SimpsonVxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) )
1471887d61b2STaylor Simpson
/*
 * vmpyiewuh_acc: as vmpyiewh_acc, but the halfword is read unsigned
 * (fGETUHALF) and multiplied with fMPY3216SU.
 */
1472887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)",
1473887d61b2STaylor Simpson"Vector by Vector Word by Halfword Multiply",
1474887d61b2STaylor SimpsonVxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1475887d61b2STaylor Simpson
1476887d61b2STaylor Simpson
1477887d61b2STaylor Simpson
1478887d61b2STaylor Simpson
1479887d61b2STaylor Simpson
1480887d61b2STaylor Simpson
1481887d61b2STaylor Simpson
1482887d61b2STaylor Simpson/* Vector by Scalar */
/*
 * vmpyub: widening unsigned byte multiply, vector by scalar. Iterating over
 * halfword slots i, unsigned byte k of Vu.uh[i] is multiplied (fMPY8UU) by
 * byte (2*i+k) % 4 of the scalar Rt, so the four bytes of Rt are reused
 * cyclically across the vector; the product goes to Vdd.v[k].uh[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1483887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)",
1484887d61b2STaylor Simpson"Vector by Scalar Unsigned Byte Multiply",
1485887d61b2STaylor Simpson    VddV.v[0].uh[i]  = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1486887d61b2STaylor Simpson    VddV.v[1].uh[i]  = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1487887d61b2STaylor Simpson
/*
 * vmpyub_acc: same products as vmpyub, accumulated (+=) into the Vxx pair.
 */
1488887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)",
1489887d61b2STaylor Simpson"Vector by Scalar Unsigned Byte Multiply and accumulate",
1490887d61b2STaylor Simpson    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1491887d61b2STaylor Simpson    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1492887d61b2STaylor Simpson
1493887d61b2STaylor Simpson
/*
 * vmpybus: widening mixed-sign byte multiply, vector by scalar. Iterating
 * over halfword slots i, unsigned byte k of Vu.uh[i] is multiplied (fMPY8US)
 * by signed byte (2*i+k) % 4 of the scalar Rt; the product goes to
 * Vdd.v[k].h[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1494887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)",
1495887d61b2STaylor Simpson"Vector by Scalar Unsigned Byte by Signed Byte Multiply",
1496887d61b2STaylor Simpson    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1497887d61b2STaylor Simpson    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1498887d61b2STaylor Simpson
/*
 * vmpybus_acc: same products as vmpybus, accumulated (+=) into the Vxx pair.
 */
1499887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)",
1500887d61b2STaylor Simpson"Vector by Scalar Unsigned Byte by Signed Byte Multiply and accumulate",
1501887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1502887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1503887d61b2STaylor Simpson
1504887d61b2STaylor Simpson
/*
 * vmpabus: multiply-add of a vector pair by scalar bytes. Iterating over
 * halfword slots i:
 *   Vdd.v[0].h[i] = ubyte 0 of Vuu.v[0].uh[i] * Rt.b[0]
 *                 + ubyte 0 of Vuu.v[1].uh[i] * Rt.b[1]
 *   Vdd.v[1].h[i] = ubyte 1 of Vuu.v[0].uh[i] * Rt.b[2]
 *                 + ubyte 1 of Vuu.v[1].uh[i] * Rt.b[3]
 * NOTE(review): the first term of each sum uses fMPY8US while the second uses
 * fMPY16SS on the same kind of operands (unsigned byte, signed byte);
 * presumably the two are equivalent for these value ranges -- confirm.
 */
1505887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)",
1506887d61b2STaylor Simpson"Vertical Byte Multiply",
1507887d61b2STaylor Simpson    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1508887d61b2STaylor Simpson    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1509887d61b2STaylor Simpson
/*
 * vmpabus_acc: same sums as vmpabus, accumulated (+=) into the Vxx pair.
 */
1510887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)",
1511887d61b2STaylor Simpson"Vertical Byte Multiply",
1512887d61b2STaylor Simpson    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1513887d61b2STaylor Simpson    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1514887d61b2STaylor Simpson
1515887d61b2STaylor Simpson// V65
1516887d61b2STaylor Simpson
/*
 * vmpabuu: unsigned counterpart of vmpabus (uses the _NOV1 iterator variant).
 * Iterating over halfword slots i:
 *   Vdd.v[0].uh[i] = ubyte 0 of Vuu.v[0].uh[i] * Rt.ub[0]
 *                  + ubyte 0 of Vuu.v[1].uh[i] * Rt.ub[1]
 *   Vdd.v[1].uh[i] = ubyte 1 of Vuu.v[0].uh[i] * Rt.ub[2]
 *                  + ubyte 1 of Vuu.v[1].uh[i] * Rt.ub[3]
 * All products use fMPY8UU.
 */
1517887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)",
1518887d61b2STaylor Simpson"Vertical Byte Multiply",
1519887d61b2STaylor Simpson    VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1520887d61b2STaylor Simpson    VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1521887d61b2STaylor Simpson
/*
 * vmpabuu_acc: same sums as vmpabuu, accumulated (+=) into the Vxx pair.
 */
1522887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)",
1523887d61b2STaylor Simpson"Vertical Byte Multiply",
1524887d61b2STaylor Simpson    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1525887d61b2STaylor Simpson    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1526887d61b2STaylor Simpson
1527887d61b2STaylor Simpson
1528887d61b2STaylor Simpson
1529887d61b2STaylor Simpson
1530887d61b2STaylor Simpson/* Half by Byte */
/*
 * vmpahb: multiply-add of signed halfwords from a vector pair by scalar
 * bytes. Iterating over word slots i:
 *   Vdd.v[0].w[i] = half 0 of Vuu.v[0].w[i] * Rt.b[0]
 *                 + half 0 of Vuu.v[1].w[i] * Rt.b[1]
 *   Vdd.v[1].w[i] = half 1 of Vuu.v[0].w[i] * Rt.b[2]
 *                 + half 1 of Vuu.v[1].w[i] * Rt.b[3]
 * Each scalar byte is sign-extended to 16 bits (fSE8_16) before fMPY16SS.
 */
1531887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)",
1532887d61b2STaylor Simpson"Vertical Byte Multiply",
1533887d61b2STaylor Simpson    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1534887d61b2STaylor Simpson    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1535887d61b2STaylor Simpson
/*
 * vmpahb_acc: same sums as vmpahb, accumulated (+=) into the Vxx pair.
 */
1536887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)",
1537887d61b2STaylor Simpson"Vertical Byte Multiply",
1538887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1539887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1540887d61b2STaylor Simpson
1541887d61b2STaylor Simpson/* Unsigned Half by Byte */
/*
 * vmpauhb: multiply-add of unsigned halfwords from a vector pair by signed
 * scalar bytes. Iterating over word slots i:
 *   Vdd.v[0].w[i] = uhalf 0 of Vuu.v[0].w[i] * Rt.b[0]
 *                 + uhalf 0 of Vuu.v[1].w[i] * Rt.b[1]
 *   Vdd.v[1].w[i] = uhalf 1 of Vuu.v[0].w[i] * Rt.b[2]
 *                 + uhalf 1 of Vuu.v[1].w[i] * Rt.b[3]
 * Each scalar byte is sign-extended to 16 bits (fSE8_16) before fMPY16US.
 */
1542887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)",
1543887d61b2STaylor Simpson"Vertical Byte Multiply",
1544887d61b2STaylor Simpson    VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1545887d61b2STaylor Simpson    VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1546887d61b2STaylor Simpson
/*
 * vmpauhb_acc: same sums as vmpauhb, accumulated (+=) into the Vxx pair.
 */
1547887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)",
1548887d61b2STaylor Simpson"Vertical Byte Multiply",
1549887d61b2STaylor Simpson    VxxV.v[0].w[i] += fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1550887d61b2STaylor Simpson    VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1551887d61b2STaylor Simpson
1552887d61b2STaylor Simpson
1553887d61b2STaylor Simpson
1554887d61b2STaylor Simpson
1555887d61b2STaylor Simpson
1556887d61b2STaylor Simpson
1557887d61b2STaylor Simpson
1558887d61b2STaylor Simpson/* Half by Half */
/*
 * vmpyh: widening signed halfword multiply, vector by scalar. Iterating over
 * word slots i, halfword k of Vu.w[i] is multiplied (fMPY16SS) by halfword k
 * of the scalar Rt and stored in Vdd.v[k].w[i], k = 0, 1.
 * The description string was a copy of the vabsw text; it now describes the
 * multiply.
 */
1559887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)",
1560887d61b2STaylor Simpson"Vector by Scalar Halfword Multiply",
1561887d61b2STaylor Simpson    VddV.v[0].w[i] =  fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1562887d61b2STaylor Simpson    VddV.v[1].w[i] =  fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1563887d61b2STaylor Simpson
/*
 * vmpyh_acc: accumulating form of vmpyh (uses the _NOV1 iterator variant).
 * For k = 0, 1 the current Vxx.v[k].w[i] is widened with fCAST8s, the
 * fMPY16SS product of halfword k of Vu.w[i] and halfword k of Rt is added,
 * and the sum is stored back into the 32-bit element without saturation.
 * NOTE(review): the description string mentions "shift and sat32", but this
 * body contains neither a shift nor a saturation helper.
 */
1564887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)",
1565887d61b2STaylor Simpson"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1566887d61b2STaylor Simpson    VxxV.v[0].w[i] =  fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1567887d61b2STaylor Simpson    VxxV.v[1].w[i] =  fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1568887d61b2STaylor Simpson
1569887d61b2STaylor Simpson
/*
 * vmpyhsat_acc: as vmpyh_acc, but the widened sum is passed through fVSATW
 * before being stored, matching the ":sat" suffix in the syntax string.
 * NOTE(review): the description string mentions a shift that the body does
 * not perform.
 */
1570887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat",
1571887d61b2STaylor Simpson"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1572887d61b2STaylor Simpson    VxxV.v[0].w[i] =  fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)));
1573887d61b2STaylor Simpson    VxxV.v[1].w[i] =  fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))))
1574887d61b2STaylor Simpson
1575887d61b2STaylor Simpson
1576887d61b2STaylor Simpson
/*
 * vmpyhss: halfword by scalar halfword multiply with a single-vector
 * halfword result. Iterating over word slots i, for k = 0, 1 the fMPY16SS
 * product of halfword k of Vu.w[i] and halfword k of Rt is shifted left by
 * 1 and passed through fVSAT; halfword 1 of that value is extracted, passed
 * through fVSATH, and written to halfword k of Vd.w[i] with fSETHALF.
 * The semantics argument ends with a trailing ';' before the closing ')'.
 */
1577887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat",
1578887d61b2STaylor Simpson"Vector halfword by halfword multiply, shift by 1, and take upper 16 msb",
1579887d61b2STaylor Simpson          fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))));
1580887d61b2STaylor Simpson          fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))));
1581887d61b2STaylor Simpson)
1582887d61b2STaylor Simpson
/*
 * vmpyhsrs: as vmpyhss, with fROUND applied to the shifted product before
 * fVSAT (":rnd" in the syntax string).
 */
1583887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat",
1584887d61b2STaylor Simpson"Vector halfword with scalar halfword multiply with round, shift, and sat16",
1585887d61b2STaylor Simpson       fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))));
1586887d61b2STaylor Simpson       fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))));
1587887d61b2STaylor Simpson)
1588887d61b2STaylor Simpson
1589887d61b2STaylor Simpson
/*
 * vmpyuh: for each index i, write fMPY16UU products into the pair Vdd:
 *   v[0].uw[i] = (unsigned half 0 of Vu.uw[i]) x (unsigned half 0 of Rt)
 *   v[1].uw[i] = (unsigned half 1 of Vu.uw[i]) x (unsigned half 1 of Rt)
 * NOTE(review): the description says "even halfword", yet both halfword
 * positions are multiplied here - wording looks imprecise.
 */
1590887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)",
1591887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar",
1592887d61b2STaylor Simpson    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1593887d61b2STaylor Simpson    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1594887d61b2STaylor Simpson
1595887d61b2STaylor Simpson
/*
 * vmpyuh_acc: accumulating form of the unsigned halfword multiply.
 * For each index i, adds (+=, no saturation macro is applied here) the
 * fMPY16UU product of halfword 0 of Vu.uw[i] and Rt to Vxx.v[0].uw[i],
 * and the product of halfword 1 of each to Vxx.v[1].uw[i].
 * NOTE(review): the description says "even halfword", but both halfword
 * positions are used.
 */
1596887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)",
1597887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar",
1598887d61b2STaylor Simpson    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1599887d61b2STaylor Simpson    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1600887d61b2STaylor Simpson
1601887d61b2STaylor Simpson
1602887d61b2STaylor Simpson
1603887d61b2STaylor Simpson
1604887d61b2STaylor Simpson/********************************************
1605887d61b2STaylor Simpson*  HALF BY BYTE
1606887d61b2STaylor Simpson********************************************/
/*
 * vmpyihb: for each halfword index i, Vd.h[i] receives
 * fMPY16SS(Vu.h[i], byte (i % 4) of Rt); the scalar byte is selected
 * cyclically by the element index.
 * NOTE(review): the description says "word by byte", but the operands are
 * halfwords (see the HALF BY BYTE heading) - looks like a copy/paste slip.
 */
1607887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)",
1608887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1609887d61b2STaylor SimpsonVdV.h[i]  = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1610887d61b2STaylor Simpson
/*
 * vmpyihb_acc: accumulating halfword-by-byte multiply.  For each halfword
 * index i, fMPY16SS(Vu.h[i], byte (i % 4) of Rt) is added (+=) to Vx.h[i].
 * NOTE(review): the description says "word by byte" although the vector
 * operand is accessed as halfwords.
 */
1611887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)",
1612887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1613887d61b2STaylor SimpsonVxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1614887d61b2STaylor Simpson
1615887d61b2STaylor Simpson
1616887d61b2STaylor Simpson/********************************************
1617887d61b2STaylor Simpson*  WORD BY BYTE
1618887d61b2STaylor Simpson********************************************/
/*
 * vmpyiwb: for each word index i, Vd.w[i] = fMPY32SS(Vu.w[i], b) where b is
 * byte (i % 4) of Rt read with fGETBYTE (presumably the signed accessor,
 * matching the ".b" in the syntax string - confirm in the macro header).
 */
1619887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)",
1620887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1621887d61b2STaylor SimpsonVdV.w[i]  = fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1622887d61b2STaylor Simpson
/*
 * vmpyiwb_acc: accumulating word-by-byte multiply.  For each word index i,
 * fMPY32SS(Vu.w[i], byte (i % 4) of Rt via fGETBYTE) is added to Vx.w[i].
 */
1623887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)",
1624887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1625887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1626887d61b2STaylor Simpson
/*
 * vmpyiwub: like the ".b" word-by-byte multiply, but the scalar byte
 * (i % 4) of Rt is read with fGETUBYTE (the ".ub" operand in the syntax).
 * Vd.w[i] = fMPY32SS(Vu.w[i], that byte).
 */
1627887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)",
1628887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1629887d61b2STaylor SimpsonVdV.w[i]  = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1630887d61b2STaylor Simpson
/*
 * vmpyiwub_acc: accumulating form; for each word index i adds
 * fMPY32SS(Vu.w[i], byte (i % 4) of Rt via fGETUBYTE) to Vx.w[i].
 */
1631887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)",
1632887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1633887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1634887d61b2STaylor Simpson
1635887d61b2STaylor Simpson
1636887d61b2STaylor Simpson/********************************************
1637887d61b2STaylor Simpson*  WORD BY HALF
1638887d61b2STaylor Simpson********************************************/
/*
 * vmpyiwh: for each word index i, Vd.w[i] = fMPY32SS(Vu.w[i], h) where h is
 * halfword (i % 2) of Rt, so even and odd words use different halves.
 * NOTE(review): the description says "word by byte" but the scalar operand
 * is a halfword (see the WORD BY HALF heading) - likely copy/paste.
 */
1639887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)",
1640887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1641887d61b2STaylor SimpsonVdV.w[i]  = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1642887d61b2STaylor Simpson
/*
 * vmpyiwh_acc: accumulating word-by-halfword multiply.  For each word index
 * i, fMPY32SS(Vu.w[i], halfword (i % 2) of Rt) is added to Vx.w[i].
 * NOTE(review): the description says "word by byte"; the operand is a
 * halfword.
 */
1643887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)",
1644887d61b2STaylor Simpson"Vector word by byte multiply, keep lower result",
1645887d61b2STaylor SimpsonVxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1646887d61b2STaylor Simpson
1647887d61b2STaylor Simpson
1648887d61b2STaylor Simpson
1649887d61b2STaylor Simpson
1650887d61b2STaylor Simpson
1651887d61b2STaylor Simpson
1652887d61b2STaylor Simpson
1653887d61b2STaylor Simpson
1654887d61b2STaylor Simpson
1655887d61b2STaylor Simpson
1656887d61b2STaylor Simpson
1657887d61b2STaylor Simpson
1658887d61b2STaylor Simpson
1659887d61b2STaylor Simpson
1660887d61b2STaylor Simpson
1661887d61b2STaylor Simpson
1662887d61b2STaylor Simpson
1663887d61b2STaylor Simpson
1664887d61b2STaylor Simpson
1665887d61b2STaylor Simpson/**************************************************************************
1666887d61b2STaylor Simpson * MMVECTOR LOGICAL OPERATIONS
1667887d61b2STaylor Simpson * ************************************************************************/
/*
 * Bitwise vector logic, expressed per 16-bit element: vand (&), vor (|),
 * vxor (^) combine Vu and Vv; vnot stores the bitwise complement of Vu.
 * NOTE(review): the binary forms read Vv through the .h view while Vu and
 * the destination use .uh; presumably harmless because only the low 16 bits
 * are stored back - confirm the element types in the vector union.
 */
1668887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i])
1669887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)",  "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i])
1670887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR",    VdV.uh[i] = VuV.uh[i] ^ VvV.h[i])
1671887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)",     "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i])
1672887d61b2STaylor Simpson
1673887d61b2STaylor Simpson
1674887d61b2STaylor Simpson
1675887d61b2STaylor Simpson
1676887d61b2STaylor Simpson
/*
 * vandqrt: expand predicate Qu into a byte vector.  For each byte index i,
 * Vd.ub[i] is byte (i % 4) of Rt when predicate bit i is set, else 0.
 */
1677887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt,
1678887d61b2STaylor Simpson"Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector",
1679887d61b2STaylor Simpson    VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1680887d61b2STaylor Simpson
/*
 * vandqrt_acc: OR-accumulating form.  For each byte index i, Vx.ub[i] is
 * OR-ed with byte (i % 4) of Rt when predicate bit i is set, and with 0
 * (i.e. left unchanged) otherwise.  The conditional expression binds
 * tighter than "|=", so the whole ternary is the right-hand operand.
 */
1681887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc,
1682887d61b2STaylor Simpson"Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)",  "Insert Predicate into Vector",
1683887d61b2STaylor Simpson    VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1684887d61b2STaylor Simpson
/*
 * vandnqrt: inverted-predicate form.  For each byte index i, Vd.ub[i] is
 * byte (i % 4) of Rt when predicate bit i of Qu is clear, else 0.
 */
1685887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt,
1686887d61b2STaylor Simpson"Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector",
1687887d61b2STaylor Simpson    VdV.ub[i] = !fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1688887d61b2STaylor Simpson
/*
 * vandnqrt_acc: OR-accumulating, inverted-predicate form.  For each byte
 * index i, Vx.ub[i] is OR-ed with byte (i % 4) of Rt when predicate bit i of
 * Qu is clear, and with 0 otherwise.  "!" applies to the predicate bit and
 * the ternary as a whole is the operand of "|=".
 */
1689887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc,
1690887d61b2STaylor Simpson"Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)",  "Insert Predicate into Vector",
1691887d61b2STaylor Simpson    VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1692887d61b2STaylor Simpson
1693887d61b2STaylor Simpson
/*
 * vandvrt: build predicate Qd from a vector.  Predicate bit i is set to 1
 * when (Vu.ub[i] & byte (i % 4) of Rt) is non-zero, otherwise to 0.
 */
1694887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt,
1695887d61b2STaylor Simpson"Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate",
1696887d61b2STaylor Simpson    fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))
1697887d61b2STaylor Simpson
/*
 * vandvrt_acc: OR-accumulating form.  Predicate bit i of Qx becomes its
 * previous value OR-ed with ((Vu.ub[i] & byte (i % 4) of Rt) != 0).
 */
1698887d61b2STaylor SimpsonITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc,
1699887d61b2STaylor Simpson"Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ",
1700887d61b2STaylor Simpson    fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)))
1701887d61b2STaylor Simpson
/*
 * vandvqv: byte mask.  Vd.b[i] keeps Vu.b[i] where predicate bit i of Qv is
 * set and is written as 0 elsewhere.
 */
1702887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes",
1703887d61b2STaylor SimpsonVdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0)
/*
 * vandvnqv: inverted byte mask.  Vd.b[i] keeps Vu.b[i] where predicate bit
 * i of Qv is clear and is written as 0 where it is set.
 */
1704887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes",
1705887d61b2STaylor SimpsonVdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0)
1706887d61b2STaylor Simpson
1707887d61b2STaylor Simpson
1708887d61b2STaylor Simpson /***************************************************
1709887d61b2STaylor Simpson * Compare Vector with Vector
1710887d61b2STaylor Simpson ***************************************************/
/*
 * VCMP: shared semantics body for the vector compare instructions.
 *   DEST    predicate register written with fSETQBITS
 *   ASRC    optional expression combined with the compare result
 *   ASRCOP  operator joining ASRC and the result (both empty for plain set)
 *   CMP     C comparison operator applied to VuV.SRC[] and VvV.SRC[]
 *   N       not referenced in the expansion
 *   SRC     element view of the vectors (e.g. w, h, b, uw, ...)
 *   MASK    value written for a true comparison (0 is written for false)
 *   WIDTH   step of the byte index i; element index is i / WIDTH
 * The loop walks byte offsets 0 .. fVBYTES()-1 in steps of WIDTH and writes
 * one WIDTH-wide group of predicate bits per element.
 */
1711887d61b2STaylor Simpson#define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH)        \
1712887d61b2STaylor Simpson{ \
1713887d61b2STaylor Simpson       for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \
1714887d61b2STaylor Simpson		fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? MASK : 0)); \
1715887d61b2STaylor Simpson    } \
1716887d61b2STaylor Simpson       }
1717887d61b2STaylor Simpson
1718887d61b2STaylor Simpson
/*
 * MMVEC_CMPGT: emits the four "greater than" compare instructions for one
 * element type: V6_vgt<TYPE> (plain set of Qd) and the _and / _or / _xor
 * forms, which combine the result with the current bits of Qx read through
 * fGETQBITS.  TYPE2 is spliced into the syntax string, DESCR into the
 * description; TYPE3 is not referenced in the expansion.  N, SRC, MASK and
 * WIDTH are forwarded to VCMP.
 */
1719887d61b2STaylor Simpson#define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1720887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE,       "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \
1721887d61b2STaylor Simpson	VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \
1722887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \
1723887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \
1724887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_or,  "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \
1725887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \
1726887d61b2STaylor SimpsonEXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \
1727887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH))
1728887d61b2STaylor Simpson
/*
 * MMVEC_CMP: emits the full compare family for one element type - first
 * the four "greater than" instructions via MMVEC_CMPGT, then the four
 * "equal" instructions V6_veq<TYPE>, _and, _or and _xor, which use VCMP
 * with the == operator.  Parameters have the same meaning as in
 * MMVEC_CMPGT.
 */
1729887d61b2STaylor Simpson#define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\
1730887d61b2STaylor SimpsonMMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1731887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE,       "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \
1732887d61b2STaylor Simpson	VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \
1733887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-and", \
1734887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \
1735887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_or,  "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-or", \
1736887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \
1737887d61b2STaylor SimpsonEXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-xor", \
1738887d61b2STaylor Simpson	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH))
1739887d61b2STaylor Simpson
1740887d61b2STaylor Simpson
/*
 * Compare instantiations.  The w/h/b views get both the gt and eq families
 * (MMVEC_CMP); the uw/uh/ub views get only the gt family (MMVEC_CMPGT).
 * MASK/WIDTH pairs are 0xF/4 for words, 0x3/2 for halfwords, 0x1/1 for
 * bytes.
 * NOTE(review): two description strings look copy-pasted - the "b" line
 * says "Vector Half Compare" and the "uw" line says "Vector Unsigned Half
 * Compare"; presumably "Byte" and "Unsigned Word" were intended.
 */
1741887d61b2STaylor SimpsonMMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w)
1742887d61b2STaylor SimpsonMMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h)
1743887d61b2STaylor SimpsonMMVEC_CMP(b,"b","","Vector Half Compare ", fVELEM(8),  0x1, 1, b)
1744887d61b2STaylor SimpsonMMVEC_CMPGT(uw,"uw","","Vector Unsigned Half Compare ", fVELEM(32), 0xF, 4,uw)
1745887d61b2STaylor SimpsonMMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh)
1746887d61b2STaylor SimpsonMMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8),  0x1, 1,ub)
1747887d61b2STaylor Simpson
1748887d61b2STaylor Simpson/***************************************************
1749887d61b2STaylor Simpson* Predicate Operations
1750887d61b2STaylor Simpson***************************************************/
1751887d61b2STaylor Simpson
/*
 * V6_pred_scalar2 (vsetq): sets predicate bit i for every byte index i
 * below (Rt & (fVBYTES()-1)) and clears the rest.  When the masked value is
 * 0 no bit is set.
 */
1752887d61b2STaylor SimpsonEXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1753887d61b2STaylor Simpson{
1754887d61b2STaylor Simpson    fHIDE(int i;)
1755887d61b2STaylor Simpson    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 1 : 0);
1756887d61b2STaylor Simpson})
1757887d61b2STaylor Simpson
/*
 * V6_pred_scalar2v2 (vsetq2): sets predicate bit i for every byte index i
 * that is <= ((Rt - 1) & (fVBYTES()-1)) and clears the rest.  Unlike vsetq,
 * a masked Rt of 0 wraps to the all-ones mask value, so (assuming fVBYTES()
 * is a power of two) every bit is set in that case.
 */
1758887d61b2STaylor SimpsonEXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1759887d61b2STaylor Simpson{
1760887d61b2STaylor Simpson    fHIDE(int i;)
1761887d61b2STaylor Simpson    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0);
1762887d61b2STaylor Simpson})
1763887d61b2STaylor Simpson
1764887d61b2STaylor Simpson
/*
 * Predicate-to-predicate operations, one predicate bit per index i:
 *   shuffeqw   bit i = (i & 2) ? Qs bit (i-2) : Qt bit i
 *   shuffeqh   bit i = (i & 1) ? Qs bit (i-1) : Qt bit i
 *   pred_or / pred_and / pred_xor      Qs op Qt
 *   pred_or_n / pred_and_n             Qs op (!Qt)
 *   pred_not                           !Qs
 * or/and use the logical operators (|| and &&) on the single-bit values
 * returned by fGETQBIT; xor uses the bitwise ^.
 */
1765887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) )
1766887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) )
1767887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) )
1768887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) )
1769887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) )
1770887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) )
1771887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And  with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) )
1772887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) )
1773887d61b2STaylor Simpson
1774887d61b2STaylor Simpson
1775887d61b2STaylor Simpson
/*
 * V6_vcmov: conditional vector move.  When fLSBOLD(PsV) is true, every byte
 * of Vu is copied to Vd; otherwise the CANCEL macro is invoked and Vd is not
 * written by this block.
 */
1776887d61b2STaylor SimpsonEXTINSN(V6_vcmov,  "if (Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1777887d61b2STaylor Simpson{
1778887d61b2STaylor Simpsonif (fLSBOLD(PsV))	{
1779887d61b2STaylor Simpson	fHIDE(int i;)
1780887d61b2STaylor Simpson	fVFOREACH(8, i) {
1781887d61b2STaylor Simpson		VdV.ub[i] = VuV.ub[i];
1782887d61b2STaylor Simpson	}
1783887d61b2STaylor Simpson	} else {CANCEL;}
1784887d61b2STaylor Simpson})
1785887d61b2STaylor Simpson
/*
 * V6_vncmov: negated conditional vector move.  When fLSBOLDNOT(PsV) is
 * true, every byte of Vu is copied to Vd; otherwise CANCEL is invoked.
 */
1786887d61b2STaylor SimpsonEXTINSN(V6_vncmov,  "if (!Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1787887d61b2STaylor Simpson{
1788887d61b2STaylor Simpsonif (fLSBOLDNOT(PsV))	{
1789887d61b2STaylor Simpson	fHIDE(int i;)
1790887d61b2STaylor Simpson	fVFOREACH(8, i) {
1791887d61b2STaylor Simpson		VdV.ub[i] = VuV.ub[i];
1792887d61b2STaylor Simpson	}
1793887d61b2STaylor Simpson	} else {CANCEL;}
1794887d61b2STaylor Simpson})
1795887d61b2STaylor Simpson
/*
 * V6_vccombine: conditional combine into a vector pair.  When fLSBOLD(PsV)
 * is true, Vv is copied byte-by-byte into Vdd.v[0] and Vu into Vdd.v[1];
 * otherwise CANCEL is invoked.
 */
1796887d61b2STaylor SimpsonEXTINSN(V6_vccombine,  "if (Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1797887d61b2STaylor Simpson{
1798887d61b2STaylor Simpsonif (fLSBOLD(PsV))	{
1799887d61b2STaylor Simpson	fHIDE(int i;)
1800887d61b2STaylor Simpson	fVFOREACH(8, i) {
1801887d61b2STaylor Simpson		VddV.v[0].ub[i] = VvV.ub[i];
1802887d61b2STaylor Simpson		VddV.v[1].ub[i] = VuV.ub[i];
1803887d61b2STaylor Simpson	}
1804887d61b2STaylor Simpson	} else {CANCEL;}
1805887d61b2STaylor Simpson})
1806887d61b2STaylor Simpson
/*
 * V6_vnccombine: negated conditional combine.  When fLSBOLDNOT(PsV) is
 * true, Vv is copied byte-by-byte into Vdd.v[0] and Vu into Vdd.v[1];
 * otherwise CANCEL is invoked.
 */
1807887d61b2STaylor SimpsonEXTINSN(V6_vnccombine,  "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1808887d61b2STaylor Simpson{
1809887d61b2STaylor Simpsonif (fLSBOLDNOT(PsV))	{
1810887d61b2STaylor Simpson	fHIDE(int i;)
1811887d61b2STaylor Simpson	fVFOREACH(8, i) {
1812887d61b2STaylor Simpson		VddV.v[0].ub[i] = VvV.ub[i];
1813887d61b2STaylor Simpson		VddV.v[1].ub[i] = VuV.ub[i];
1814887d61b2STaylor Simpson	}
1815887d61b2STaylor Simpson	} else {CANCEL;}
1816887d61b2STaylor Simpson})
1817887d61b2STaylor Simpson
1818887d61b2STaylor Simpson
1819887d61b2STaylor Simpson
/*
 * vmux: per-byte select.  Vd.ub[i] is Vu.ub[i] where predicate bit i of Qt
 * is set, and Vv.ub[i] where it is clear.
 */
1820887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)",
1821887d61b2STaylor Simpson"Vector Select Element 8-bit",
1822887d61b2STaylor Simpson    VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1823887d61b2STaylor Simpson
/*
 * vswap: per-byte conditional swap into a vector pair.  Where predicate bit
 * i of Qt is set, Vdd.v[0] gets the Vu byte and Vdd.v[1] the Vv byte; where
 * it is clear the two sources are exchanged.
 */
1824887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)",
1825887d61b2STaylor Simpson"Vector Swap Element 8-bit",
1826887d61b2STaylor Simpson    VddV.v[0].ub[i] =  fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i];
1827887d61b2STaylor Simpson	VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1828887d61b2STaylor Simpson
1829887d61b2STaylor Simpson
1830887d61b2STaylor Simpson/***************************************************************************
1831887d61b2STaylor Simpson*
1832887d61b2STaylor Simpson*   MMVECTOR SORTING
1833887d61b2STaylor Simpson*
1834887d61b2STaylor Simpson****************************************************************************/
1835887d61b2STaylor Simpson
/*
 * MMVEC_SORT: emits an element-wise vmax<TYPE> and vmin<TYPE> instruction
 * pair.  TYPE2 and the stringified SRC are spliced into the two syntax
 * strings, DESCR into the description, ELEMENTSIZE is the iterator width,
 * and SRC is the element view compared with > (max) or < (min).  When the
 * comparison is false (including equality) the Vv element is selected.
 */
1836887d61b2STaylor Simpson#define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\
1837887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])  \
1838887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])
1839887d61b2STaylor Simpson
/*
 * Min/max instantiations for the b, ub, uh, h and w element views
 * (8-, 8-, 16-, 16- and 32-bit iterator widths respectively).
 */
1840887d61b2STaylor SimpsonMMVEC_SORT(b,"b", "signed byte",    8,  b)
1841887d61b2STaylor SimpsonMMVEC_SORT(ub,"ub", "unsigned byte",    8,  ub)
1842887d61b2STaylor SimpsonMMVEC_SORT(uh,"uh", "unsigned halfword",16, uh)
1843887d61b2STaylor SimpsonMMVEC_SORT(h,   "h",    "halfword",         16, h)
1844887d61b2STaylor SimpsonMMVEC_SORT(w,   "w",    "word",             32, w)
1845887d61b2STaylor Simpson
1846887d61b2STaylor Simpson
1847887d61b2STaylor Simpson
1848887d61b2STaylor Simpson
1849887d61b2STaylor Simpson
1850887d61b2STaylor Simpson
1851887d61b2STaylor Simpson
1852887d61b2STaylor Simpson
1853887d61b2STaylor Simpson
1854887d61b2STaylor Simpson/*************************************************************
1855887d61b2STaylor Simpson* SHUFFLES
1856887d61b2STaylor Simpson****************************************************************/
1857887d61b2STaylor Simpson
/*
 * vsathub: for each halfword index i, byte 0 of Vd.uh[i] is
 * fVSATUB(Vv.h[i]) and byte 1 is fVSATUB(Vu.h[i]), so the results from Vv
 * and Vu are interleaved byte-wise in Vd.
 */
1858887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)",
1859887d61b2STaylor Simpson"Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them",
1860887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i]));
1861887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i])))
1862887d61b2STaylor Simpson
/*
 * vsatwh: for each word index i, halfword 0 of Vd.w[i] is fVSATH(Vv.w[i])
 * and halfword 1 is fVSATH(Vu.w[i]).
 */
1863887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)",
1864887d61b2STaylor Simpson"Saturate and pack 16 words to 16 halfwords, and interleave them",
1865887d61b2STaylor Simpson    fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i]));
1866887d61b2STaylor Simpson    fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i])))
1867887d61b2STaylor Simpson
/*
 * vsatuwuh: unsigned counterpart of the word-to-halfword saturating pack.
 * For each word index i, halfword 0 of Vd.w[i] is fVSATUH(Vv.uw[i]) and
 * halfword 1 is fVSATUH(Vu.uw[i]).
 */
1868887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)",
1869887d61b2STaylor Simpson"Saturate and pack 16 words to 16 halfwords, and interleave them",
1870887d61b2STaylor Simpson    fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i]));
1871887d61b2STaylor Simpson    fSETHALF(1, VdV.w[i], fVSATUH(VuV.uw[i])))
1872887d61b2STaylor Simpson
/*
 * vshuffeb: for each halfword index i, byte 0 of Vd.uh[i] is byte 0 of
 * Vv.uh[i] and byte 1 is byte 0 of Vu.uh[i] (the even bytes of both inputs).
 * NOTE(review): the description says "half words" but the elements moved
 * are bytes.
 */
1873887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)",
1874887d61b2STaylor Simpson"Shuffle half words with in a lane",
1875887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i]));
1876887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i])))
1877887d61b2STaylor Simpson
/*
 * vshuffob: for each halfword index i, byte 0 of Vd.uh[i] is byte 1 of
 * Vv.uh[i] and byte 1 is byte 1 of Vu.uh[i] (the odd bytes of both inputs).
 * NOTE(review): the description says "half words" but the elements moved
 * are bytes.
 */
1878887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)",
1879887d61b2STaylor Simpson"Shuffle half words with in a lane",
1880887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i]));
1881887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i])))
1882887d61b2STaylor Simpson
/*
 * vshufeh: for each word index i, halfword 0 of Vd.uw[i] is halfword 0 of
 * Vv.uw[i] and halfword 1 is halfword 0 of Vu.uw[i] (even halfwords).
 */
1883887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)",
1884887d61b2STaylor Simpson"Shuffle half words with in a lane",
1885887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i]));
1886887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fGETUHALF(0, VuV.uw[i])))
1887887d61b2STaylor Simpson
/*
 * vshufoh: for each word index i, halfword 0 of Vd.uw[i] is halfword 1 of
 * Vv.uw[i] and halfword 1 is halfword 1 of Vu.uw[i] (odd halfwords).
 */
1888887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)",
1889887d61b2STaylor Simpson"Shuffle half words with in a lane",
1890887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i]));
1891887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i])))
1892887d61b2STaylor Simpson
1893887d61b2STaylor Simpson
1894887d61b2STaylor Simpson
1895887d61b2STaylor Simpson
1896887d61b2STaylor Simpson/**************************************************************************
1897887d61b2STaylor Simpson* Double Vector Shuffles
1898887d61b2STaylor Simpson**************************************************************************/
1899887d61b2STaylor Simpson
/*
 * V6_vshuff: in-place shuffle of the Vy/Vx registers controlled by Rt.
 * offset takes the values 1, 2, 4, ... below fVBYTES().  For every offset
 * whose bit is set in Rt, each byte index k with that bit clear has
 * Vy.ub[k] exchanged with Vx.ub[k+offset] via fSWAPB.  Stages run from the
 * smallest offset to the largest.
 * NOTE(review): the trailing backslashes after the inner fHIDE and
 * fVFOREACH lines are outside any #define; they only splice the line with
 * the next one and look like leftovers.
 */
1900887d61b2STaylor SimpsonEXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)",
1901887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1902887d61b2STaylor Simpson"2x2->2x2 transpose, for multiple data sizes, inplace",
1903887d61b2STaylor Simpson{
1904887d61b2STaylor Simpson	fHIDE(int offset;)
1905887d61b2STaylor Simpson	for (offset=1; offset<fVBYTES(); offset<<=1) {
1906887d61b2STaylor Simpson		if ( RtV & offset) {
1907887d61b2STaylor Simpson			    fHIDE(int k;) \
1908887d61b2STaylor Simpson				fVFOREACH(8, k) {\
1909887d61b2STaylor Simpson				if (!( k & offset)) {
1910887d61b2STaylor Simpson					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1911887d61b2STaylor Simpson				}
1912887d61b2STaylor Simpson			}
1913887d61b2STaylor Simpson		}
1914887d61b2STaylor Simpson	}
1915887d61b2STaylor Simpson	})
1916887d61b2STaylor Simpson
/*
 * V6_vshuffvdd: shuffle into a destination pair.  Vdd.v[0] starts as a copy
 * of Vv and Vdd.v[1] as a copy of Vu; then, for offset = 1, 2, 4, ... below
 * fVBYTES(), every offset whose bit is set in Rt exchanges
 * Vdd.v[1].ub[k] with Vdd.v[0].ub[k+offset] for each byte index k that has
 * the offset bit clear.
 * NOTE(review): the trailing backslashes after the inner fHIDE and
 * fVFOREACH lines are outside any #define and only splice lines.
 */
1917887d61b2STaylor SimpsonEXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)",
1918887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1919887d61b2STaylor Simpson"2x2->2x2 transpose for multiple data sizes",
1920887d61b2STaylor Simpson{
1921887d61b2STaylor Simpson	fHIDE(int offset;)
1922887d61b2STaylor Simpson	VddV.v[0] = VvV;
1923887d61b2STaylor Simpson	VddV.v[1] = VuV;
1924887d61b2STaylor Simpson	for (offset=1; offset<fVBYTES(); offset<<=1) {
1925887d61b2STaylor Simpson		if ( RtV & offset) {
1926887d61b2STaylor Simpson			    fHIDE(int k;) \
1927887d61b2STaylor Simpson				fVFOREACH(8, k) {\
1928887d61b2STaylor Simpson				if (!( k & offset)) {
1929887d61b2STaylor Simpson					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1930887d61b2STaylor Simpson				}
1931887d61b2STaylor Simpson			}
1932887d61b2STaylor Simpson		}
1933887d61b2STaylor Simpson	}
1934887d61b2STaylor Simpson	})
1935887d61b2STaylor Simpson
/*
 * V6_vdeal: in-place deal of the Vy/Vx registers controlled by Rt.  Uses
 * the same byte-exchange step as the in-place shuffle, but the stages run
 * in the opposite order: offset starts at fVBYTES()>>1 and is halved down
 * to 1.  For every offset whose bit is set in Rt, each byte index k with
 * that bit clear has Vy.ub[k] exchanged with Vx.ub[k+offset].
 * NOTE(review): the trailing backslashes after the inner fHIDE and
 * fVFOREACH lines are outside any #define and only splice lines.
 */
1936887d61b2STaylor SimpsonEXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)",
1937887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1938887d61b2STaylor Simpson" vector - vector deal - or deinterleave, for multiple data sizes, inplace",
1939887d61b2STaylor Simpson{
1940887d61b2STaylor Simpson	fHIDE(int offset;)
1941887d61b2STaylor Simpson	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1942887d61b2STaylor Simpson		if ( RtV & offset) {
1943887d61b2STaylor Simpson			    fHIDE(int k;) \
1944887d61b2STaylor Simpson				fVFOREACH(8, k) {\
1945887d61b2STaylor Simpson				if (!( k & offset)) {
1946887d61b2STaylor Simpson					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1947887d61b2STaylor Simpson				}
1948887d61b2STaylor Simpson			}
1949887d61b2STaylor Simpson		}
1950887d61b2STaylor Simpson	}
1951887d61b2STaylor Simpson	})
1952887d61b2STaylor Simpson
/*
 * V6_vdealvdd: deal into a destination pair.  Vdd.v[0] starts as a copy of
 * Vv and Vdd.v[1] as a copy of Vu; then, for offset = fVBYTES()>>1 halved
 * down to 1, every offset whose bit is set in Rt exchanges
 * Vdd.v[1].ub[k] with Vdd.v[0].ub[k+offset] for each byte index k that has
 * the offset bit clear.
 * NOTE(review): the trailing backslashes after the inner fHIDE and
 * fVFOREACH lines are outside any #define and only splice lines.
 */
1953887d61b2STaylor SimpsonEXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)",
1954887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1955887d61b2STaylor Simpson" vector - vector deal - or deinterleave, for multiple data sizes",
1956887d61b2STaylor Simpson{
1957887d61b2STaylor Simpson	fHIDE(int offset;)
1958887d61b2STaylor Simpson	VddV.v[0] = VvV;
1959887d61b2STaylor Simpson	VddV.v[1] = VuV;
1960887d61b2STaylor Simpson	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1961887d61b2STaylor Simpson		if ( RtV & offset) {
1962887d61b2STaylor Simpson			    fHIDE(int k;) \
1963887d61b2STaylor Simpson				fVFOREACH(8, k) {\
1964887d61b2STaylor Simpson				if (!( k & offset)) {
1965887d61b2STaylor Simpson					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1966887d61b2STaylor Simpson				}
1967887d61b2STaylor Simpson			}
1968887d61b2STaylor Simpson		}
1969887d61b2STaylor Simpson	}
1970887d61b2STaylor Simpson	})
1971887d61b2STaylor Simpson
1972887d61b2STaylor Simpson/**************************************************************************/
1973887d61b2STaylor Simpson
1974887d61b2STaylor Simpson
1975887d61b2STaylor Simpson
/*
 * vshufoeh: combined even/odd halfword shuffle into a pair.  For each word
 * index i, Vdd.v[0].uw[i] is built from halfword 0 of Vv (low) and of Vu
 * (high), and Vdd.v[1].uw[i] from halfword 1 of Vv (low) and of Vu (high).
 */
1976887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)",
1977887d61b2STaylor Simpson"Vector Shuffle half words",
1978887d61b2STaylor Simpson    fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i]));
1979887d61b2STaylor Simpson    fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i]));
1980887d61b2STaylor Simpson    fSETHALF(0, VddV.v[1].uw[i], fGETUHALF(1, VvV.uw[i]));
1981887d61b2STaylor Simpson    fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i])))
1982887d61b2STaylor Simpson
/*
 * vshufoeb: combined even/odd byte shuffle into a pair.  For each halfword
 * index i, Vdd.v[0].uh[i] is built from byte 0 of Vv (low) and of Vu (high),
 * and Vdd.v[1].uh[i] from byte 1 of Vv (low) and of Vu (high).
 */
1983887d61b2STaylor SimpsonITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)",
1984887d61b2STaylor Simpson"Vector Shuffle bytes",
1985887d61b2STaylor Simpson    fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i]));
1986887d61b2STaylor Simpson    fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i]));
1987887d61b2STaylor Simpson    fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i]));
1988887d61b2STaylor Simpson    fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i])))
1989887d61b2STaylor Simpson
1990887d61b2STaylor Simpson
1991887d61b2STaylor Simpson/***************************************************************
1992887d61b2STaylor Simpson* Deal
1993887d61b2STaylor Simpson***************************************************************/
1994887d61b2STaylor Simpson
/*
 * vdealh: de-interleave halfwords within one vector.  For each word index
 * i, halfword 0 of Vu.uw[i] goes to Vd.uh[i] and halfword 1 goes to
 * Vd.uh[i + fVELEM(32)], so even halfwords land in the first half of Vd and
 * odd halfwords in the second half.
 */
1995887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)",
1996887d61b2STaylor Simpson"Deal Halfwords",
1997887d61b2STaylor Simpson    VdV.uh[i  ] = fGETUHALF(0, VuV.uw[i]);
1998887d61b2STaylor Simpson    VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i]))
1999887d61b2STaylor Simpson
/*
 * vdealb: de-interleave bytes within one vector.  For each halfword index
 * i, byte 0 of Vu.uh[i] goes to Vd.ub[i] and byte 1 goes to
 * Vd.ub[i + fVELEM(16)].
 * NOTE(review): the description says "Deal Halfwords" although bytes are
 * dealt - presumably copied from the halfword variant.
 */
2000887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)",
2001887d61b2STaylor Simpson"Deal Halfwords",
2002887d61b2STaylor Simpson    VdV.ub[i   ] = fGETUBYTE(0, VuV.uh[i]);
2003887d61b2STaylor Simpson    VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i]))
2004887d61b2STaylor Simpson
/*
 * vdealb4w: gathers bytes 0 and 2 of every word of two vectors into one
 * result.  Vd is filled in four quarters of fVELEM(32) bytes each:
 *   quarter 0: byte 0 of Vv.uw[i]     quarter 1: byte 2 of Vv.uw[i]
 *   quarter 2: byte 0 of Vu.uw[i]     quarter 3: byte 2 of Vu.uw[i]
 * Bytes 1 and 3 of the source words are not read.
 */
2005887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w,  "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)",
2006887d61b2STaylor Simpson"Deal Two Vectors Bytes",
2007887d61b2STaylor Simpson    VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]);
2008887d61b2STaylor Simpson    VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, VvV.uw[i]);
2009887d61b2STaylor Simpson    VdV.ub[2*fVELEM(32)+i] = fGETUBYTE(0, VuV.uw[i]);
2010887d61b2STaylor Simpson    VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i]))
2011887d61b2STaylor Simpson
2012887d61b2STaylor Simpson/***************************************************************
2013887d61b2STaylor Simpson* shuffle
2014887d61b2STaylor Simpson***************************************************************/
2015887d61b2STaylor Simpson
/*
 * vshuffh: interleave halfwords within one vector.  For each word index i,
 * halfword 0 of Vd.uw[i] is Vu.uh[i] and halfword 1 is
 * Vu.uh[i + fVELEM(32)], i.e. the two halves of Vu are interleaved.
 * NOTE(review): the description says "Deal Halfwords" but this is the
 * shuffle direction - presumably copied from the deal variant.
 */
2016887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)",
2017887d61b2STaylor Simpson"Deal Halfwords",
2018887d61b2STaylor Simpson    fSETHALF(0, VdV.uw[i], VuV.uh[i]);
2019887d61b2STaylor Simpson    fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)]))
2020887d61b2STaylor Simpson
/*
 * vshuffb: interleave bytes within one vector.  For each halfword index i,
 * byte 0 of Vd.uh[i] is Vu.ub[i] and byte 1 is Vu.ub[i + fVELEM(16)].
 * NOTE(review): the description says "Deal Halfwords" but this shuffles
 * bytes.
 */
2021887d61b2STaylor SimpsonITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)",
2022887d61b2STaylor Simpson"Deal Halfwords",
2023887d61b2STaylor Simpson    fSETBYTE(0, VdV.uh[i], VuV.ub[i]);
2024887d61b2STaylor Simpson    fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)]))
2025887d61b2STaylor Simpson
2026887d61b2STaylor Simpson
2027887d61b2STaylor Simpson
2028887d61b2STaylor Simpson
2029887d61b2STaylor Simpson
2030887d61b2STaylor Simpson/***********************************************************
2031887d61b2STaylor Simpson* INSERT AND EXTRACT
2032887d61b2STaylor Simpson*********************************************************/
/*
 * V6_extractw: read one 32-bit word of Vu into scalar Rd.  The word index
 * is (Rs & (fVBYTES()-1)) >> 2, i.e. Rs is treated as a byte offset that is
 * wrapped to the vector length and then reduced to a word index.  The two
 * warn() calls are wrapped in fHIDE and only log the operands and result.
 */
2033887d61b2STaylor SimpsonEXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)",
2034887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY),
2035887d61b2STaylor Simpson"Extract an element from a vector to scalar",
2036887d61b2STaylor SimpsonfHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));)
2037887d61b2STaylor SimpsonRdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2];
2038887d61b2STaylor SimpsonfHIDE(warn("RdV=0x%08x",RdV);))
2039887d61b2STaylor Simpson
/*
 * V6_vinsertwr: writes scalar Rt into word 0 of Vx; no other element of Vx
 * is assigned by this semantics block.
 */
2040887d61b2STaylor SimpsonEXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)",
2041887d61b2STaylor SimpsonATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),
2042887d61b2STaylor Simpson"Insert Word Scalar into Vector",
2043887d61b2STaylor SimpsonVxV.uw[0] = RtV;)
2044887d61b2STaylor Simpson
2045887d61b2STaylor Simpson
2046887d61b2STaylor Simpson
2047887d61b2STaylor Simpson
/* lvsplatw: every 32-bit element Vd.uw[i] is assigned the scalar Rt. */
20486c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar across words in vector", VdV.uw[i] = RtV)
2049887d61b2STaylor Simpson
/*
 * lvsplath: every 16-bit element Vd.uh[i] is assigned Rt (presumably
 * truncated to the low halfword by the assignment - confirm element type).
 */
20506c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar across halves in vector", VdV.uh[i] = RtV)
2051887d61b2STaylor Simpson
/*
 * lvsplatb: every 8-bit element Vd.ub[i] is assigned Rt (presumably
 * truncated to the low byte by the assignment - confirm element type).
 */
20526c67d98cSMichael TokarevITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar across bytes in vector", VdV.ub[i] = RtV)
2053887d61b2STaylor Simpson
2054887d61b2STaylor Simpson
/* vassign: plain vector copy, performed one 32-bit element at a time. */
2055887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i])
2056887d61b2STaylor Simpson
2057887d61b2STaylor Simpson
/*
 * vcombine: builds a vector pair from two vectors, byte by byte:
 * Vdd.v[0] receives Vv and Vdd.v[1] receives Vu.
 */
2058887d61b2STaylor SimpsonITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)",
2059887d61b2STaylor Simpson"Vector assign, Any two to Vector Pair",
2060887d61b2STaylor Simpson    VddV.v[0].ub[i] = VvV.ub[i];
2061887d61b2STaylor Simpson    VddV.v[1].ub[i] = VuV.ub[i])
2062887d61b2STaylor Simpson
2063887d61b2STaylor Simpson
2064887d61b2STaylor Simpson
2065887d61b2STaylor Simpson///////////////////////////////////////////////////////////////////////////
2066887d61b2STaylor Simpson
/*
 * V6_vcombine_tmp: ".tmp" form of the pair combine.  The data movement is a
 * byte-by-byte copy of Vv into Vdd.v[0] and Vu into Vdd.v[1]; what differs
 * from the plain form is the attribute list (A_CVI_REMAP, A_CVI_TMP,
 * A_NO_INTRINSIC), whose effect is defined outside this chunk.
 */
2067b2f20c2cSTaylor SimpsonEXTINSN(V6_vcombine_tmp, "Vdd32.tmp=vcombine(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
2068b2f20c2cSTaylor Simpson"Vector assign tmp, Any two to Vector Pair ",
2069b2f20c2cSTaylor Simpson{
2070b2f20c2cSTaylor Simpson   fHIDE(int i;)
2071b2f20c2cSTaylor Simpson    fVFOREACH(8, i) {
2072b2f20c2cSTaylor Simpson           VddV.v[0].ub[i] = VvV.ub[i];
2073b2f20c2cSTaylor Simpson           VddV.v[1].ub[i] = VuV.ub[i];
2074b2f20c2cSTaylor Simpson    }
2075b2f20c2cSTaylor Simpson})
2076b2f20c2cSTaylor Simpson
2077b2f20c2cSTaylor SimpsonEXTINSN(V6_vassign_tmp, "Vd32.tmp=Vu32",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_REMAP,A_CVI_TMP,A_NO_INTRINSIC),
2078b2f20c2cSTaylor Simpson"Vector assign tmp, Any two to Vector Pair ",
2079b2f20c2cSTaylor Simpson{
2080b2f20c2cSTaylor Simpson   fHIDE(int i;)
2081b2f20c2cSTaylor Simpson    fVFOREACH(32, i) {
2082b2f20c2cSTaylor Simpson           VdV.w[i]=VuV.w[i];
2083b2f20c2cSTaylor Simpson    }
2084b2f20c2cSTaylor Simpson})
2085887d61b2STaylor Simpson
/*********************************************************
* GENERAL PERMUTE NETWORKS
*********************************************************/

2091887d61b2STaylor SimpsonEXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2092887d61b2STaylor Simpson"Reverse Benes Butterfly network ",
2093887d61b2STaylor Simpson{
2094887d61b2STaylor Simpson    fHIDE(int offset;)
2095887d61b2STaylor Simpson    fHIDE(int k;)
2096887d61b2STaylor Simpson    fHIDE(mmvector_t tmp;)
2097887d61b2STaylor Simpson    tmp = VuV;
2098887d61b2STaylor Simpson    for (offset=fVBYTES(); (offset>>=1)>0; ) {
2099887d61b2STaylor Simpson        for (k = 0; k<fVBYTES(); k++) {
2100887d61b2STaylor Simpson            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2101887d61b2STaylor Simpson        }
2102887d61b2STaylor Simpson        for (k = 0; k<fVBYTES(); k++) {
2103887d61b2STaylor Simpson            tmp.ub[k] = VdV.ub[k];
2104887d61b2STaylor Simpson        }
2105887d61b2STaylor Simpson    }
2106887d61b2STaylor Simpson})
2107887d61b2STaylor Simpson
2108887d61b2STaylor Simpson
2109887d61b2STaylor SimpsonEXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2110887d61b2STaylor Simpson"Forward Benes Butterfly network ",
2111887d61b2STaylor Simpson{
2112887d61b2STaylor Simpson	fHIDE(int offset;)
2113887d61b2STaylor Simpson    fHIDE(int k;)
2114887d61b2STaylor Simpson    fHIDE(mmvector_t tmp;)
2115887d61b2STaylor Simpson    tmp = VuV;
2116887d61b2STaylor Simpson    for (offset=1; offset<fVBYTES(); offset<<=1){
2117887d61b2STaylor Simpson        for (k = 0; k<fVBYTES(); k++) {
2118887d61b2STaylor Simpson            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2119887d61b2STaylor Simpson        }
2120887d61b2STaylor Simpson        for (k = 0; k<fVBYTES(); k++) {
2121887d61b2STaylor Simpson            tmp.ub[k] = VdV.ub[k];
2122887d61b2STaylor Simpson        }
2123887d61b2STaylor Simpson    }
2124887d61b2STaylor Simpson})
2125887d61b2STaylor Simpson
2126887d61b2STaylor Simpson
2127887d61b2STaylor Simpson
2128887d61b2STaylor Simpson
2129887d61b2STaylor Simpson
2130887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)",         "Count Leading Zeros in Word",     VdV.uw[i]=fCL1_4(~VuV.uw[i]))
2131887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)",         "Count Leading Zeros in Word",    VdV.uh[i]=fCL1_2(~VuV.uh[i]))
2132887d61b2STaylor Simpson
2133887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word",
2134887d61b2STaylor SimpsonVdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();))
2135887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword",
2136887d61b2STaylor SimpsonVdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();))
2137887d61b2STaylor Simpson
2138887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)",
2139887d61b2STaylor Simpson"Count leading bits and add",
2140887d61b2STaylor SimpsonVdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i])
2141887d61b2STaylor Simpson
2142887d61b2STaylor SimpsonITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)",
2143887d61b2STaylor Simpson"Count leading bits and add",
2144887d61b2STaylor SimpsonVdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i])
2145887d61b2STaylor Simpson
2146887d61b2STaylor Simpson
2147887d61b2STaylor SimpsonITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)",   "Count Leading Zeros in Word",  VdV.uh[i]=fCOUNTONES_2(VuV.uh[i]))
2148887d61b2STaylor Simpson
2149887d61b2STaylor Simpson
2150887d61b2STaylor Simpson#define fHIST(INPUTVEC) \
2151887d61b2STaylor Simpson	fUARCH_NOTE_PUMP_4X(); \
2152887d61b2STaylor Simpson	fHIDE(int lane;) \
2153887d61b2STaylor Simpson	fHIDE(mmvector_t tmp;) \
2154887d61b2STaylor Simpson	fVFOREACH(128, lane) { \
2155887d61b2STaylor Simpson		for (fHIDE(int )i=0; i<128/8; ++i) { \
2156887d61b2STaylor Simpson			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2157887d61b2STaylor Simpson			unsigned char regno = value>>3; \
2158887d61b2STaylor Simpson			unsigned char element = value & 7; \
2159887d61b2STaylor Simpson			READ_EXT_VREG(regno,tmp,0); \
2160887d61b2STaylor Simpson			tmp.uh[(128/16)*lane+(element)]++; \
2161887d61b2STaylor Simpson			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2162887d61b2STaylor Simpson		} \
2163887d61b2STaylor Simpson	}
2164887d61b2STaylor Simpson
2165887d61b2STaylor Simpson#define fHISTQ(INPUTVEC,QVAL) \
2166887d61b2STaylor Simpson	fUARCH_NOTE_PUMP_4X(); \
2167887d61b2STaylor Simpson	fHIDE(int lane;) \
2168887d61b2STaylor Simpson	fHIDE(mmvector_t tmp;) \
2169887d61b2STaylor Simpson	fVFOREACH(128, lane) { \
2170887d61b2STaylor Simpson		for (fHIDE(int )i=0; i<128/8; ++i) { \
2171887d61b2STaylor Simpson			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2172887d61b2STaylor Simpson			unsigned char regno = value>>3; \
2173887d61b2STaylor Simpson			unsigned char element = value & 7; \
2174887d61b2STaylor Simpson			READ_EXT_VREG(regno,tmp,0); \
2175887d61b2STaylor Simpson			if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \
2176887d61b2STaylor Simpson			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2177887d61b2STaylor Simpson		} \
2178887d61b2STaylor Simpson	}
2179887d61b2STaylor Simpson
2180887d61b2STaylor Simpson
2181887d61b2STaylor Simpson
2182887d61b2STaylor SimpsonEXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); })
2183887d61b2STaylor SimpsonEXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); })
2184887d61b2STaylor Simpson
2185887d61b2STaylor Simpson#undef fHIST
2186887d61b2STaylor Simpson#undef fHISTQ
2187887d61b2STaylor Simpson
2188887d61b2STaylor Simpson
/* **** WEIGHTED HISTOGRAM **** */

2192887d61b2STaylor Simpson#if 1
2193887d61b2STaylor Simpson#define WHIST(EL,MASK,BSHIFT,COND,SATF) \
2194887d61b2STaylor Simpson	fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \
2195887d61b2STaylor Simpson	fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \
2196887d61b2STaylor Simpson	fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \
2197887d61b2STaylor Simpson	fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \
2198887d61b2STaylor Simpson	fHIDE(mmvector_t tmp;) \
2199887d61b2STaylor Simpson	READ_EXT_VREG(vindex,tmp,0); \
2200887d61b2STaylor Simpson	COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \
2201887d61b2STaylor Simpson	WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \
2202887d61b2STaylor Simpson	fUARCH_NOTE_PUMP_2X();
2203887d61b2STaylor Simpson
2204887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,))
2205887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),))
2206887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH))
2207887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH))
2208887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", WHIST(uw,3,1,,))
2209887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),))
2210887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),))
2211887d61b2STaylor SimpsonITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),))
2212887d61b2STaylor Simpson
2213887d61b2STaylor Simpson
2214887d61b2STaylor Simpson#endif
2215887d61b2STaylor Simpson
2216887d61b2STaylor Simpson
2217887d61b2STaylor Simpson
/* ******   lookup table instructions                          ***********  */

/*
 * Use the low bits of idx to choose the next-bigger element (halfword or
 * word) from the table vector, then use the odd/even selector bit taken
 * from Rt or #u3 to choose which half of that element is returned.
 */

2222887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2223887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2224887d61b2STaylor Simpsonmatchval = RtV & 0x7;
2225887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2226887d61b2STaylor Simpsonidx = VuV.ub[i];
2227887d61b2STaylor SimpsonVdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2228887d61b2STaylor Simpson
2229887d61b2STaylor Simpson
2230887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2231887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2232887d61b2STaylor Simpsonmatchval = RtV & 0x7;
2233887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2234887d61b2STaylor Simpsonidx = VuV.ub[i];
2235887d61b2STaylor SimpsonVxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2236887d61b2STaylor Simpson
2237887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2238887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2239887d61b2STaylor Simpsonmatchval = RtV & 0xF;
2240887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2241887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]);
2242887d61b2STaylor SimpsonVddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2243887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]);
2244887d61b2STaylor SimpsonVddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2245887d61b2STaylor Simpson
2246887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2247887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2248887d61b2STaylor Simpsonmatchval = fGETUBYTE(0,RtV) & 0xF;
2249887d61b2STaylor Simpsonoddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2250887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]);
2251887d61b2STaylor SimpsonVxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2252887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]);
2253887d61b2STaylor SimpsonVxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2254887d61b2STaylor Simpson
2255887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2256887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2257887d61b2STaylor Simpsonmatchval = uiV & 0x7;
2258887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2259887d61b2STaylor Simpsonidx = VuV.ub[i];
2260887d61b2STaylor SimpsonVdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2261887d61b2STaylor Simpson
2262887d61b2STaylor Simpson
2263887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2264887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2265887d61b2STaylor Simpsonmatchval = uiV & 0x7;
2266887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2267887d61b2STaylor Simpsonidx = VuV.ub[i];
2268887d61b2STaylor SimpsonVxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2269887d61b2STaylor Simpson
2270887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2271887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2272887d61b2STaylor Simpsonmatchval = uiV & 0xF;
2273887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2274887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]);
2275887d61b2STaylor SimpsonVddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2276887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]);
2277887d61b2STaylor SimpsonVddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2278887d61b2STaylor Simpson
2279887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2280887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2281887d61b2STaylor Simpsonmatchval = uiV & 0xF;
2282887d61b2STaylor Simpsonoddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2283887d61b2STaylor Simpsonidx = fGETUBYTE(0,VuV.uh[i]);
2284887d61b2STaylor SimpsonVxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2285887d61b2STaylor Simpsonidx = fGETUBYTE(1,VuV.uh[i]);
2286887d61b2STaylor SimpsonVxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2287887d61b2STaylor Simpson
2288887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup",
2289887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2290887d61b2STaylor Simpson    matchval = RtV & 0x7;
2291887d61b2STaylor Simpson    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2292887d61b2STaylor Simpson    idx = VuV.ub[i];
2293887d61b2STaylor Simpson    idx = (idx&0x1F) | (matchval<<5);
2294887d61b2STaylor Simpson    VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]))
2295887d61b2STaylor Simpson
2296887d61b2STaylor SimpsonITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup",
2297887d61b2STaylor SimpsonfHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2298887d61b2STaylor Simpson    matchval = RtV & 0xF;
2299887d61b2STaylor Simpson    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2300887d61b2STaylor Simpson    idx = fGETUBYTE(0,VuV.uh[i]);
2301887d61b2STaylor Simpson    idx = (idx&0x0F) | (matchval<<4);
2302887d61b2STaylor Simpson    VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]);
2303887d61b2STaylor Simpson    idx = fGETUBYTE(1,VuV.uh[i]);
2304887d61b2STaylor Simpson    idx = (idx&0x0F) | (matchval<<4);
2305887d61b2STaylor Simpson    VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]))
2306887d61b2STaylor Simpson
2307887d61b2STaylor Simpson
2308887d61b2STaylor Simpson
2309887d61b2STaylor Simpson
/******************************************************************************
NON LINEAR - V65
 ******************************************************************************/

2314887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation",
2315887d61b2STaylor Simpson    VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16))
2316887d61b2STaylor Simpson
2317887d61b2STaylor Simpson
2318887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2319887d61b2STaylor Simpson    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2320887d61b2STaylor Simpson
2321887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2322887d61b2STaylor Simpson    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2323887d61b2STaylor Simpson
2324887d61b2STaylor Simpson
2325887d61b2STaylor SimpsonITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table",
2326887d61b2STaylor Simpson    VdV.h[i]= fGETHALF(  ((VuV.h[i]>>14)&0x3), RttV ))
2327887d61b2STaylor Simpson
2328887d61b2STaylor Simpson
2329887d61b2STaylor Simpson
/******************************************************************************
V65
 ******************************************************************************/

2334887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)",
2335887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar",
2336887d61b2STaylor Simpson    VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2337887d61b2STaylor Simpson
2338887d61b2STaylor Simpson
2339887d61b2STaylor SimpsonITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)",
2340887d61b2STaylor Simpson"Vector even halfword unsigned multiply by scalar",
2341887d61b2STaylor Simpson    VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2342887d61b2STaylor Simpson
2343887d61b2STaylor Simpson
2344887d61b2STaylor Simpson
2345887d61b2STaylor Simpson
2346887d61b2STaylor SimpsonEXTINSN(V6_vgathermw,  "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2347887d61b2STaylor Simpson{
2348887d61b2STaylor Simpson    fHIDE(int i;)
2349887d61b2STaylor Simpson	fHIDE(int element_size = 4;)
2350887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2351887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2352887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2353887d61b2STaylor Simpson    fVFOREACH(32, i) {
2354887d61b2STaylor Simpson        EA = RtV+VvV.uw[i];
2355887d61b2STaylor Simpson        fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV);
2356887d61b2STaylor Simpson    }
2357887d61b2STaylor Simpson    fGATHER_FINISH()
2358887d61b2STaylor Simpson})
2359887d61b2STaylor SimpsonEXTINSN(V6_vgathermh,  "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2360887d61b2STaylor Simpson{
2361887d61b2STaylor Simpson    fHIDE(int i;)
2362887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2363887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2364887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2365887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2366887d61b2STaylor Simpson    fVFOREACH(16, i) {
2367887d61b2STaylor Simpson        EA = RtV+VvV.uh[i];
2368887d61b2STaylor Simpson        fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV);
2369887d61b2STaylor Simpson    }
2370887d61b2STaylor Simpson    fGATHER_FINISH()
2371887d61b2STaylor Simpson})
2372887d61b2STaylor Simpson
2373887d61b2STaylor Simpson
2374887d61b2STaylor Simpson
2375887d61b2STaylor SimpsonEXTINSN(V6_vgathermhw,  "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2376887d61b2STaylor Simpson{
2377887d61b2STaylor Simpson    fHIDE(int i;)
2378887d61b2STaylor Simpson    fHIDE(int j;)
2379887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2380887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2381887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2382887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2383887d61b2STaylor Simpson    fVFOREACH(32, i) {
2384887d61b2STaylor Simpson       for(j = 0; j < 2; j++) {
2385887d61b2STaylor Simpson            EA = RtV+VvvV.v[j].uw[i];
2386887d61b2STaylor Simpson            fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV);
2387887d61b2STaylor Simpson        }
2388887d61b2STaylor Simpson    }
2389887d61b2STaylor Simpson     fGATHER_FINISH()
2390887d61b2STaylor Simpson})
2391887d61b2STaylor Simpson
2392887d61b2STaylor Simpson
2393887d61b2STaylor SimpsonEXTINSN(V6_vgathermwq,  "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2394887d61b2STaylor Simpson{
2395887d61b2STaylor Simpson    fHIDE(int i;)
2396887d61b2STaylor Simpson	fHIDE(int element_size = 4;)
2397887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2398887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2399887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2400887d61b2STaylor Simpson    fVFOREACH(32, i) {
2401887d61b2STaylor Simpson        EA = RtV+VvV.uw[i];
2402887d61b2STaylor Simpson        fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV);
2403887d61b2STaylor Simpson    }
2404887d61b2STaylor Simpson    fGATHER_FINISH()
2405887d61b2STaylor Simpson})
2406887d61b2STaylor SimpsonEXTINSN(V6_vgathermhq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2407887d61b2STaylor Simpson{
2408887d61b2STaylor Simpson    fHIDE(int i;)
2409887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2410887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2411887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2412887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2413887d61b2STaylor Simpson    fVFOREACH(16, i) {
2414887d61b2STaylor Simpson        EA = RtV+VvV.uh[i];
2415887d61b2STaylor Simpson        fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV);
2416887d61b2STaylor Simpson    }
2417887d61b2STaylor Simpson    fGATHER_FINISH()
2418887d61b2STaylor Simpson})
2419887d61b2STaylor Simpson
2420887d61b2STaylor Simpson
2421887d61b2STaylor Simpson
2422887d61b2STaylor SimpsonEXTINSN(V6_vgathermhwq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2423887d61b2STaylor Simpson{
2424887d61b2STaylor Simpson    fHIDE(int i;)
2425887d61b2STaylor Simpson    fHIDE(int j;)
2426887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2427887d61b2STaylor Simpson    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2428887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2429887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2430887d61b2STaylor Simpson    fVFOREACH(32, i) {
2431887d61b2STaylor Simpson       for(j = 0; j < 2; j++) {
2432887d61b2STaylor Simpson            EA = RtV+VvvV.v[j].uw[i];
2433887d61b2STaylor Simpson            fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV);
2434887d61b2STaylor Simpson       }
2435887d61b2STaylor Simpson    }
2436887d61b2STaylor Simpson    fGATHER_FINISH()
2437887d61b2STaylor Simpson})
2438887d61b2STaylor Simpson
2439887d61b2STaylor Simpson
2440887d61b2STaylor Simpson
2441887d61b2STaylor SimpsonEXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words",
2442887d61b2STaylor Simpson{
2443887d61b2STaylor Simpson    fHIDE(int i;)
2444887d61b2STaylor Simpson	fHIDE(int element_size = 4;)
2445887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2446887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2447887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2448887d61b2STaylor Simpson    fVFOREACH(32, i) {
2449887d61b2STaylor Simpson        EA = RtV+VvV.uw[i];
2450887d61b2STaylor Simpson        fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV);
2451887d61b2STaylor Simpson    }
2452887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2453887d61b2STaylor Simpson})
2454887d61b2STaylor Simpson
2455887d61b2STaylor Simpson
2456887d61b2STaylor Simpson
2457887d61b2STaylor SimpsonEXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords",
2458887d61b2STaylor Simpson{
2459887d61b2STaylor Simpson    fHIDE(int i;)
2460887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2461887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2462887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2463887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2464887d61b2STaylor Simpson    fVFOREACH(16, i) {
2465887d61b2STaylor Simpson        EA = RtV+VvV.uh[i];
2466887d61b2STaylor Simpson        fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV);
2467887d61b2STaylor Simpson    }
2468887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2469887d61b2STaylor Simpson})
2470887d61b2STaylor Simpson
2471887d61b2STaylor Simpson
2472887d61b2STaylor SimpsonEXTINSN(V6_vscattermw_add,  "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add",
2473887d61b2STaylor Simpson{
2474887d61b2STaylor Simpson    fHIDE(int i;)
2475887d61b2STaylor Simpson    fHIDE(int ALIGNMENT=4;)
2476887d61b2STaylor Simpson	fHIDE(int element_size = 4;)
2477887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2478887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2479887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2480887d61b2STaylor Simpson    fVFOREACH(32, i) {
2481887d61b2STaylor Simpson        EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT));
2482887d61b2STaylor Simpson        fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV);
2483887d61b2STaylor Simpson    }
2484887d61b2STaylor Simpson    fHIDE(fLOG_SCATTER_OP(4);)
2485887d61b2STaylor Simpson    fSCATTER_FINISH(1)
2486887d61b2STaylor Simpson})
2487887d61b2STaylor Simpson
2488887d61b2STaylor SimpsonEXTINSN(V6_vscattermh_add,  "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add",
2489887d61b2STaylor Simpson{
2490887d61b2STaylor Simpson    fHIDE(int i;)
2491887d61b2STaylor Simpson    fHIDE(int ALIGNMENT=2;)
2492887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2493887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2494887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2495887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2496887d61b2STaylor Simpson    fVFOREACH(16, i) {
2497887d61b2STaylor Simpson        EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT));
2498887d61b2STaylor Simpson        fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV);
2499887d61b2STaylor Simpson    }
2500887d61b2STaylor Simpson    fHIDE(fLOG_SCATTER_OP(2);)
2501887d61b2STaylor Simpson    fSCATTER_FINISH(1)
2502887d61b2STaylor Simpson})
2503887d61b2STaylor Simpson
2504887d61b2STaylor Simpson
2505887d61b2STaylor SimpsonEXTINSN(V6_vscattermwq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional",
2506887d61b2STaylor Simpson{
2507887d61b2STaylor Simpson    fHIDE(int i;)
2508887d61b2STaylor Simpson	fHIDE(int element_size = 4;)
2509887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2510887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2511887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2512887d61b2STaylor Simpson    fVFOREACH(32, i) {
2513887d61b2STaylor Simpson        EA = RtV+VvV.uw[i];
2514887d61b2STaylor Simpson        fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV);
2515887d61b2STaylor Simpson    }
2516887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2517887d61b2STaylor Simpson})
2518887d61b2STaylor Simpson
2519887d61b2STaylor SimpsonEXTINSN(V6_vscattermhq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional",
2520887d61b2STaylor Simpson{
2521887d61b2STaylor Simpson    fHIDE(int i;)
2522887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2523887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2524887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2525887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2526887d61b2STaylor Simpson    fVFOREACH(16, i) {
2527887d61b2STaylor Simpson        EA = RtV+VvV.uh[i];
2528887d61b2STaylor Simpson        fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV);
2529887d61b2STaylor Simpson    }
2530887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2531887d61b2STaylor Simpson})
2532887d61b2STaylor Simpson
2533887d61b2STaylor Simpson
2534887d61b2STaylor Simpson
2535887d61b2STaylor Simpson
2536887d61b2STaylor SimpsonEXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter Words",
2537887d61b2STaylor Simpson{
2538887d61b2STaylor Simpson    fHIDE(int i;)
2539887d61b2STaylor Simpson    fHIDE(int j;)
2540887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2541887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2542887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2543887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2544887d61b2STaylor Simpson    fVFOREACH(32, i) {
2545887d61b2STaylor Simpson        for(j = 0; j < 2; j++) {
2546887d61b2STaylor Simpson            EA = RtV+VvvV.v[j].uw[i];
2547887d61b2STaylor Simpson            fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV);
2548887d61b2STaylor Simpson        }
2549887d61b2STaylor Simpson    }
2550887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2551887d61b2STaylor Simpson})
2552887d61b2STaylor Simpson
2553887d61b2STaylor Simpson
/*
 * v6mpyvubs10_vxx: accumulating ("+=") form of the v6mpy ":v" variant.
 *
 * For the current index i (provided by the ITERATOR_INSN_* wrapper, which is
 * defined outside this section), six size2s_t coefficients are extracted with
 * fGET10BIT: c00..c02 from positions 0..2 of VvvV.v[0].uw[i] and c10..c12 from
 * positions 0..2 of VvvV.v[1].uw[i].
 *
 * uiV (declared as #u2 in the syntax string) picks one of four fixed pairings
 * of unsigned bytes from VuuV.v[0].uw[i] / VuuV.v[1].uw[i] with those
 * coefficients.  Each pair goes through fMPY16US and is added to the existing
 * value of VxxV.v[1].w[i] or VxxV.v[0].w[i]:
 *   uiV == 0 or 2: v[1] receives six products (c1x and c0x),
 *                  v[0] receives three (c1x only)
 *   uiV == 1 or 3: v[1] receives three products (c0x only),
 *                  v[0] receives six (c1x and c0x)
 * There is no branch for any other uiV value, so nothing is accumulated then.
 *
 * NOTE(review): the byte indices form the instruction's lookup pattern; do not
 * reorder or "simplify" them without checking the architecture reference.
 */
2554f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyvubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
2555f128c0feSTaylor Simpson    fHIDE(size2s_t c00;)
2556f128c0feSTaylor Simpson    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2557f128c0feSTaylor Simpson    fHIDE(size2s_t c01;)
2558f128c0feSTaylor Simpson    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2559f128c0feSTaylor Simpson    fHIDE(size2s_t c02;)
2560f128c0feSTaylor Simpson    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2561f128c0feSTaylor Simpson
2562f128c0feSTaylor Simpson	fHIDE(size2s_t c10;)
2563f128c0feSTaylor Simpson    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2564f128c0feSTaylor Simpson    fHIDE(size2s_t c11;)
2565f128c0feSTaylor Simpson    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2566f128c0feSTaylor Simpson    fHIDE(size2s_t c12;)
2567f128c0feSTaylor Simpson    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2568f128c0feSTaylor Simpson
2569f128c0feSTaylor Simpson    if (uiV == 0) {
2570f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2571f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2572f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2573f128c0feSTaylor Simpson
2574f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2575f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2576f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2577f128c0feSTaylor Simpson
2578f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
2579f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2580f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
2581f128c0feSTaylor Simpson
2582f128c0feSTaylor Simpson    } else if (uiV == 1) {
2583f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
2584f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
2585f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
2586f128c0feSTaylor Simpson
2587f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2588f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2589f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2590f128c0feSTaylor Simpson
2591f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2592f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2593f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2594f128c0feSTaylor Simpson
2595f128c0feSTaylor Simpson    } else if (uiV == 2) {
2596f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2597f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2598f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2599f128c0feSTaylor Simpson
2600f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2601f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2602f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2603f128c0feSTaylor Simpson
2604f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
2605f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
2606f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
2607f128c0feSTaylor Simpson
2608f128c0feSTaylor Simpson    } else if (uiV == 3) {
2609f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
2610f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2611f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
2612f128c0feSTaylor Simpson
2613f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2614f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2615f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2616f128c0feSTaylor Simpson
2617f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2618f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2619f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2620f128c0feSTaylor Simpson    }
2621f128c0feSTaylor Simpson)
/*
 * v6mpyhubs10_vxx: accumulating ("+=") form of the v6mpy ":h" variant.
 *
 * For the current index i (provided by the ITERATOR_INSN_* wrapper, which is
 * defined outside this section), six size2s_t coefficients are extracted with
 * fGET10BIT: c00..c02 from positions 0..2 of VvvV.v[0].uw[i] and c10..c12 from
 * positions 0..2 of VvvV.v[1].uw[i].
 *
 * uiV (declared as #u2 in the syntax string) selects which unsigned bytes of
 * VuuV.v[0].uw[i] / VuuV.v[1].uw[i] are multiplied (fMPY16US) with which
 * coefficient; every product is added to the existing VxxV word:
 *   uiV == 0 or 2: v[1] receives six products (c1x and c0x),
 *                  v[0] receives three (c1x only)
 *   uiV == 1 or 3: v[1] receives three products (c0x only),
 *                  v[0] receives six (c1x and c0x)
 * The coefficient split per uiV is the same as in v6mpyvubs10_vxx; what
 * differs is which Vuu byte is paired with each coefficient.
 * There is no branch for any other uiV value, so nothing is accumulated then.
 *
 * NOTE(review): the byte indices form the instruction's lookup pattern; do not
 * reorder or "simplify" them without checking the architecture reference.
 */
2622f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_VX_FWD(32, v6mpyhubs10_vxx, "Vxx32.w+=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
2623f128c0feSTaylor Simpson    fHIDE(size2s_t c00;)
2624f128c0feSTaylor Simpson    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2625f128c0feSTaylor Simpson    fHIDE(size2s_t c01;)
2626f128c0feSTaylor Simpson    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2627f128c0feSTaylor Simpson    fHIDE(size2s_t c02;)
2628f128c0feSTaylor Simpson    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2629f128c0feSTaylor Simpson    fHIDE(size2s_t c10;)
2630f128c0feSTaylor Simpson    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2631f128c0feSTaylor Simpson    fHIDE(size2s_t c11;)
2632f128c0feSTaylor Simpson    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2633f128c0feSTaylor Simpson    fHIDE(size2s_t c12;)
2634f128c0feSTaylor Simpson    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2635f128c0feSTaylor Simpson
2636f128c0feSTaylor Simpson    if (uiV == 0) {
2637f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2638f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2639f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2640f128c0feSTaylor Simpson
2641f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2642f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2643f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2644f128c0feSTaylor Simpson
2645f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
2646f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2647f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
2648f128c0feSTaylor Simpson
2649f128c0feSTaylor Simpson    } else if (uiV == 1) {
2650f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
2651f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
2652f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
2653f128c0feSTaylor Simpson
2654f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2655f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2656f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2657f128c0feSTaylor Simpson
2658f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2659f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2660f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2661f128c0feSTaylor Simpson
2662f128c0feSTaylor Simpson    }  else if (uiV == 2) {
2663f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2664f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2665f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2666f128c0feSTaylor Simpson
2667f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2668f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2669f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2670f128c0feSTaylor Simpson
2671f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
2672f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
2673f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
2674f128c0feSTaylor Simpson
2675f128c0feSTaylor Simpson    } else if (uiV == 3) {
2676f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
2677f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2678f128c0feSTaylor Simpson        VxxV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
2679f128c0feSTaylor Simpson
2680f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2681f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2682f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2683f128c0feSTaylor Simpson
2684f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2685f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2686f128c0feSTaylor Simpson        VxxV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2687f128c0feSTaylor Simpson    }
2688f128c0feSTaylor Simpson)
2689f128c0feSTaylor Simpson
2690f128c0feSTaylor Simpson
/*
 * v6mpyvubs10: non-accumulating ("=") form of the v6mpy ":v" variant.
 *
 * For the current index i (provided by the ITERATOR_INSN_* wrapper, which is
 * defined outside this section), six `short` coefficients are extracted with
 * fGET10BIT: c00..c02 from positions 0..2 of VvvV.v[0].uw[i] and c10..c12 from
 * positions 0..2 of VvvV.v[1].uw[i].
 *
 * uiV (declared as #u2 in the syntax string) selects which unsigned bytes of
 * VuuV.v[0].uw[i] / VuuV.v[1].uw[i] are multiplied (fMPY16US) with which
 * coefficient.  In every branch the first product for VddV.v[1].w[i] and for
 * VddV.v[0].w[i] is stored with "=", and the remaining ones are added, so the
 * previous contents of Vdd do not contribute:
 *   uiV == 0 or 2: v[1] is the sum of six products (c1x and c0x),
 *                  v[0] is the sum of three (c1x only)
 *   uiV == 1 or 3: v[1] is the sum of three products (c0x only),
 *                  v[0] is the sum of six (c1x and c0x)
 * There is no branch for any other uiV value; Vdd is not written in that case.
 *
 * NOTE(review): the byte indices form the instruction's lookup pattern; do not
 * reorder or "simplify" them without checking the architecture reference.
 */
2691f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyvubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):v", "",
2692f128c0feSTaylor Simpson    fHIDE(short c00;)
2693f128c0feSTaylor Simpson    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2694f128c0feSTaylor Simpson    fHIDE(short c01;)
2695f128c0feSTaylor Simpson    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2696f128c0feSTaylor Simpson    fHIDE(short c02;)
2697f128c0feSTaylor Simpson    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2698f128c0feSTaylor Simpson    fHIDE(short c10;)
2699f128c0feSTaylor Simpson    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2700f128c0feSTaylor Simpson    fHIDE(short c11;)
2701f128c0feSTaylor Simpson    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2702f128c0feSTaylor Simpson    fHIDE(short c12;)
2703f128c0feSTaylor Simpson    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2704f128c0feSTaylor Simpson
2705f128c0feSTaylor Simpson
2706f128c0feSTaylor Simpson
2707f128c0feSTaylor Simpson    if (uiV == 0) {
2708f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2709f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2710f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2711f128c0feSTaylor Simpson
2712f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2713f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2714f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2715f128c0feSTaylor Simpson
2716f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c10);
2717f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2718f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c12);
2719f128c0feSTaylor Simpson
2720f128c0feSTaylor Simpson    }  else if (uiV == 1) {
2721f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c00);
2722f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c01);
2723f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c02);
2724f128c0feSTaylor Simpson
2725f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c10);
2726f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c11);
2727f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c12);
2728f128c0feSTaylor Simpson
2729f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c00);
2730f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2731f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c02);
2732f128c0feSTaylor Simpson
2733f128c0feSTaylor Simpson    }  else if (uiV == 2) {
2734f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2735f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2736f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2737f128c0feSTaylor Simpson
2738f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2739f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2740f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2741f128c0feSTaylor Simpson
2742f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c10);
2743f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c11);
2744f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c12);
2745f128c0feSTaylor Simpson
2746f128c0feSTaylor Simpson    } else if (uiV == 3) {
2747f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c00);
2748f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2749f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c02);
2750f128c0feSTaylor Simpson
2751f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c10);
2752f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2753f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c12);
2754f128c0feSTaylor Simpson
2755f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c00);
2756f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c01);
2757f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c02);
2758f128c0feSTaylor Simpson    }
2759f128c0feSTaylor Simpson)
2760f128c0feSTaylor Simpson
/*
 * v6mpyhubs10: non-accumulating ("=") form of the v6mpy ":h" variant.
 *
 * For the current index i (provided by the ITERATOR_INSN_* wrapper, which is
 * defined outside this section), six `short` coefficients are extracted with
 * fGET10BIT: c00..c02 from positions 0..2 of VvvV.v[0].uw[i] and c10..c12 from
 * positions 0..2 of VvvV.v[1].uw[i].
 *
 * uiV (declared as #u2 in the syntax string) selects which unsigned bytes of
 * VuuV.v[0].uw[i] / VuuV.v[1].uw[i] are multiplied (fMPY16US) with which
 * coefficient.  In every branch the first product for VddV.v[1].w[i] and for
 * VddV.v[0].w[i] is stored with "=", and the remaining ones are added, so the
 * previous contents of Vdd do not contribute:
 *   uiV == 0 or 2: v[1] is the sum of six products (c1x and c0x),
 *                  v[0] is the sum of three (c1x only)
 *   uiV == 1 or 3: v[1] is the sum of three products (c0x only),
 *                  v[0] is the sum of six (c1x and c0x)
 * The coefficient split per uiV matches v6mpyvubs10; what differs is which
 * Vuu byte is paired with each coefficient.
 * There is no branch for any other uiV value; Vdd is not written in that case.
 *
 * NOTE(review): the byte indices form the instruction's lookup pattern; do not
 * reorder or "simplify" them without checking the architecture reference.
 */
2761f128c0feSTaylor SimpsonITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32, v6mpyhubs10, "Vdd32.w=v6mpy(Vuu32.ub,Vvv32.b,#u2):h", "",
2762f128c0feSTaylor Simpson    fHIDE(short c00;)
2763f128c0feSTaylor Simpson    fGET10BIT(c00, VvvV.v[0].uw[i], 0)
2764f128c0feSTaylor Simpson    fHIDE(short c01;)
2765f128c0feSTaylor Simpson    fGET10BIT(c01, VvvV.v[0].uw[i], 1)
2766f128c0feSTaylor Simpson    fHIDE(short c02;)
2767f128c0feSTaylor Simpson    fGET10BIT(c02, VvvV.v[0].uw[i], 2)
2768f128c0feSTaylor Simpson    fHIDE(short c10;)
2769f128c0feSTaylor Simpson    fGET10BIT(c10, VvvV.v[1].uw[i], 0)
2770f128c0feSTaylor Simpson    fHIDE(short c11;)
2771f128c0feSTaylor Simpson    fGET10BIT(c11, VvvV.v[1].uw[i], 1)
2772f128c0feSTaylor Simpson    fHIDE(short c12;)
2773f128c0feSTaylor Simpson    fGET10BIT(c12, VvvV.v[1].uw[i], 2)
2774f128c0feSTaylor Simpson
2775f128c0feSTaylor Simpson    if (uiV == 0) {
2776f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2777f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2778f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2779f128c0feSTaylor Simpson
2780f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2781f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2782f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2783f128c0feSTaylor Simpson
2784f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c10);
2785f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c11);
2786f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c12);
2787f128c0feSTaylor Simpson
2788f128c0feSTaylor Simpson    }  else if (uiV == 1) {
2789f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c00);
2790f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c01);
2791f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c02);
2792f128c0feSTaylor Simpson
2793f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(3,VuuV.v[1].uw[i]), c10);
2794f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c11);
2795f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c12);
2796f128c0feSTaylor Simpson
2797f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[1].uw[i]), c00);
2798f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c01);
2799f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c02);
2800f128c0feSTaylor Simpson
2801f128c0feSTaylor Simpson    }  else if (uiV == 2) {
2802f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2803f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2804f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2805f128c0feSTaylor Simpson
2806f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2807f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2808f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2809f128c0feSTaylor Simpson
2810f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c10);
2811f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c11);
2812f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c12);
2813f128c0feSTaylor Simpson
2814f128c0feSTaylor Simpson    } else if (uiV == 3) {
2815f128c0feSTaylor Simpson        VddV.v[1].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c00);
2816f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c01);
2817f128c0feSTaylor Simpson        VddV.v[1].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c02);
2818f128c0feSTaylor Simpson
2819f128c0feSTaylor Simpson        VddV.v[0].w[i]  = fMPY16US(fGETUBYTE(1,VuuV.v[1].uw[i]), c10);
2820f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(3,VuuV.v[0].uw[i]), c11);
2821f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(1,VuuV.v[0].uw[i]), c12);
2822f128c0feSTaylor Simpson
2823f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[1].uw[i]), c00);
2824f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(2,VuuV.v[0].uw[i]), c01);
2825f128c0feSTaylor Simpson        VddV.v[0].w[i] += fMPY16US(fGETUBYTE(0,VuuV.v[0].uw[i]), c02);
2826f128c0feSTaylor Simpson    }
2827f128c0feSTaylor Simpson)
2828f128c0feSTaylor Simpson
2829887d61b2STaylor Simpson
/*
 * V6_vscattermhwq: scatter of halfwords addressed by word offsets, gated by
 * the Qs predicate operand ("if (Qs4)" in the syntax string).
 *
 * The body fixes element_size at 2, passes RtV/MuV through fSCATTER_INIT,
 * fVLASTBYTE and fVALIGN, and then, for every index i of fVFOREACH(32, i) and
 * for both vectors j of the Vvv pair, forms EA = RtV + VvvV.v[j].uw[i] and
 * hands element (2*i+j) of VwV, together with QsV, to
 * fVLOG_VTCM_HALFWORDQ_DV.  fSCATTER_FINISH(0) ends the operation.
 *
 * NOTE(review): how QsV masks individual elements is decided inside
 * fVLOG_VTCM_HALFWORDQ_DV, which is defined outside this section - confirm
 * there.
 */
2830887d61b2STaylor SimpsonEXTINSN(V6_vscattermhwq,  "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional",
2831887d61b2STaylor Simpson{
2832887d61b2STaylor Simpson    fHIDE(int i;)
2833887d61b2STaylor Simpson    fHIDE(int j;)
2834887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2835887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2836887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2837887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2838887d61b2STaylor Simpson    fVFOREACH(32, i) {
2839887d61b2STaylor Simpson        for(j = 0; j < 2; j++) {
2840887d61b2STaylor Simpson            EA = RtV+VvvV.v[j].uw[i];
2841887d61b2STaylor Simpson            fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV);
2842887d61b2STaylor Simpson        }
2843887d61b2STaylor Simpson    }
2844887d61b2STaylor Simpson    fSCATTER_FINISH(0)
2845887d61b2STaylor Simpson})
2846887d61b2STaylor Simpson
/*
 * V6_vscattermhw_add: scatter-accumulate ("+=") of halfwords addressed by
 * word offsets.
 *
 * Differences from the plain V6_vscattermhw body that are visible here:
 *   - a local ALIGNMENT of 2 is declared, and each offset goes through
 *     fVALIGN(VvvV.v[j].uw[i], ALIGNMENT) before being added to RtV;
 *   - elements are recorded with fVLOG_VTCM_HALFWORD_INCREMENT_DV, which also
 *     receives ALIGNMENT;
 *   - fLOG_SCATTER_OP(2) is issued after the loop, and the operation is closed
 *     with fSCATTER_FINISH(1) instead of fSCATTER_FINISH(0).
 * Iteration is otherwise the same: every index i of fVFOREACH(32, i), both
 * vectors j of the Vvv pair, source element (2*i+j) of VwV.
 *
 * NOTE(review): the meaning of the arguments to fLOG_SCATTER_OP and
 * fSCATTER_FINISH is defined outside this section - confirm there.
 */
2847887d61b2STaylor SimpsonEXTINSN(V6_vscattermhw_add,  "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add",
2848887d61b2STaylor Simpson{
2849887d61b2STaylor Simpson    fHIDE(int i;)
2850887d61b2STaylor Simpson    fHIDE(int j;)
2851887d61b2STaylor Simpson    fHIDE(int ALIGNMENT=2;)
2852887d61b2STaylor Simpson	fHIDE(int element_size = 2;)
2853887d61b2STaylor Simpson    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2854887d61b2STaylor Simpson    fVLASTBYTE(MuV, element_size);
2855887d61b2STaylor Simpson    fVALIGN(RtV, element_size);
2856887d61b2STaylor Simpson    fVFOREACH(32, i) {
2857887d61b2STaylor Simpson        for(j = 0; j < 2; j++) {
2858*29ea1946SZhao Liu             EA =  RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT);
2859887d61b2STaylor Simpson             fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV);
2860887d61b2STaylor Simpson        }
2861887d61b2STaylor Simpson    }
2862887d61b2STaylor Simpson    fHIDE(fLOG_SCATTER_OP(2);)
2863887d61b2STaylor Simpson    fSCATTER_FINISH(1)
2864887d61b2STaylor Simpson})
2865887d61b2STaylor Simpson
/*
 * V6_vprefixqb: running (prefix) count of the Qv predicate bits, one result
 * per byte.
 *
 * acc starts at 0 and is carried across iterations of fVFOREACH(8, i); on
 * each step bit i of QvV (fGETQBIT) is added and the running total is written
 * to VdV.ub[i].  acc is declared size1u_t, so the total is kept at that
 * type's width.
 */
2866887d61b2STaylor SimpsonEXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into byte",
2867887d61b2STaylor Simpson{
2868887d61b2STaylor Simpson    fHIDE(int i;)
2869887d61b2STaylor Simpson    fHIDE(size1u_t acc = 0;)
2870887d61b2STaylor Simpson    fVFOREACH(8, i) {
2871887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i);
2872887d61b2STaylor Simpson        VdV.ub[i] = acc;
2873887d61b2STaylor Simpson    }
2874887d61b2STaylor Simpson    } )
/*
 * V6_vprefixqh: running (prefix) count of the Qv predicate bits, one result
 * per halfword.
 *
 * acc starts at 0 and is carried across iterations of fVFOREACH(16, i); on
 * each step the two QvV bits at positions i*2+0 and i*2+1 (fGETQBIT) are
 * added and the running total is written to VdV.uh[i].  acc is declared
 * size2u_t, so the total is kept at that type's width.
 */
2875887d61b2STaylor SimpsonEXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into halfwords",
2876887d61b2STaylor Simpson{
2877887d61b2STaylor Simpson    fHIDE(int i;)
2878887d61b2STaylor Simpson    fHIDE(size2u_t acc = 0;)
2879887d61b2STaylor Simpson    fVFOREACH(16, i) {
2880887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*2+0);
2881887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*2+1);
2882887d61b2STaylor Simpson        VdV.uh[i] = acc;
2883887d61b2STaylor Simpson    }
2884887d61b2STaylor Simpson    } )
/*
 * V6_vprefixqw: running (prefix) count of the Qv predicate bits, one result
 * per word.
 *
 * acc starts at 0 and is carried across iterations of fVFOREACH(32, i); on
 * each step the four QvV bits at positions i*4+0 .. i*4+3 (fGETQBIT) are
 * added and the running total is written to VdV.uw[i].  acc is declared
 * size4u_t, so the total is kept at that type's width.
 */
2885887d61b2STaylor SimpsonEXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into words",
2886887d61b2STaylor Simpson{
2887887d61b2STaylor Simpson    fHIDE(int i;)
2888887d61b2STaylor Simpson    fHIDE(size4u_t acc = 0;)
2889887d61b2STaylor Simpson    fVFOREACH(32, i) {
2890887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*4+0);
2891887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*4+1);
2892887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*4+2);
2893887d61b2STaylor Simpson        acc += fGETQBIT(QvV,i*4+3);
2894887d61b2STaylor Simpson        VdV.uw[i] = acc;
2895887d61b2STaylor Simpson    }
2896887d61b2STaylor Simpson    } )
2897887d61b2STaylor Simpson
2898887d61b2STaylor Simpson
2899887d61b2STaylor Simpson
2900887d61b2STaylor Simpson
2901887d61b2STaylor Simpson
2902887d61b2STaylor Simpson/******************************************************************************
2903887d61b2STaylor Simpson DEBUG Vector/Register Printing
2904887d61b2STaylor Simpson ******************************************************************************/
2905887d61b2STaylor Simpson
/*
 * PRINT_VU(TYPE, TYPE2, COUNT): debug helper that dumps vector operand Vu to
 * stdout.
 *   TYPE  - member of VuV to index (expanded as VuV.TYPE[i])
 *   TYPE2 - printf conversion for one element, given as a string literal; it
 *           is joined with the trailing " " by string-literal concatenation
 *   COUNT - right-shift applied to fVBYTES() to get the number of elements
 *           printed; the loop condition parses as i < (vec_len >> COUNT)
 * The expansion declares `i` and `vec_len` directly in the caller's scope and
 * expects VuN and VuV to be visible there, so it can appear only once per
 * scope.
 * NOTE(review): "\\n" is an escaped backslash followed by 'n'; compiled as
 * plain C this prints the two characters \n rather than a newline.  Confirm
 * whether the extra escaping is intended for a later stringification step.
 * NOTE(review): the last line of the macro ends with a continuation
 * backslash, so the line after it is part of the definition and must stay
 * blank.
 */
2906887d61b2STaylor Simpson#define PRINT_VU(TYPE, TYPE2, COUNT)\
2907887d61b2STaylor Simpson    int i;  \
2908887d61b2STaylor Simpson    size4u_t vec_len = fVBYTES();\
2909887d61b2STaylor Simpson    fprintf(stdout,"V%2d: ",VuN);  \
2910887d61b2STaylor Simpson    for (i=0;i<vec_len>>COUNT;i++) {         \
2911887d61b2STaylor Simpson        fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \
2912887d61b2STaylor Simpson    };  \
2913887d61b2STaylor Simpson    fprintf(stdout,"\\n");  \
2914887d61b2STaylor Simpson	fflush(stdout);\
2915887d61b2STaylor Simpson
/*
 * Remove the ATTR_VMEM* helper macros once the instruction definitions are
 * done.  NOTE(review): their #define lines are not in this part of the file
 * (presumably earlier in it) - confirm.  The #endif below closes the
 * "#ifndef NO_MMVEC" guard opened near the top of the file.
 */
2916887d61b2STaylor Simpson#undef ATTR_VMEM
2917887d61b2STaylor Simpson#undef ATTR_VMEMU
2918887d61b2STaylor Simpson#undef ATTR_VMEM_NT
2919887d61b2STaylor Simpson
2920887d61b2STaylor Simpson#endif /* NO_MMVEC */
2921887d61b2STaylor Simpson
/*
 * Undo the fallback made at the top of the file: __SELF_DEF_EXTINSN is set
 * only when this file itself defined EXTINSN (as Q6INSN), so EXTINSN is
 * removed here only in that case and an includer-provided EXTINSN is left
 * untouched.
 */
2922887d61b2STaylor Simpson#ifdef __SELF_DEF_EXTINSN
2923887d61b2STaylor Simpson#undef EXTINSN
2924887d61b2STaylor Simpson#undef __SELF_DEF_EXTINSN
2925887d61b2STaylor Simpson#endif
2926