1/*
2 *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3 *
4 *  This program is free software; you can redistribute it and/or modify
5 *  it under the terms of the GNU General Public License as published by
6 *  the Free Software Foundation; either version 2 of the License, or
7 *  (at your option) any later version.
8 *
9 *  This program is distributed in the hope that it will be useful,
10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 *  GNU General Public License for more details.
13 *
14 *  You should have received a copy of the GNU General Public License
15 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18/******************************************************************************
19 *
20 *     HOYA: MULTI MEDIA INSTRUCITONS
21 *
22 ******************************************************************************/
23
24#ifndef EXTINSN
25#define EXTINSN Q6INSN
26#define __SELF_DEF_EXTINSN 1
27#endif
28
29#ifndef NO_MMVEC
30
31#define DO_FOR_EACH_CODE(WIDTH, CODE) \
32{ \
33    fHIDE(int i;) \
34    fVFOREACH(WIDTH, i) {\
35        CODE ;\
36    } \
37}
38
39
40
41
42#define ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
43EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),  \
44DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
45
46
47
48#define ITERATOR_INSN2_ANY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
49ITERATOR_INSN_ANY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
50
51#define ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
52EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),  \
53DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
54
55
56#define ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
57ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
58
59
60#define ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
61EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
62DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
63
64
65
66#define ITERATOR_INSN_SHIFT_SLOT_VV_LATE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
67EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
68DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
69
70#define ITERATOR_INSN2_SHIFT_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
71ITERATOR_INSN_SHIFT_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
72
73#define ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,DESCR,CODE) \
74EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
75DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
76
77#define ITERATOR_INSN2_PERMUTE_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
78ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
79
80#define ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \
81EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
82
83
84#define ITERATOR_INSN2_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
85ITERATOR_INSN_PERMUTE_SLOT_DEP(WIDTH,TAG,SYNTAX2,DESCR,CODE)
86
87#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
88EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
89DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
90
91#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEP(WIDTH,TAG,SYNTAX,DESCR,CODE) \
92EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
93DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
94
95#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
96ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
97
98#define ITERATOR_INSN_MPY_SLOT(WIDTH,TAG, SYNTAX,DESCR,CODE) \
99EXTINSN(V6_##TAG, SYNTAX, \
100ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
101DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
102
103#define ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,DESCR,CODE) \
104EXTINSN(V6_##TAG, SYNTAX, \
105ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
106DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
107
108#define ITERATOR_INSN2_MPY_SLOT(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
109ITERATOR_INSN_MPY_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
110
111#define ITERATOR_INSN2_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX,SYNTAX2,DESCR,CODE) \
112ITERATOR_INSN_MPY_SLOT_LATE(WIDTH,TAG, SYNTAX2,DESCR,CODE)
113
114
115#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
116EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
117DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
118
119#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
120ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(WIDTH,TAG,SYNTAX2,DESCR,CODE)
121
122
123
124
125#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC2(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
126EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_CVI_VX_VSRC0_IS_DST), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
127
128#define ITERATOR_INSN_SLOT2_DOUBLE_VEC(WIDTH,TAG,SYNTAX,DESCR,CODE) \
129EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV,A_RESTRICT_SLOT2ONLY), DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
130
131#define ITERATOR_INSN_VHISTLIKE(WIDTH,TAG,SYNTAX,DESCR,CODE) \
132EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT),  \
133DESCR, fHIDE(mmvector_t input;) input = fTMPVDATA(); DO_FOR_EACH_CODE(WIDTH, CODE))
134
135
136
137
138
139/******************************************************************************************
140*
141* MMVECTOR MEMORY OPERATIONS - NO NAPALI V1
142*
143*******************************************************************************************/
144
145
146
147#define ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
148EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX_DV),  \
149DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
150
151#define ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
152ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
153
154
155
156#define ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
157EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  \
158DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
159
160#define ITERATOR_INSN2_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
161ITERATOR_INSN_SHIFT_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
162
163
164#define ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
165EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),  \
166DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
167
168#define ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
169ITERATOR_INSN_ANY_SLOT_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
170
171
172#define ITERATOR_INSN_MPY_SLOT_NOV1(WIDTH,TAG, SYNTAX,DESCR,CODE) \
173EXTINSN(V6_##TAG, SYNTAX, \
174ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),  \
175DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
176
177#define ITERATOR_INSN_PERMUTE_SLOT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
178EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),  \
179DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
180
181#define ITERATOR_INSN2_PERMUTE_SLOTT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
182ITERATOR_INSN_PERMUTE_SLOT(WIDTH,TAG,SYNTAX2,DESCR,CODE)
183
184#define ITERATOR_INSN_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
185EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
186
187
188#define ITERATOR_INSN2_PERMUTE_SLOT_DEPT_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
189ITERATOR_INSN_PERMUTE_SLOT_DEP_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
190
191#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
192EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
193DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
194
195#define ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_DEPT_NOV1(WIDTH,TAG,SYNTAX,DESCR,CODE) \
196EXTINSN(V6_##TAG, SYNTAX, ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),  \
197DESCR, DO_FOR_EACH_CODE(WIDTH, CODE))
198
199#define ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX,SYNTAX2,DESCR,CODE) \
200ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC_NOV1(WIDTH,TAG,SYNTAX2,DESCR,CODE)
201
202#define NARROWING_SHIFT_NOV1(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
203ITERATOR_INSN_SHIFT_SLOT_NOV1(ITERSIZE,TAG, \
204"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
205"Vector shift right and shuffle", \
206    fHIDE(int )shamt = RtV & SHAMTMASK; \
207    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
208    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
209
210#define MMVEC_AVGS_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
211ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",          "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",          "Vector Average "DESCR,                                      VdV.DEST[i]  = fVAVGS(       WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
212ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",      "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",      "Vector Average % Round"DESCR,                               VdV.DEST[i]  = fVAVGSRND(    WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
213ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vnavg##TYPE,                       "Vd32=vnavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")",         "Vector Negative Average "DESCR,                             VdV.DEST[i]  = fVNAVGS(      WIDTH,  VuV.SRC[i], VvV.SRC[i]))
214
215  #define MMVEC_AVGU_NOV1(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
216ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",        "Vector Average "DESCR,                                      VdV.DEST[i] = fVAVGU(   WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
217ITERATOR_INSN2_ANY_SLOT_NOV1(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",     "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",    "Vector Average % Round"DESCR,                               VdV.DEST[i] = fVAVGURND(WIDTH,  VuV.SRC[i], VvV.SRC[i]))
218
219
220
221/******************************************************************************************
222*
223* MMVECTOR MEMORY OPERATIONS
224*
225*******************************************************************************************/
226
227#define MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,BEH) \
228EXTINSN(V6_##TAG##_pi,      SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_I(RxV,VEC_SCALE(siV)); }) \
229EXTINSN(V6_##TAG##_ai,      SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_RI(RtV,VEC_SCALE(siV)); BEH;}) \
230EXTINSN(V6_##TAG##_ppu,      SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ fEA_REG(RxV); BEH; fPM_M(RxV,MuV); }) \
231
232
233#define MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
234EXTINSN(V6_##TAG##_pred_pi,      "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB, ATTRIB,DESCR, { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
235EXTINSN(V6_##TAG##_pred_ai,      "if (" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB, ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
236EXTINSN(V6_##TAG##_pred_ppu,     "if (" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,  { if (fLSBOLD(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}}) \
237
238#define MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
239EXTINSN(V6_##TAG##_npred_pi,     "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++#s3)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_I(RxV,siV*fVECSIZE()); } else {CANCEL;}}) \
240EXTINSN(V6_##TAG##_npred_ai,     "if (!" #SYNTAXP "4) " SYNTAXA "(Rt32+#s4)" NT SYNTAXB,ATTRIB,DESCR, { if (fLSBOLDNOT(SYNTAXP##V)) { fEA_RI(RtV,siV*fVECSIZE()); BEH;} else {CANCEL;}}) \
241EXTINSN(V6_##TAG##_npred_ppu,    "if (!" #SYNTAXP "4) " SYNTAXA "(Rx32++Mu2)" NT SYNTAXB,ATTRIB,DESCR,{ if (fLSBOLDNOT(SYNTAXP##V)) { fEA_REG(RxV); BEH; fPM_M(RxV,MuV); } else {CANCEL;}})
242
243#define MMVEC_COND_EACH_EA(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
244MMVEC_COND_EACH_EA_TRUE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH) \
245MMVEC_COND_EACH_EA_FALSE(TAG,DESCR,ATTRIB,NT,SYNTAXA,SYNTAXB,SYNTAXP,BEH)
246
247
248#define VEC_SCALE(X) X*fVECSIZE()
249
250
251#define MMVEC_LD(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmem","",fLOADMMV(EA,VdV))
252#define MMVEC_LDC(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_cur,DESCR,ATTRIB,NT,"Vd32.cur=vmem","",fLOADMMV(EA,VdV))
253#define MMVEC_LDT(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG##_tmp,DESCR,ATTRIB,NT,"Vd32.tmp=vmem","",fLOADMMV(EA,VdV))
254#define MMVEC_LDU(TAG,DESCR,ATTRIB,NT) MMVEC_EACH_EA(TAG,DESCR,ATTRIB,NT,"Vd32=vmemu","",fLOADMMVU(EA,VdV))
255
256
257#define MMVEC_STQ(TAG,DESCR,ATTRIB,NT) \
258MMVEC_EACH_EA(TAG##_qpred,DESCR,ATTRIB,NT,"if (Qv4) vmem","=Vs32",fSTOREMMVQ(EA,VsV,QvV)) \
259MMVEC_EACH_EA(TAG##_nqpred,DESCR,ATTRIB,NT,"if (!Qv4) vmem","=Vs32",fSTOREMMVNQ(EA,VsV,QvV))
260
261/****************************************************************
262* MAPPING FOR VMEMs
263****************************************************************/
264
265#define ATTR_VMEM A_EXTENSION,A_CVI,A_CVI_VM
266#define ATTR_VMEMU A_EXTENSION,A_CVI,A_CVI_VM,A_CVI_VP
267
268
269MMVEC_LD(vL32b,  "Aligned Vector Load",        ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),)
270MMVEC_LDC(vL32b,  "Aligned Vector Load Cur",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_NEW,A_CVI_VA),)
271MMVEC_LDT(vL32b,  "Aligned Vector Load Tmp",	ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),)
272
273MMVEC_COND_EACH_EA(vL32b,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA),,"Vd32=vmem",,Pv,fLOADMMV(EA,VdV);)
274MMVEC_COND_EACH_EA(vL32b_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",,Pv,fLOADMMV(EA,VdV);)
275MMVEC_COND_EACH_EA(vL32b_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM,A_LOAD,A_CVI_TMP),,"Vd32.tmp=vmem",,Pv,fLOADMMV(EA,VdV);)
276
277MMVEC_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",fSTOREMMV(EA,VsV))
278MMVEC_COND_EACH_EA(vS32b,"Aligned Vector Store",ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),,"vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
279
280
281MMVEC_STQ(vS32b,  "Aligned Vector Store",      ATTRIBS(ATTR_VMEM,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),)
282
283MMVEC_LDU(vL32Ub, "Unaligned Vector Load",     ATTRIBS(ATTR_VMEMU,A_LOAD,A_RESTRICT_NOSLOT1),)
284
285MMVEC_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",fSTOREMMVU(EA,VsV))
286
287MMVEC_COND_EACH_EA(vS32Ub,"Unaligned Vector Store",ATTRIBS(ATTR_VMEMU,A_STORE,A_RESTRICT_NOSLOT1),,"vmemu","=Vs32",Pv,fSTOREMMVU(EA,VsV))
288
289MMVEC_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
290
291// V65 store relase, zero byte store
292MMVEC_EACH_EA(vS32b_srls,"Aligned Vector Scatter Release",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_SCATTER_RELEASE,A_CVI_NEW,A_RESTRICT_SLOT0ONLY),,"vmem",":scatter_release",fSTORERELEASE(EA,0))
293
294
295
296MMVEC_COND_EACH_EA(vS32b_new,"Aligned Vector Store New",ATTRIBS(ATTR_VMEM,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),,"vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
297
298
299/******************************************************************************************
300*
301* MMVECTOR MEMORY OPERATIONS - NON TEMPORAL
302*
303*******************************************************************************************/
304
305#define ATTR_VMEM_NT A_EXTENSION,A_CVI,A_CVI_VM
306
307MMVEC_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",fSTOREMMV(EA,VsV))
308MMVEC_COND_EACH_EA(vS32b_nt,"Aligned Vector Store - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt","vmem","=Vs32",Pv,fSTOREMMV(EA,VsV))
309
310MMVEC_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",fSTOREMMV(EA,fNEWVREG(OsN)))
311MMVEC_COND_EACH_EA(vS32b_nt_new,"Aligned Vector Store New - Non temporal",ATTRIBS(ATTR_VMEM_NT,A_STORE,A_CVI_NEW,A_DOTNEWVALUE,A_RESTRICT_SLOT0ONLY),":nt","vmem","=Os8.new",Pv,fSTOREMMV(EA,fNEWVREG(OsN)))
312
313
314MMVEC_STQ(vS32b_nt,  "Aligned Vector Store - Non temporal",      ATTRIBS(ATTR_VMEM_NT,A_STORE,A_RESTRICT_SLOT0ONLY,A_CVI_VA),":nt")
315
316MMVEC_LD(vL32b_nt,  "Aligned Vector Load - Non temporal",       ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_VA),":nt")
317MMVEC_LDC(vL32b_nt,  "Aligned Vector Load Cur - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_NEW,A_CVI_VA),":nt")
318MMVEC_LDT(vL32b_nt,  "Aligned Vector Load Tmp - Non temporal",	ATTRIBS(ATTR_VMEM_NT,A_LOAD,A_CVI_TMP),":nt")
319
320MMVEC_COND_EACH_EA(vL32b_nt,"Conditional Aligned Vector Load",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA),,"Vd32=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
321MMVEC_COND_EACH_EA(vL32b_nt_cur,"Conditional Aligned Vector Load Cur",ATTRIBS(ATTR_VMEM_NT,A_CVI_VA,A_CVI_NEW),,"Vd32.cur=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
322MMVEC_COND_EACH_EA(vL32b_nt_tmp,"Conditional Aligned Vector Load Tmp",ATTRIBS(ATTR_VMEM_NT,A_CVI_TMP),,"Vd32.tmp=vmem",":nt",Pv,fLOADMMV(EA,VdV);)
323
324
325#undef VEC_SCALE
326
327
328/***************************************************
329 * Vector Alignment
330 ************************************************/
331
332#define VALIGNB(SHIFT)  \
333    fHIDE(int i;) \
334    for(i = 0; i < fVBYTES(); i++) {\
335        VdV.ub[i] = (i+SHIFT>=fVBYTES()) ? VuV.ub[i+SHIFT-fVBYTES()] : VvV.ub[i+SHIFT];\
336	}
337
338EXTINSN(V6_valignb,  "Vd32=valign(Vu32,Vv32,Rt8)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
339{
340	unsigned shift = RtV & (fVBYTES()-1);
341	VALIGNB(shift)
342})
343EXTINSN(V6_vlalignb, "Vd32=vlalign(Vu32,Vv32,Rt8)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by Rt8 as control",
344{
345	unsigned shift = fVBYTES() - (RtV & (fVBYTES()-1));
346	VALIGNB(shift)
347})
348EXTINSN(V6_valignbi, "Vd32=valign(Vu32,Vv32,#u3)", 	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
349{
350	VALIGNB(uiV)
351})
352EXTINSN(V6_vlalignbi,"Vd32=vlalign(Vu32,Vv32,#u3)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),"Align Two vectors by #u3 as control",
353{
354	unsigned shift = fVBYTES() - uiV;
355	VALIGNB(shift)
356})
357
358EXTINSN(V6_vror, "Vd32=vror(Vu32,Rt32)", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
359"Align Two vectors by Rt32 as control",
360{
361	fHIDE(int k;)
362	for (k=0;k<fVBYTES();k++) {
363		VdV.ub[k] = VuV.ub[(k+RtV)&(fVBYTES()-1)];
364	}
365	})
366
367
368
369
370
371
372
373/**************************************************************
374* Unpack elements with zero/sign extend and cross lane permute
375***************************************************************/
376
377ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackub,  "Vdd32=vunpackub(Vu32)", "Vdd32.uh=vunpack(Vu32.ub)", "Unpack byte with zero-extend",     fVARRAY_ELEMENT_ACCESS(VddV, uh, i)  = fZE8_16( VuV.ub[i]))
378ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8,vunpackb,   "Vdd32=vunpackb(Vu32)",  "Vdd32.h=vunpack(Vu32.b)",   "Unpack bytes with sign-extend",    fVARRAY_ELEMENT_ACCESS(VddV, h,  i)  = fSE8_16( VuV.b[i] ))
379ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackuh, "Vdd32=vunpackuh(Vu32)", "Vdd32.uw=vunpack(Vu32.uh)", "Unpack halves with zero-extend",   fVARRAY_ELEMENT_ACCESS(VddV, uw, i)  = fZE16_32(VuV.uh[i]))
380ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackh,  "Vdd32=vunpackh(Vu32)",  "Vdd32.w=vunpack(Vu32.h)",   "Unpack halves with sign-extend",   fVARRAY_ELEMENT_ACCESS(VddV, w,  i)  = fSE16_32(VuV.h[i] ))
381
382ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(8, vunpackob, "Vxx32|=vunpackob(Vu32)", "Vxx32.h|=vunpacko(Vu32.b)", "Unpack byte to odd bytes ",       fVARRAY_ELEMENT_ACCESS(VxxV, uh, i) |= fZE8_16( VuV.ub[i])<<8)
383ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(16,vunpackoh, "Vxx32|=vunpackoh(Vu32)", "Vxx32.w|=vunpacko(Vu32.h)", "Unpack halves to odd halves",     fVARRAY_ELEMENT_ACCESS(VxxV, uw, i) |= fZE16_32(VuV.uh[i])<<16)
384
385
386/**************************************************************
387* Pack elements and cross lane permute
388***************************************************************/
389
390 ITERATOR_INSN2_PERMUTE_SLOT(16, vpackeb,  "Vd32=vpackeb(Vu32,Vv32)", "Vd32.b=vpacke(Vu32.h,Vv32.h)",
391 "Pack  bytes",
392    VdV.ub[i]               = fGETUBYTE(0, VvV.uh[i]);
393    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(0, VuV.uh[i]))
394
395 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackeh,  "Vd32=vpackeh(Vu32,Vv32)", "Vd32.h=vpacke(Vu32.w,Vv32.w)",
396 "Pack  halfwords",
397    VdV.uh[i]               = fGETUHALF(0, VvV.uw[i]);
398    VdV.uh[i+fVELEM(32)]    = fGETUHALF(0, VuV.uw[i]))
399
400  ITERATOR_INSN2_PERMUTE_SLOT(16, vpackob,  "Vd32=vpackob(Vu32,Vv32)", "Vd32.b=vpacko(Vu32.h,Vv32.h)",
401 "Pack  bytes",
402    VdV.ub[i]               = fGETUBYTE(1, VvV.uh[i]);
403    VdV.ub[i+fVELEM(16)]    = fGETUBYTE(1, VuV.uh[i]))
404
405 ITERATOR_INSN2_PERMUTE_SLOT(32, vpackoh,  "Vd32=vpackoh(Vu32,Vv32)", "Vd32.h=vpacko(Vu32.w,Vv32.w)",
406 "Pack  halfwords",
407    VdV.uh[i]               = fGETUHALF(1, VvV.uw[i]);
408    VdV.uh[i+fVELEM(32)]    = fGETUHALF(1, VuV.uw[i]))
409
410
411
412ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhub_sat,  "Vd32=vpackhub(Vu32,Vv32):sat", "Vd32.ub=vpack(Vu32.h,Vv32.h):sat",
413 "Pack ubytes with saturation",
414    VdV.ub[i]               = fVSATUB(VvV.h[i]);
415    VdV.ub[i+fVELEM(16)]    = fVSATUB(VuV.h[i]))
416
417
418ITERATOR_INSN2_PERMUTE_SLOT(16, vpackhb_sat,  "Vd32=vpackhb(Vu32,Vv32):sat", "Vd32.b=vpack(Vu32.h,Vv32.h):sat",
419 "Pack bytes with saturation",
420    VdV.b[i]               = fVSATB(VvV.h[i]);
421    VdV.b[i+fVELEM(16)]    = fVSATB(VuV.h[i]))
422
423
424ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwuh_sat,  "Vd32=vpackwuh(Vu32,Vv32):sat", "Vd32.uh=vpack(Vu32.w,Vv32.w):sat",
425 "Pack ubytes with saturation",
426    VdV.uh[i]               = fVSATUH(VvV.w[i]);
427    VdV.uh[i+fVELEM(32)]    = fVSATUH(VuV.w[i]))
428
429ITERATOR_INSN2_PERMUTE_SLOT(32, vpackwh_sat,  "Vd32=vpackwh(Vu32,Vv32):sat", "Vd32.h=vpack(Vu32.w,Vv32.w):sat",
430 "Pack bytes with saturation",
431    VdV.h[i]               = fVSATH(VvV.w[i]);
432    VdV.h[i+fVELEM(32)]    = fVSATH(VuV.w[i]))
433
434
435
436
437
438/**************************************************************
439* Zero/Sign Extend with in-lane permute
440***************************************************************/
441
442ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vzb,"Vdd32=vzxtb(Vu32)","Vdd32.uh=vzxt(Vu32.ub)",
443"Vector Zero Extend Bytes",
444    VddV.v[0].uh[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i]));
445    VddV.v[1].uh[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])))
446
447ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vsb,"Vdd32=vsxtb(Vu32)","Vdd32.h=vsxt(Vu32.b)",
448"Vector Sign Extend Bytes",
449    VddV.v[0].h[i] = fSE8_16(fGETBYTE(0, VuV.h[i]));
450    VddV.v[1].h[i] = fSE8_16(fGETBYTE(1, VuV.h[i])))
451
452ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vzh,"Vdd32=vzxth(Vu32)","Vdd32.uw=vzxt(Vu32.uh)",
453"Vector Zero Extend halfwords",
454    VddV.v[0].uw[i] = fZE16_32(fGETUHALF(0, VuV.uw[i]));
455    VddV.v[1].uw[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])))
456
457ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vsh,"Vdd32=vsxth(Vu32)","Vdd32.w=vsxt(Vu32.h)",
458"Vector Sign Extend halfwords",
459    VddV.v[0].w[i] = fSE16_32(fGETHALF(0, VuV.w[i]));
460    VddV.v[1].w[i] = fSE16_32(fGETHALF(1, VuV.w[i])))
461
462
463/**********************************************************************
464*
465*
466*
467*               MMVECTOR REDUCTION
468*
469*
470*
471**********************************************************************/
472
473/********************************************
474*  2-WAY REDUCTION - UNSIGNED BYTE BY BYTE
475********************************************/
476
477
478ITERATOR_INSN2_MPY_SLOT(16,vdmpybus,"Vd32=vdmpybus(Vu32,Rt32)","Vd32.h=vdmpy(Vu32.ub,Rt32.b)",
479"Vector Dual Multiply-Accumulates unsigned bytes by bytes",
480    VdV.h[i]   = fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
481    VdV.h[i]  += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
482
483ITERATOR_INSN2_MPY_SLOT(16,vdmpybus_acc,"Vx32+=vdmpybus(Vu32,Rt32)","Vx32.h+=vdmpy(Vu32.ub,Rt32.b)",
484"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate",
485    VxV.h[i] += fMPY8US( fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i) % 4, RtV));
486    VxV.h[i] += fMPY8US( fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
487
488
489
490ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv,"Vdd32=vdmpybus(Vuu32,Rt32)","Vdd32.h=vdmpy(Vuu32.ub,Rt32.b)",
491"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
492    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
493    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
494
495    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
496    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
497
498ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vdmpybus_dv_acc,"Vxx32+=vdmpybus(Vuu32,Rt32)","Vxx32.h+=vdmpy(Vuu32.ub,Rt32.b)",
499"Vector Dual Multiply-Accumulates unsigned bytes by  bytes, and accumulate Sliding Window Reduction",
500    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
501    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i+1)%4, RtV));
502
503    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]),fGETBYTE((2*i) % 4, RtV));
504    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]),fGETBYTE((2*i+1)%4, RtV)))
505
506
507
508/********************************************
509*  2-WAY REDUCTION - HALF BY BYTE
510********************************************/
511ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb,"Vd32=vdmpyhb(Vu32,Rt32)","Vd32.w=vdmpy(Vu32.h,Rt32.b)",
512"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
513    VdV.w[i]  = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
514    VdV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
515
516ITERATOR_INSN2_MPY_SLOT(32,vdmpyhb_acc,"Vx32+=vdmpyhb(Vu32,Rt32)","Vx32.w+=vdmpy(Vu32.h,Rt32.b)",
517"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
518    VxV.w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETBYTE((2*i+0)%4, RtV));
519    VxV.w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETBYTE((2*i+1)%4, RtV)))
520
521
522
523ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv,"Vdd32=vdmpyhb(Vuu32,Rt32)","Vdd32.w=vdmpy(Vuu32.h,Rt32.b)",
524"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
525    VddV.v[0].w[i]  = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
526    VddV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
527
528    VddV.v[1].w[i]  = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
529    VddV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
530
531
532ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhb_dv_acc,"Vxx32+=vdmpyhb(Vuu32,Rt32)","Vxx32.w+=vdmpy(Vuu32.h,Rt32.b)",
533"Dual-Vector 2-Element Half x Byte Reduction with Sliding Window Overlap",
534    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
535    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+1)%4, RtV));
536
537    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]),fGETBYTE((2*i+0)%4, RtV));
538    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]),fGETBYTE((2*i+1)%4, RtV)))
539
540
541
542
543
544/********************************************
545*  2-WAY REDUCTION - HALF BY HALF
546********************************************/
547
548ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat,"Vd32=vdmpyh(Vu32,Vv32):sat","Vd32.w=vdmpy(Vu32.h,Vv32.h):sat",
549"Vector halfword multiply, accumulate pairs, sat to word",
550    fHIDE(size8s_t accum;)
551    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
552    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
553    VdV.w[i] = fVSATW(accum))
554
555ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhvsat_acc,"Vx32+=vdmpyh(Vu32,Vv32):sat","Vx32.w+=vdmpy(Vu32.h,Vv32.h):sat",
556"Vector halfword multiply, accumulate pairs, sat to word",
557    fHIDE(size8s_t accum;)
558    accum    = fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0, VvV.w[i]));
559    accum   += fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1, VvV.w[i]));
560    VxV.w[i] = fVSATW(VxV.w[i]+accum))
561
562
563/* VDMPYH */
564
565ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat,"Vd32=vdmpyh(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.h):sat",
566"Vector halfword multiply, accumulate pairs, saturate to word",
567    fHIDE(size8s_t accum;)
568    accum    = fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
569    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
570    VdV.w[i] = fVSATW(accum))
571
572ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsat_acc,"Vx32+=vdmpyh(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.h):sat",
573"Vector halfword multiply, accumulate pairs, saturate to word",
574    fHIDE(size8s_t) accum = VxV.w[i];
575    accum   += fMPY16SS(fGETHALF(0, VuV.w[i]),fGETHALF(0, RtV));
576    accum   += fMPY16SS(fGETHALF(1, VuV.w[i]),fGETHALF(1, RtV));
577    VxV.w[i] = fVSATW(accum))
578
579
580
581
582ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat,"Vd32=vdmpyh(Vuu32,Rt32):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.h):sat",
583"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
584    fHIDE(size8s_t accum;)
585    accum    = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
586    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
587    VdV.w[i] = fVSATW(accum))
588
589ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhisat_acc,"Vx32+=vdmpyh(Vuu32,Rt32):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.h):sat",
590"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
591    fHIDE(size8s_t) accum = VxV.w[i];
592    accum   += fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]),fGETHALF(0,RtV));
593    accum   += fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]),fGETHALF(1,RtV));
594    VxV.w[i] = fVSATW(accum))
595
596
597
598
599
600
601
602/* VDMPYHSU */
603ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat,"Vd32=vdmpyhsu(Vu32,Rt32):sat","Vd32.w=vdmpy(Vu32.h,Rt32.uh):sat",
604"Vector halfword multiply, accumulate pairs, saturate to word",
605    fHIDE(size8s_t accum;)
606    accum    = fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
607    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
608    VdV.w[i] = fVSATW(accum))
609
610ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsusat_acc,"Vx32+=vdmpyhsu(Vu32,Rt32):sat","Vx32.w+=vdmpy(Vu32.h,Rt32.uh):sat",
611"Vector halfword multiply, accumulate pairs, saturate to word",
612    fHIDE(size8s_t) accum=VxV.w[i];
613    accum   += fMPY16SU(fGETHALF(0, VuV.w[i]),fGETUHALF(0, RtV));
614    accum   += fMPY16SU(fGETHALF(1, VuV.w[i]),fGETUHALF(1, RtV));
615    VxV.w[i] = fVSATW(accum))
616
617
618
619ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat,"Vd32=vdmpyhsu(Vuu32,Rt32,#1):sat","Vd32.w=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
620"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with saturation",
621    fHIDE(size8s_t accum;)
622    accum    = fMPY16SU(fGETHALF(1,VuuV.v[0].w[i]),fGETUHALF(0,RtV));
623    accum   += fMPY16SU(fGETHALF(0,VuuV.v[1].w[i]),fGETUHALF(1,RtV));
624    VdV.w[i] = fVSATW(accum))
625
626ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdmpyhsuisat_acc,"Vx32+=vdmpyhsu(Vuu32,Rt32,#1):sat","Vx32.w+=vdmpy(Vuu32.h,Rt32.uh,#1):sat",
627"Dual Vector Signed Halfword by Signed Halfword 2-Way Reduction to Halfword with accumulation and saturation",
628    fHIDE(size8s_t) accum=VxV.w[i];
629    accum   += fMPY16SU(fGETHALF(1, VuuV.v[0].w[i]),fGETUHALF(0,RtV));
630    accum   += fMPY16SU(fGETHALF(0, VuuV.v[1].w[i]),fGETUHALF(1,RtV));
631    VxV.w[i] = fVSATW(accum))
632
633
634
635/********************************************
636*  3-WAY REDUCTION - UNSIGNED BYTE BY  BYTE
637********************************************/
638
639 ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb, "Vdd32=vtmpyb(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.b,Rt32.b)",
640"Dual Vector 3x1 Reduction",
641    VddV.v[0].h[i]  = fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
642    VddV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
643    VddV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
644
645    VddV.v[1].h[i]  = fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
646    VddV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
647    VddV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
648
649
650ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpyb_acc, "Vxx32+=vtmpyb(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.b,Rt32.b)",
651"Dual Vector 3x1 Reduction",
652    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
653    VxxV.v[0].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i+1)%4, RtV));
654    VxxV.v[0].h[i] += fGETBYTE(0,VuuV.v[1].h[i]);
655
656    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(1,VuuV.v[0].h[i]), fGETBYTE((2*i  )%4, RtV));
657    VxxV.v[1].h[i] += fMPY8SS(fGETBYTE(0,VuuV.v[1].h[i]), fGETBYTE((2*i+1)%4, RtV));
658    VxxV.v[1].h[i] += fGETBYTE(1,VuuV.v[1].h[i]))
659
660
661
662ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus, "Vdd32=vtmpybus(Vuu32,Rt32)", "Vdd32.h=vtmpy(Vuu32.ub,Rt32.b)",
663"Dual Vector 3x1 Reduction",
664    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
665    VddV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
666    VddV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
667
668    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
669    VddV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
670    VddV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
671
672ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vtmpybus_acc, "Vxx32+=vtmpybus(Vuu32,Rt32)", "Vxx32.h+=vtmpy(Vuu32.ub,Rt32.b)",
673"Dual Vector 3x1 Reduction",
674    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
675    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i+1)%4, RtV));
676    VxxV.v[0].h[i] += fGETUBYTE(0,VuuV.v[1].uh[i]);
677
678    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1,VuuV.v[0].uh[i]), fGETBYTE((2*i  )%4, RtV));
679    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(0,VuuV.v[1].uh[i]), fGETBYTE((2*i+1)%4, RtV));
680    VxxV.v[1].h[i] += fGETUBYTE(1,VuuV.v[1].uh[i]))
681
682
683ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb, "Vdd32=vtmpyhb(Vuu32,Rt32)", "Vdd32.w=vtmpy(Vuu32.h,Rt32.b)",
684"Dual Vector 3x1 Reduction",
685    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
686    VddV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
687    VddV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
688
689    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
690    VddV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
691    VddV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
692
693ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vtmpyhb_acc, "Vxx32+=vtmpyhb(Vuu32,Rt32)", "Vxx32.w+=vtmpy(Vuu32.h,Rt32.b)",
694"Dual Vector 3x1 Reduction",
695    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
696    VxxV.v[0].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
697    VxxV.v[0].w[i]+= fGETHALF(0,VuuV.v[1].w[i]);
698
699    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(1,VuuV.v[0].w[i]), fSE8_16(fGETBYTE((2*i+0)%4, RtV)));
700    VxxV.v[1].w[i]+= fMPY16SS(fGETHALF(0,VuuV.v[1].w[i]), fSE8_16(fGETBYTE((2*i+1)%4, RtV)));
701    VxxV.v[1].w[i]+= fGETHALF(1,VuuV.v[1].w[i]))
702
703
704/********************************************
705*  4-WAY REDUCTION - UNSIGNED BYTE BY UNSIGNED BYTE
706********************************************/
707
708
709
710ITERATOR_INSN2_MPY_SLOT(32,vrmpyub,"Vd32=vrmpyub(Vu32,Rt32)","Vd32.uw=vrmpy(Vu32.ub,Rt32.ub)",
711"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
712    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
713    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
714    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
715    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
716
717ITERATOR_INSN2_MPY_SLOT(32,vrmpyub_acc,"Vx32+=vrmpyub(Vu32,Rt32)","Vx32.uw+=vrmpy(Vu32.ub,Rt32.ub)",
718"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
719    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,RtV));
720    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,RtV));
721    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,RtV));
722    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,RtV)))
723
724
725ITERATOR_INSN2_MPY_SLOT(32,vrmpyubv,"Vd32=vrmpyub(Vu32,Vv32)","Vd32.uw=vrmpy(Vu32.ub,Vv32.ub)",
726"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
727    VdV.uw[i]  = fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
728    VdV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
729    VdV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
730    VdV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
731
732ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubv_acc,"Vx32+=vrmpyub(Vu32,Vv32)","Vx32.uw+=vrmpy(Vu32.ub,Vv32.ub)",
733"Vector Multiply-Accumulate Reduce with 4 byte coefficients Accumulate",
734    VxV.uw[i] += fMPY8UU(fGETUBYTE(0,VuV.uw[i]), fGETUBYTE(0,VvV.uw[i]));
735    VxV.uw[i] += fMPY8UU(fGETUBYTE(1,VuV.uw[i]), fGETUBYTE(1,VvV.uw[i]));
736    VxV.uw[i] += fMPY8UU(fGETUBYTE(2,VuV.uw[i]), fGETUBYTE(2,VvV.uw[i]));
737    VxV.uw[i] += fMPY8UU(fGETUBYTE(3,VuV.uw[i]), fGETUBYTE(3,VvV.uw[i])))
738
739ITERATOR_INSN2_MPY_SLOT(32,vrmpybv,"Vd32=vrmpyb(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.b,Vv32.b)",
740"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
741    VdV.w[i]  = fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
742    VdV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
743    VdV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
744    VdV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
745
746
747ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybv_acc,"Vx32+=vrmpyb(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.b,Vv32.b)",
748"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
749    VxV.w[i] += fMPY8SS(fGETBYTE(0, VuV.w[i]), fGETBYTE(0, VvV.w[i]));
750    VxV.w[i] += fMPY8SS(fGETBYTE(1, VuV.w[i]), fGETBYTE(1, VvV.w[i]));
751    VxV.w[i] += fMPY8SS(fGETBYTE(2, VuV.w[i]), fGETBYTE(2, VvV.w[i]));
752    VxV.w[i] += fMPY8SS(fGETBYTE(3, VuV.w[i]), fGETBYTE(3, VvV.w[i])))
753
754
755ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi,"Vdd32=vrmpyub(Vuu32,Rt32,#u1)","Vdd32.uw=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
756"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word",
757    VddV.v[0].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
758    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
759    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
760    VddV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
761
762    VddV.v[1].uw[i]  = fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
763    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
764    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
765    VddV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
766
767
768ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpyubi_acc,"Vxx32+=vrmpyub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrmpy(Vuu32.ub,Rt32.ub,#u1)",
769"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word",
770    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
771    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV));
772    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
773    VxxV.v[0].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
774
775    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETUBYTE((2-uiV) & 0x3,RtV));
776    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETUBYTE((3-uiV) & 0x3,RtV));
777    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETUBYTE((0-uiV) & 0x3,RtV));
778    VxxV.v[1].uw[i] += fMPY8UU(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETUBYTE((1-uiV) & 0x3,RtV)))
779
780
781
782
783/********************************************
784*  4-WAY REDUCTION - UNSIGNED BYTE BY  BYTE
785********************************************/
786
787ITERATOR_INSN2_MPY_SLOT(32,vrmpybus,"Vd32=vrmpybus(Vu32,Rt32)","Vd32.w=vrmpy(Vu32.ub,Rt32.b)",
788"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
789    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
790    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
791    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
792    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
793
794
795ITERATOR_INSN2_MPY_SLOT(32,vrmpybus_acc,"Vx32+=vrmpybus(Vu32,Rt32)","Vx32.w+=vrmpy(Vu32.ub,Rt32.b)",
796"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
797    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,RtV));
798    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,RtV));
799    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,RtV));
800    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,RtV)))
801
802
803ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi,"Vdd32=vrmpybus(Vuu32,Rt32,#u1)","Vdd32.w=vrmpy(Vuu32.ub,Rt32.b,#u1)",
804"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction to Word",
805    VddV.v[0].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
806    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
807    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
808    VddV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
809
810    VddV.v[1].w[i]  = fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
811    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
812    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
813    VddV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
814
815
816ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusi_acc,"Vxx32+=vrmpybus(Vuu32,Rt32,#u1)","Vxx32.w+=vrmpy(Vuu32.ub,Rt32.b,#u1)",
817"Dual Vector Unsigned Byte By Signed Byte 4-way Reduction with accumulate and saturation to Word",
818    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
819    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV));
820    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[0        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
821    VxxV.v[0].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
822
823    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(0, VuuV.v[1        ].uw[i]),fGETBYTE((2-uiV) & 0x3,RtV));
824    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(1, VuuV.v[1        ].uw[i]),fGETBYTE((3-uiV) & 0x3,RtV));
825    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(2, VuuV.v[uiV ? 1:0].uw[i]),fGETBYTE((0-uiV) & 0x3,RtV));
826    VxxV.v[1].w[i] += fMPY8US(fGETUBYTE(3, VuuV.v[0        ].uw[i]),fGETBYTE((1-uiV) & 0x3,RtV)))
827
828
829
830
831ITERATOR_INSN2_MPY_SLOT(32,vrmpybusv,"Vd32=vrmpybus(Vu32,Vv32)","Vd32.w=vrmpy(Vu32.ub,Vv32.b)",
832"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
833    VdV.w[i]  = fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
834    VdV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
835    VdV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
836    VdV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
837
838
839ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrmpybusv_acc,"Vx32+=vrmpybus(Vu32,Vv32)","Vx32.w+=vrmpy(Vu32.ub,Vv32.b)",
840"Vector Multiply-Accumulate Reduce with 4 byte coefficients",
841    VxV.w[i] += fMPY8US(fGETUBYTE(0,VuV.uw[i]), fGETBYTE(0,VvV.w[i]));
842    VxV.w[i] += fMPY8US(fGETUBYTE(1,VuV.uw[i]), fGETBYTE(1,VvV.w[i]));
843    VxV.w[i] += fMPY8US(fGETUBYTE(2,VuV.uw[i]), fGETBYTE(2,VvV.w[i]));
844    VxV.w[i] += fMPY8US(fGETUBYTE(3,VuV.uw[i]), fGETBYTE(3,VvV.w[i])))
845
846
847
848
849
850
851
852
853
854
855
856/********************************************
857*  2-WAY REDUCTION - SAD
858********************************************/
859
860ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh,"Vdd32=vdsaduh(Vuu32,Rt32)","Vdd32.uw=vdsad(Vuu32.uh,Rt32.uh)",
861"Dual Vector Halfword by Byte 4-Way Reduction to Word",
862    VddV.v[0].uw[i]  = fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
863    VddV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
864    VddV.v[1].uw[i]  = fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
865    VddV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
866
867ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vdsaduh_acc,"Vxx32+=vdsaduh(Vuu32,Rt32)","Vxx32.uw+=vdsad(Vuu32.uh,Rt32.uh)",
868"Dual Vector Halfword by Byte 4-Way Reduction to Word",
869    VxxV.v[0].uw[i] += fABS(fGETUHALF(0, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
870    VxxV.v[0].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(1,RtV));
871    VxxV.v[1].uw[i] += fABS(fGETUHALF(1, VuuV.v[0].uw[i]) - fGETUHALF(0,RtV));
872    VxxV.v[1].uw[i] += fABS(fGETUHALF(0, VuuV.v[1].uw[i]) - fGETUHALF(1,RtV)))
873
874
875
876
877/********************************************
878*  4-WAY REDUCTION - SAD
879********************************************/
880
881
882
883ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi,"Vdd32=vrsadub(Vuu32,Rt32,#u1)","Vdd32.uw=vrsad(Vuu32.ub,Rt32.ub,#u1)",
884"Dual Vector Halfword by Byte 4-Way Reduction to Word",
885    VddV.v[0].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
886    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
887    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
888    VddV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
889
890    VddV.v[1].uw[i]  = fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
891    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
892    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
893    VddV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
894
895ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vrsadubi_acc,"Vxx32+=vrsadub(Vuu32,Rt32,#u1)","Vxx32.uw+=vrsad(Vuu32.ub,Rt32.ub,#u1)",
896"Dual Vector Halfword by Byte 4-Way Reduction to Word",
897    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
898    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV)));
899    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
900    VxxV.v[0].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
901
902    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(0, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((2-uiV)&3,RtV)));
903    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(1, VuuV.v[1      ].uw[i])) - fZE8_16(fGETUBYTE((3-uiV)&3,RtV)));
904    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(2, VuuV.v[uiV?1:0].uw[i])) - fZE8_16(fGETUBYTE((0-uiV)&3,RtV)));
905    VxxV.v[1].uw[i] += fABS(fZE8_16(fGETUBYTE(3, VuuV.v[0      ].uw[i])) - fZE8_16(fGETUBYTE((1-uiV)&3,RtV))))
906
907
908
909
910
911
912
913
914
915
916/*********************************************************************
917 * MMVECTOR SHIFTING
918 * ******************************************************************/
919// Macro to shift arithmetically left/right and by either RT or Vv
920
921#define V_SHIFT(TYPE, DESC, SIZE, LOGSIZE, CASTTYPE)   \
922ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE,   "Vd32=vasr" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    >> (RtV & (SIZE-1)))) \
923ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE,   "Vd32=vasl" #TYPE "(Vu32,Rt32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Rt32)",         "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = (VuV.TYPE[i]    << (RtV & (SIZE-1)))) \
924ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE,   "Vd32=vlsr" #TYPE "(Vu32,Rt32)","Vd32.u"#TYPE"=vlsr(Vu32.u"#TYPE",Rt32)",       "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = (VuV.u##TYPE[i] >> (RtV & (SIZE-1)))) \
925ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasr##TYPE##v,"Vd32=vasr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasr(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift right " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTR(VuV.TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
926ITERATOR_INSN2_SHIFT_SLOT(SIZE,vasl##TYPE##v,"Vd32=vasl" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vasl(Vu32."#TYPE",Vv32."#TYPE")", "Vector arithmetic shift left  " DESC,    VdV.TYPE[i]     = fBIDIR_ASHIFTL(VuV.TYPE[i],  fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
927ITERATOR_INSN2_SHIFT_SLOT(SIZE,vlsr##TYPE##v,"Vd32=vlsr" #TYPE "(Vu32,Vv32)","Vd32."#TYPE"=vlsr(Vu32."#TYPE",Vv32."#TYPE")", "Vector logical shift right "    DESC,    VdV.u##TYPE[i]  = fBIDIR_LSHIFTR(VuV.u##TYPE[i], fSXTN((LOGSIZE+1),SIZE,VvV.TYPE[i]),CASTTYPE)) \
928
929V_SHIFT(w, "word",   32,5,4_4)
930V_SHIFT(h, "halfword", 16,4,2_2)
931
932ITERATOR_INSN_SHIFT_SLOT(8,vlsrb,"Vd32.ub=vlsr(Vu32.ub,Rt32)","vec log shift right bytes", VdV.b[i] = VuV.ub[i] >> (RtV & 0x7))
933
934ITERATOR_INSN2_SHIFT_SLOT(32,vrotr,"Vd32=vrotr(Vu32,Vv32)","Vd32.uw=vrotr(Vu32.uw,Vv32.uw)","Vector word rotate right", VdV.uw[i] = ((VuV.uw[i] >> (VvV.uw[i] & 0x1f)) | (VuV.uw[i] << (32 - (VvV.uw[i] & 0x1f)))))
935
936/*********************************************************************
937 * MMVECTOR SHIFT AND PERMUTE
938 * ******************************************************************/
939
940ITERATOR_INSN2_PERMUTE_SLOT_DOUBLE_VEC(32,vasr_into,"Vxx32=vasrinto(Vu32,Vv32)","Vxx32.w=vasrinto(Vu32.w,Vv32.w)","ASR vector 1 elements and overlay dropping bits to MSB of vector 2 elements",
941    fHIDE(int64_t ) shift = (fSE32_64(VuV.w[i]) << 32);
942    fHIDE(int64_t ) mask  = (((fSE32_64(VxxV.v[0].w[i])) << 32) | fZE32_64(VxxV.v[0].w[i]));
943    fHIDE(int64_t) lomask = (((fSE32_64(1)) << 32) - 1);
944    fHIDE(int ) count = -(0x40 & VvV.w[i]) + (VvV.w[i] & 0x3f);
945    fHIDE(int64_t ) result = (count == -0x40) ? 0 : (((count < 0) ? ((shift << -(count)) | (mask & (lomask << -(count)))) : ((shift >> count) | (mask & (lomask >> count)))));
946    VxxV.v[1].w[i] = ((result >> 32) & 0xffffffff);
947    VxxV.v[0].w[i] = (result & 0xffffffff))
948
949#define NEW_NARROWING_SHIFT 1
950
951#if NEW_NARROWING_SHIFT
952#define NARROWING_SHIFT(ITERSIZE,TAG,DSTM,DSTTYPE,SRCTYPE,SYNOPTS,SATFUNC,RNDFUNC,SHAMTMASK) \
953ITERATOR_INSN_SHIFT_SLOT(ITERSIZE,TAG, \
954"Vd32." #DSTTYPE "=vasr(Vu32." #SRCTYPE ",Vv32." #SRCTYPE ",Rt8)" #SYNOPTS, \
955"Vector shift right and shuffle", \
956    fHIDE(int )shamt = RtV & SHAMTMASK; \
957    DSTM(0,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VvV.SRCTYPE[i],shamt) >> shamt)); \
958    DSTM(1,VdV.SRCTYPE[i],SATFUNC(RNDFUNC(VuV.SRCTYPE[i],shamt) >> shamt)))
959
960
961
962
963
964/* WORD TO HALF*/
965
966NARROWING_SHIFT(32,vasrwh,fSETHALF,h,w,,fECHO,fVNOROUND,0xF)
967NARROWING_SHIFT(32,vasrwhsat,fSETHALF,h,w,:sat,fVSATH,fVNOROUND,0xF)
968NARROWING_SHIFT(32,vasrwhrndsat,fSETHALF,h,w,:rnd:sat,fVSATH,fVROUND,0xF)
969NARROWING_SHIFT(32,vasrwuhrndsat,fSETHALF,uh,w,:rnd:sat,fVSATUH,fVROUND,0xF)
970NARROWING_SHIFT(32,vasrwuhsat,fSETHALF,uh,w,:sat,fVSATUH,fVNOROUND,0xF)
971NARROWING_SHIFT(32,vasruwuhrndsat,fSETHALF,uh,uw,:rnd:sat,fVSATUH,fVROUND,0xF)
972
973NARROWING_SHIFT_NOV1(32,vasruwuhsat,fSETHALF,uh,uw,:sat,fVSATUH,fVNOROUND,0xF)
974NARROWING_SHIFT(16,vasrhubsat,fSETBYTE,ub,h,:sat,fVSATUB,fVNOROUND,0x7)
975NARROWING_SHIFT(16,vasrhubrndsat,fSETBYTE,ub,h,:rnd:sat,fVSATUB,fVROUND,0x7)
976NARROWING_SHIFT(16,vasrhbsat,fSETBYTE,b,h,:sat,fVSATB,fVNOROUND,0x7)
977NARROWING_SHIFT(16,vasrhbrndsat,fSETBYTE,b,h,:rnd:sat,fVSATB,fVROUND,0x7)
978
979NARROWING_SHIFT_NOV1(16,vasruhubsat,fSETBYTE,ub,uh,:sat,fVSATUB,fVNOROUND,0x7)
980NARROWING_SHIFT_NOV1(16,vasruhubrndsat,fSETBYTE,ub,uh,:rnd:sat,fVSATUB,fVROUND,0x7)
981
982#else
983ITERATOR_INSN2_SHIFT_SLOT(32,vasrwh,"Vd32=vasrwh(Vu32,Vv32,Rt8)","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8)",
984"Vector arithmetic shift right words, shuffle even halfwords",
985    fSETHALF(0,VdV.w[i], (VvV.w[i] >> (RtV & 0xF)));
986    fSETHALF(1,VdV.w[i], (VuV.w[i] >> (RtV & 0xF))))
987
988
989ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):sat",
990"Vector arithmetic shift right words, shuffle even halfwords",
991    fSETHALF(0,VdV.w[i], fVSATH(VvV.w[i] >> (RtV & 0xF)));
992    fSETHALF(1,VdV.w[i], fVSATH(VuV.w[i] >> (RtV & 0xF))))
993
994ITERATOR_INSN2_SHIFT_SLOT(32,vasrwhrndsat,"Vd32=vasrwh(Vu32,Vv32,Rt8):rnd:sat","Vd32.h=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
995"Vector arithmetic shift right words, shuffle even halfwords",
996    fHIDE(int ) shamt = RtV & 0xF;
997    fSETHALF(0,VdV.w[i], fVSATH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
998    fSETHALF(1,VdV.w[i], fVSATH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
999
1000ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhrndsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat",
1001"Vector arithmetic shift right words, shuffle even halfwords",
1002    fHIDE(int ) shamt = RtV & 0xF;
1003    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1004    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.w[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1005
1006ITERATOR_INSN2_SHIFT_SLOT(32,vasrwuhsat,"Vd32=vasrwuh(Vu32,Vv32,Rt8):sat","Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):sat",
1007"Vector arithmetic shift right words, shuffle even halfwords",
1008    fSETHALF(0, VdV.uw[i], fVSATUH(VvV.w[i] >> (RtV & 0xF)));
1009    fSETHALF(1, VdV.uw[i], fVSATUH(VuV.w[i] >> (RtV & 0xF))))
1010
1011ITERATOR_INSN2_SHIFT_SLOT(32,vasruwuhrndsat,"Vd32=vasruwuh(Vu32,Vv32,Rt8):rnd:sat","Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat",
1012"Vector arithmetic shift right words, shuffle even halfwords",
1013    fHIDE(int ) shamt = RtV & 0xF;
1014    fSETHALF(0,VdV.w[i], fVSATUH(  (VvV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt));
1015    fSETHALF(1,VdV.w[i], fVSATUH(  (VuV.uw[i] + fBIDIR_ASHIFTL(1,(shamt-1),4_8) ) >> shamt)))
1016#endif
1017
1018
1019
1020ITERATOR_INSN2_SHIFT_SLOT(32,vroundwh,"Vd32=vroundwh(Vu32,Vv32):sat","Vd32.h=vround(Vu32.w,Vv32.w):sat",
1021"Vector round words to halves, shuffle resultant halfwords",
1022    fSETHALF(0, VdV.uw[i], fVSATH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1023    fSETHALF(1, VdV.uw[i], fVSATH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1024
1025ITERATOR_INSN2_SHIFT_SLOT(32,vroundwuh,"Vd32=vroundwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.w,Vv32.w):sat",
1026"Vector round words to halves, shuffle resultant halfwords",
1027    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.w[i] + fCONSTLL(0x8000)) >> 16));
1028    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.w[i] + fCONSTLL(0x8000)) >> 16)))
1029
1030ITERATOR_INSN2_SHIFT_SLOT(32,vrounduwuh,"Vd32=vrounduwuh(Vu32,Vv32):sat","Vd32.uh=vround(Vu32.uw,Vv32.uw):sat",
1031"Vector round words to halves, shuffle resultant halfwords",
1032    fSETHALF(0, VdV.uw[i], fVSATUH((VvV.uw[i] + fCONSTLL(0x8000)) >> 16));
1033    fSETHALF(1, VdV.uw[i], fVSATUH((VuV.uw[i] + fCONSTLL(0x8000)) >> 16)))
1034
1035
1036
1037
1038
1039/* HALF TO BYTE*/
1040
1041ITERATOR_INSN2_SHIFT_SLOT(16,vroundhb,"Vd32=vroundhb(Vu32,Vv32):sat","Vd32.b=vround(Vu32.h,Vv32.h):sat",
1042"Vector round words to halves, shuffle resultant halfwords",
1043    fSETBYTE(0, VdV.uh[i], fVSATB((VvV.h[i] + 0x80) >> 8));
1044    fSETBYTE(1, VdV.uh[i], fVSATB((VuV.h[i] + 0x80) >> 8)))
1045
1046ITERATOR_INSN2_SHIFT_SLOT(16,vroundhub,"Vd32=vroundhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.h,Vv32.h):sat",
1047"Vector round words to halves, shuffle resultant halfwords",
1048    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.h[i] + 0x80) >> 8));
1049    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.h[i] + 0x80) >> 8)))
1050
1051ITERATOR_INSN2_SHIFT_SLOT(16,vrounduhub,"Vd32=vrounduhub(Vu32,Vv32):sat","Vd32.ub=vround(Vu32.uh,Vv32.uh):sat",
1052"Vector round words to halves, shuffle resultant halfwords",
1053    fSETBYTE(0, VdV.uh[i], fVSATUB((VvV.uh[i] + 0x80) >> 8));
1054    fSETBYTE(1, VdV.uh[i], fVSATUB((VuV.uh[i] + 0x80) >> 8)))
1055
1056
1057ITERATOR_INSN2_SHIFT_SLOT(32,vaslw_acc,"Vx32+=vaslw(Vu32,Rt32)","Vx32.w+=vasl(Vu32.w,Rt32)",
1058"Vector shift add word",
1059    VxV.w[i]  +=  (VuV.w[i] << (RtV & (32-1))))
1060
1061ITERATOR_INSN2_SHIFT_SLOT(32,vasrw_acc,"Vx32+=vasrw(Vu32,Rt32)","Vx32.w+=vasr(Vu32.w,Rt32)",
1062"Vector shift add word",
1063    VxV.w[i]  +=  (VuV.w[i] >> (RtV & (32-1))))
1064
1065ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vaslh_acc,"Vx32+=vaslh(Vu32,Rt32)","Vx32.h+=vasl(Vu32.h,Rt32)",
1066"Vector shift add halfword",
1067    VxV.h[i]  +=  (VuV.h[i] << (RtV & (16-1))))
1068
1069ITERATOR_INSN2_SHIFT_SLOT_NOV1(16,vasrh_acc,"Vx32+=vasrh(Vu32,Rt32)","Vx32.h+=vasr(Vu32.h,Rt32)",
1070"Vector shift add halfword",
1071    VxV.h[i]  +=  (VuV.h[i] >> (RtV & (16-1))))
1072
1073/**************************************************************************
1074*
1075* MMVECTOR ELEMENT-WISE ARITHMETIC
1076*
1077**************************************************************************/
1078
1079/**************************************************************************
1080* MACROS GO IN MACROS.DEF NOT HERE!!!
1081**************************************************************************/
1082
1083
1084#define MMVEC_ABSDIFF(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1085ITERATOR_INSN2_MPY_SLOT(WIDTH, vabsdiff##TYPE,                   "Vd32=vabsdiff"TYPE2"(Vu32,Vv32)" ,"Vd32."#DEST"=vabsdiff(Vu32."#SRC",Vv32."#SRC")" ,     "Vector Absolute of Difference "DESCR,   VdV.DEST[i] = (VuV.SRC[i] > VvV.SRC[i]) ? (VuV.SRC[i] - VvV.SRC[i]) : (VvV.SRC[i] - VuV.SRC[i]))
1086
1087#define MMVEC_ADDU_SAT(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1088ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVUADDSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
1089ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUADDSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\
1090ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVUSUBSAT(WIDTH,  VuV.SRC[i], VvV.SRC[i]))\
1091ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[0].SRC[i],VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVUSUBSAT(WIDTH, VuuV.v[1].SRC[i],VvvV.v[1].SRC[i]))\
1092
1093#define MMVEC_ADDS_SAT(TYPE,TYPE2,DESCR, WIDTH,DEST,SRC)\
1094ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE##sat,                  "Vd32=vadd"TYPE2"(Vu32,Vv32):sat" ,    "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVSADDSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
1095ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##sat_dv,    "Vdd32=vadd"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSADDSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\
1096ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE##sat,                  "Vd32=vsub"TYPE2"(Vu32,Vv32):sat",     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC"):sat",    "Vector Add & Saturate "DESCR,            VdV.DEST[i] = fVSSUBSAT(WIDTH,  VuV.SRC[i],  VvV.SRC[i]))\
1097ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##sat_dv,    "Vdd32=vsub"TYPE2"(Vuu32,Vvv32):sat",  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC"):sat", "Double Vector Add & Saturate "DESCR,    VddV.v[0].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[0].SRC[i], VvvV.v[0].SRC[i]); VddV.v[1].DEST[i] = fVSSUBSAT(WIDTH, VuuV.v[1].SRC[i], VvvV.v[1].SRC[i]))\
1098
1099#define MMVEC_AVGU(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1100ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",        "Vector Average "DESCR,                                      VdV.DEST[i] = fVAVGU(   WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1101ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",     "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",    "Vector Average % Round"DESCR,                               VdV.DEST[i] = fVAVGURND(WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1102
1103
1104
1105#define MMVEC_AVGS(TYPE,TYPE2,DESCR, WIDTH, DEST,SRC)\
1106ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE,                        "Vd32=vavg"TYPE2"(Vu32,Vv32)",          "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC")",          "Vector Average "DESCR,                                      VdV.DEST[i]  = fVAVGS(       WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1107ITERATOR_INSN2_ANY_SLOT(WIDTH,vavg##TYPE##rnd,                   "Vd32=vavg"TYPE2"(Vu32,Vv32):rnd",      "Vd32."#DEST"=vavg(Vu32."#SRC",Vv32."#SRC"):rnd",      "Vector Average % Round"DESCR,                               VdV.DEST[i]  = fVAVGSRND(    WIDTH,  VuV.SRC[i], VvV.SRC[i])) \
1108ITERATOR_INSN2_ANY_SLOT(WIDTH,vnavg##TYPE,                       "Vd32=vnavg"TYPE2"(Vu32,Vv32)",         "Vd32."#DEST"=vnavg(Vu32."#SRC",Vv32."#SRC")",         "Vector Negative Average "DESCR,                             VdV.DEST[i]  = fVNAVGS(      WIDTH,  VuV.SRC[i], VvV.SRC[i]))
1109
1110
1111
1112
1113
1114
1115
1116#define MMVEC_ADDWRAP(TYPE,TYPE2, DESCR, WIDTH , DEST,SRC)\
1117ITERATOR_INSN2_ANY_SLOT(WIDTH, vadd##TYPE,                  "Vd32=vadd"TYPE2"(Vu32,Vv32)" ,     "Vd32."#DEST"=vadd(Vu32."#SRC",Vv32."#SRC")",    "Vector Add "DESCR,          VdV.DEST[i] =  VuV.SRC[i] +  VvV.SRC[i])\
1118ITERATOR_INSN2_ANY_SLOT(WIDTH, vsub##TYPE,                  "Vd32=vsub"TYPE2"(Vu32,Vv32)" ,     "Vd32."#DEST"=vsub(Vu32."#SRC",Vv32."#SRC")",    "Vector Sub "DESCR,          VdV.DEST[i] =  VuV.SRC[i] -  VvV.SRC[i])\
1119ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vadd##TYPE##_dv,  "Vdd32=vadd"TYPE2"(Vuu32,Vvv32)" ,  "Vdd32."#DEST"=vadd(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Add "DESCR,   VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] + VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] + VvvV.v[1].SRC[i])\
1120ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(WIDTH, vsub##TYPE##_dv,  "Vdd32=vsub"TYPE2"(Vuu32,Vvv32)" ,  "Vdd32."#DEST"=vsub(Vuu32."#SRC",Vvv32."#SRC")", "Double Vector Sub "DESCR,   VddV.v[0].DEST[i] = VuuV.v[0].SRC[i] - VvvV.v[0].SRC[i]; VddV.v[1].DEST[i] = VuuV.v[1].SRC[i] - VvvV.v[1].SRC[i]) \
1121
1122
1123
1124
1125
1126/* Wrapping Adds */
1127MMVEC_ADDWRAP(b,    "b",    "Byte",         8,   b, b)
1128MMVEC_ADDWRAP(h,    "h",    "Halfword",     16,  h, h)
1129MMVEC_ADDWRAP(w,    "w",    "Word",         32,   w,    w)
1130
1131/* Saturating Adds */
1132MMVEC_ADDU_SAT(ub, "ub",    "Unsigned Byte",        8,   ub,    ub)
1133MMVEC_ADDU_SAT(uh, "uh",    "Unsigned Halfword",    16,  uh,    uh)
1134MMVEC_ADDU_SAT(uw, "uw",    "Unsigned word",    32,  uw,    uw)
1135MMVEC_ADDS_SAT(b,  "b",     "byte",             8,  b,     b)
1136MMVEC_ADDS_SAT(h,  "h",     "Halfword",             16,  h,     h)
1137MMVEC_ADDS_SAT(w,  "w",     "Word",                 32,  w,     w)
1138
1139
1140/* Averaging Instructions */
1141MMVEC_AVGU(ub,"ub",     "Unsigned Byte",     8,   ub,   ub)
1142MMVEC_AVGU(uh,"uh",     "Unsigned Halfword", 16,  uh,   uh)
1143MMVEC_AVGU_NOV1(uw,"uw",     "Unsigned Word",     32,  uw,   uw)
1144MMVEC_AVGS_NOV1(b,   "b",    "Byte",               8,   b,   b)
1145MMVEC_AVGS(h,   "h",    "Halfword",          16,   h,   h)
1146MMVEC_AVGS(w,   "w",    "Word",              32,   w,   w)
1147
1148
1149/* Absolute Difference */
1150MMVEC_ABSDIFF(ub,"ub",  "Unsigned Byte",        8,   ub,    ub)
1151MMVEC_ABSDIFF(uh,"uh",  "Unsigned Halfword",    16,  uh,    uh)
1152MMVEC_ABSDIFF(h,"h",        "Halfword",             16,  uh,    h)
1153MMVEC_ABSDIFF(w,"w",        "Word",                 32,  uw,    w)
1154
1155ITERATOR_INSN2_ANY_SLOT(8,vnavgub, "Vd32=vnavgub(Vu32,Vv32)", "Vd32.b=vnavg(Vu32.ub,Vv32.ub)",
1156"Vector Negative Average Unsigned Byte", VdV.b[i]   = fVNAVGU(8, VuV.ub[i], VvV.ub[i]))
1157
1158ITERATOR_INSN_ANY_SLOT(32,vaddcarrysat,"Vd32.w=vadd(Vu32.w,Vv32.w,Qs4):carry:sat","add w/carry and saturate",
1159VdV.w[i] = fVSATW(VuV.w[i]+VvV.w[i]+fGETQBIT(QsV,i*4)))
1160
1161ITERATOR_INSN_ANY_SLOT(32,vaddcarry,"Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1162VdV.w[i] = VuV.w[i]+VvV.w[i]+fGETQBIT(QxV,i*4);
1163fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],fGETQBIT(QxV,i*4))))
1164
1165ITERATOR_INSN_ANY_SLOT(32,vsubcarry,"Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry","add w/carry",
1166VdV.w[i] = VuV.w[i]+~VvV.w[i]+fGETQBIT(QxV,i*4);
1167fSETQBITS(QxV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],fGETQBIT(QxV,i*4))))
1168
1169ITERATOR_INSN_ANY_SLOT(32,vaddcarryo,"Vd32.w,Qe4=vadd(Vu32.w,Vv32.w):carry","add w/carry out-only",
1170VdV.w[i] = VuV.w[i]+VvV.w[i];
1171fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],VvV.w[i],0)))
1172
1173ITERATOR_INSN_ANY_SLOT(32,vsubcarryo,"Vd32.w,Qe4=vsub(Vu32.w,Vv32.w):carry","subtract w/carry out-only",
1174VdV.w[i] = VuV.w[i]+~VvV.w[i]+1;
1175fSETQBITS(QeV,4,0xF,4*i,-fCARRY_FROM_ADD32(VuV.w[i],~VvV.w[i],1)))
1176
1177
1178ITERATOR_INSN_ANY_SLOT(32,vsatdw,"Vd32.w=vsatdw(Vu32.w,Vv32.w)","Saturate from 64-bits (higher 32-bits come from first vector) to 32-bits",VdV.w[i] = fVSATDW(VuV.w[i],VvV.w[i]))
1179
1180
1181#define MMVEC_ADDSAT_MIX(TAGEND,SATF,WIDTH,DEST,SRC1,SRC2)\
1182ITERATOR_INSN_ANY_SLOT(WIDTH, vadd##TAGEND,"Vd32."#DEST"=vadd(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Add mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] +  VvV.SRC2[i]))\
1183ITERATOR_INSN_ANY_SLOT(WIDTH, vsub##TAGEND,"Vd32."#DEST"=vsub(Vu32."#SRC1",Vv32."#SRC2"):sat",    "Vector Sub mixed", VdV.DEST[i] =  SATF(VuV.SRC1[i] -  VvV.SRC2[i]))\
1184
1185MMVEC_ADDSAT_MIX(ububb_sat,fVSATUB,8,ub,ub,b)
1186
1187/****************************
1188*   WIDENING
1189****************************/
1190
1191
1192
1193
1194ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh,"Vdd32=vaddub(Vu32,Vv32)","Vdd32.h=vadd(Vu32.ub,Vv32.ub)",
1195"Vector addition with widen into two vectors",
1196    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) + fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1197    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) + fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1198
1199ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vsububh,"Vdd32=vsubub(Vu32,Vv32)","Vdd32.h=vsub(Vu32.ub,Vv32.ub)",
1200"Vector subtraction with widen into two vectors",
1201    VddV.v[0].h[i] = fZE8_16(fGETUBYTE(0, VuV.uh[i])) - fZE8_16(fGETUBYTE(0, VvV.uh[i]));
1202    VddV.v[1].h[i] = fZE8_16(fGETUBYTE(1, VuV.uh[i])) - fZE8_16(fGETUBYTE(1, VvV.uh[i])))
1203
1204
1205
1206ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw,"Vdd32=vaddh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.h,Vv32.h)",
1207"Vector addition with widen into two vectors",
1208    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1209    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1210
1211ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubhw,"Vdd32=vsubh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.h,Vv32.h)",
1212"Vector subtraction with widen into two vectors",
1213    VddV.v[0].w[i] = fGETHALF(0, VuV.w[i]) - fGETHALF(0, VvV.w[i]);
1214    VddV.v[1].w[i] = fGETHALF(1, VuV.w[i]) - fGETHALF(1, VvV.w[i]))
1215
1216
1217ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw,"Vdd32=vadduh(Vu32,Vv32)","Vdd32.w=vadd(Vu32.uh,Vv32.uh)",
1218"Vector addition with widen into two vectors",
1219    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) + fZE16_32(fGETUHALF(0, VvV.uw[i]));
1220    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) + fZE16_32(fGETUHALF(1, VvV.uw[i])))
1221
1222ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vsubuhw,"Vdd32=vsubuh(Vu32,Vv32)","Vdd32.w=vsub(Vu32.uh,Vv32.uh)",
1223"Vector subtraction with widen into two vectors",
1224    VddV.v[0].w[i] = fZE16_32(fGETUHALF(0, VuV.uw[i])) - fZE16_32(fGETUHALF(0, VvV.uw[i]));
1225    VddV.v[1].w[i] = fZE16_32(fGETUHALF(1, VuV.uw[i])) - fZE16_32(fGETUHALF(1, VvV.uw[i])))
1226
1227
1228
1229ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vaddhw_acc,"Vxx32+=vaddh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.h,Vv32.h)",
1230"Vector addition with widen into two vectors",
1231    VxxV.v[0].w[i] += fGETHALF(0, VuV.w[i]) + fGETHALF(0, VvV.w[i]);
1232    VxxV.v[1].w[i] += fGETHALF(1, VuV.w[i]) + fGETHALF(1, VvV.w[i]))
1233
1234ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vadduhw_acc,"Vxx32+=vadduh(Vu32,Vv32)","Vxx32.w+=vadd(Vu32.uh,Vv32.uh)",
1235"Vector addition with widen into two vectors",
1236    VxxV.v[0].w[i] += fGETUHALF(0, VuV.w[i]) + fGETUHALF(0, VvV.w[i]);
1237    VxxV.v[1].w[i] += fGETUHALF(1, VuV.w[i]) + fGETUHALF(1, VvV.w[i]))
1238
1239ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vaddubh_acc,"Vxx32+=vaddub(Vu32,Vv32)","Vxx32.h+=vadd(Vu32.ub,Vv32.ub)",
1240"Vector addition with widen into two vectors",
1241    VxxV.v[0].h[i] += fGETUBYTE(0, VuV.h[i]) + fGETUBYTE(0, VvV.h[i]);
1242    VxxV.v[1].h[i] += fGETUBYTE(1, VuV.h[i]) + fGETUBYTE(1, VvV.h[i]))
1243
1244
1245/****************************
1246*   Conditional
1247****************************/
1248
1249#define CONDADDSUB(WIDTH,TAGEND,LHSYN,RHSYN,DESCR,LHBEH,RHBEH) \
1250ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH+RHBEH,LHBEH)) \
1251ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##q,"if (Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH-RHBEH,LHBEH)) \
1252ITERATOR_INSN2_ANY_SLOT(WIDTH,vadd##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"+="RHSYN,"if (!Qv4) "LHSYN"+="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH+RHBEH)) \
1253ITERATOR_INSN2_ANY_SLOT(WIDTH,vsub##TAGEND##nq,"if (!Qv4."#TAGEND") "LHSYN"-="RHSYN,"if (!Qv4) "LHSYN"-="RHSYN,DESCR,LHBEH=fCONDMASK##WIDTH(QvV,i,LHBEH,LHBEH-RHBEH)) \
1254
1255CONDADDSUB(8,b,"Vx32.b","Vu32.b","Conditional add/sub Byte",VxV.ub[i],VuV.ub[i])
1256CONDADDSUB(16,h,"Vx32.h","Vu32.h","Conditional add/sub Half",VxV.h[i],VuV.h[i])
1257CONDADDSUB(32,w,"Vx32.w","Vu32.w","Conditional add/sub Word",VxV.w[i],VuV.w[i])
1258
1259/*****************************************************
1260 ABSOLUTE VALUES
1261*****************************************************/
1262// V65
1263ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb,        "Vd32=vabsb(Vu32)",     "Vd32.b=vabs(Vu32.b)",     "Vector absolute value of bytes",    VdV.b[i]  =  fABS(VuV.b[i]))
1264ITERATOR_INSN2_ANY_SLOT_NOV1(8,vabsb_sat,    "Vd32=vabsb(Vu32):sat", "Vd32.b=vabs(Vu32.b):sat", "Vector absolute value of bytes",    VdV.b[i]  =  fVSATB(fABS(fSE8_16(VuV.b[i]))))
1265
1266
1267ITERATOR_INSN2_ANY_SLOT(16,vabsh,        "Vd32=vabsh(Vu32)",     "Vd32.h=vabs(Vu32.h)",     "Vector absolute value of halfwords",    VdV.h[i]  =  fABS(VuV.h[i]))
1268ITERATOR_INSN2_ANY_SLOT(16,vabsh_sat,    "Vd32=vabsh(Vu32):sat", "Vd32.h=vabs(Vu32.h):sat", "Vector absolute value of halfwords",    VdV.h[i]  =  fVSATH(fABS(fSE16_32(VuV.h[i]))))
1269ITERATOR_INSN2_ANY_SLOT(32,vabsw,        "Vd32=vabsw(Vu32)",     "Vd32.w=vabs(Vu32.w)",     "Vector absolute value of words",        VdV.w[i]  =  fABS(VuV.w[i]))
1270ITERATOR_INSN2_ANY_SLOT(32,vabsw_sat,    "Vd32=vabsw(Vu32):sat", "Vd32.w=vabs(Vu32.w):sat", "Vector absolute value of words",        VdV.w[i]  =  fVSATW(fABS(fSE32_64(VuV.w[i]))))
1271
1272
1273/**************************************************************************
1274 * MMVECTOR MULTIPLICATIONS
1275 * ************************************************************************/
1276
1277
1278/* Byte by Byte */
1279ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv,"Vdd32=vmpyb(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.b,Vv32.b)",
1280"Vector absolute value of words",
1281    VddV.v[0].h[i] =  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1282    VddV.v[1].h[i] =  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1283
1284ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybv_acc,"Vxx32+=vmpyb(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.b,Vv32.b)",
1285"Vector absolute value of words",
1286    VxxV.v[0].h[i] +=  fMPY8SS(fGETBYTE(0, VuV.h[i]), fGETBYTE(0, VvV.h[i]));
1287    VxxV.v[1].h[i] +=  fMPY8SS(fGETBYTE(1, VuV.h[i]), fGETBYTE(1, VvV.h[i])))
1288
1289
1290ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv,"Vdd32=vmpyub(Vu32,Vv32)","Vdd32.uh=vmpy(Vu32.ub,Vv32.ub)",
1291"Vector absolute value of words",
1292    VddV.v[0].uh[i] =  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1293    VddV.v[1].uh[i] =  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1294
1295ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyubv_acc,"Vxx32+=vmpyub(Vu32,Vv32)","Vxx32.uh+=vmpy(Vu32.ub,Vv32.ub)",
1296"Vector absolute value of words",
1297    VxxV.v[0].uh[i] +=  fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE(0, VvV.uh[i]) );
1298    VxxV.v[1].uh[i] +=  fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE(1, VvV.uh[i]) ))
1299
1300
1301
1302
1303
1304
1305ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv,"Vdd32=vmpybus(Vu32,Vv32)","Vdd32.h=vmpy(Vu32.ub,Vv32.b)",
1306"Vector absolute value of words",
1307    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1308    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1309
1310ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybusv_acc,"Vxx32+=vmpybus(Vu32,Vv32)","Vxx32.h+=vmpy(Vu32.ub,Vv32.b)",
1311"Vector absolute value of words",
1312    VxxV.v[0].h[i]  += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE(0, VvV.h[i]));
1313    VxxV.v[1].h[i]  += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE(1, VvV.h[i])))
1314
1315
1316
1317
1318ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabusv,"Vdd32=vmpabus(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.b)",
1319"Vertical Byte Multiply",
1320    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(0, VvvV.v[1].uh[i]));
1321    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(1, VvvV.v[0].uh[i])) + fMPY8US(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(1, VvvV.v[1].uh[i])))
1322
1323ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabuuv,"Vdd32=vmpabuu(Vuu32,Vvv32)","Vdd32.h=vmpa(Vuu32.ub,Vvv32.ub)",
1324"Vertical Byte Multiply",
1325    VddV.v[0].h[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(0, VvvV.v[1].uh[i]));
1326    VddV.v[1].h[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(1, VvvV.v[0].uh[i])) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(1, VvvV.v[1].uh[i])))
1327
1328
1329
1330
1331
1332
1333
1334ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv,"Vdd32=vmpyh(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.h)",
1335"Vector by Vector Halfword Multiply",
1336    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1337    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1338
1339ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhv_acc,"Vxx32+=vmpyh(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.h)",
1340"Vector by Vector Halfword Multiply",
1341    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, VvV.w[i]));
1342    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, VvV.w[i])))
1343
1344ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv,"Vdd32=vmpyuh(Vu32,Vv32)","Vdd32.uw=vmpy(Vu32.uh,Vv32.uh)",
1345"Vector by Vector Unsigned Halfword Multiply",
1346    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1347    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1348
1349ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuhv_acc,"Vxx32+=vmpyuh(Vu32,Vv32)","Vxx32.uw+=vmpy(Vu32.uh,Vv32.uh)",
1350"Vector by Vector Unsigned Halfword Multiply",
1351    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]), fGETUHALF(0, VvV.uw[i]));
1352    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]), fGETUHALF(1, VvV.uw[i])))
1353
1354
1355
1356/* Vector by Vector */
1357ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyhvsrs,"Vd32=vmpyh(Vu32,Vv32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Vv32.h):<<1:rnd:sat",
1358"Vector halfword multiply with round, shift, and sat16",
1359    VdV.h[i] = fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(VuV.h[i],VvV.h[i]    )<<1))))))
1360
1361
1362
1363
1364
1365ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus, "Vdd32=vmpyhus(Vu32,Vv32)","Vdd32.w=vmpy(Vu32.h,Vv32.uh)",
1366"Vector by Vector Halfword Multiply",
1367    VddV.v[0].w[i] = fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1368    VddV.v[1].w[i] = fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1369
1370
1371ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhus_acc, "Vxx32+=vmpyhus(Vu32,Vv32)","Vxx32.w+=vmpy(Vu32.h,Vv32.uh)",
1372"Vector by Vector Halfword Multiply",
1373    VxxV.v[0].w[i] += fMPY16SU(fGETHALF(0, VuV.w[i]), fGETUHALF(0, VvV.uw[i]));
1374    VxxV.v[1].w[i] += fMPY16SU(fGETHALF(1, VuV.w[i]), fGETUHALF(1, VvV.uw[i])))
1375
1376
1377
1378
1379ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih,"Vd32=vmpyih(Vu32,Vv32)","Vd32.h=vmpyi(Vu32.h,Vv32.h)",
1380"Vector by Vector Halfword Multiply",
1381    VdV.h[i] = fMPY16SS(VuV.h[i], VvV.h[i]))
1382
1383ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyih_acc,"Vx32+=vmpyih(Vu32,Vv32)","Vx32.h+=vmpyi(Vu32.h,Vv32.h)",
1384"Vector by Vector Halfword Multiply",
1385    VxV.h[i] += fMPY16SS(VuV.h[i], VvV.h[i]))
1386
1387
1388
1389/* 32x32 high half / frac */
1390
1391
1392ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh,"Vd32=vmpyewuh(Vu32,Vv32)","Vd32.w=vmpye(Vu32.w,Vv32.uh)",
1393"Vector by Vector Halfword Multiply",
1394VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) >> 16)
1395
1396ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh,"Vd32=vmpyowh(Vu32,Vv32):<<1:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:sat",
1397"Vector by Vector Halfword Multiply",
1398VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 0) >> 1)))
1399
1400ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd,"Vd32=vmpyowh(Vu32,Vv32):<<1:rnd:sat","Vd32.w=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat",
1401"Vector by Vector Halfword Multiply",
1402VdV.w[i] = fVSATW((((fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) >> 14) + 1) >> 1)))
1403
1404ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyewuh_64,"Vdd32=vmpye(Vu32.w,Vv32.uh)",
1405"Word times Halfword Multiply, 64-bit result",
1406	fHIDE(size8s_t prod;)
1407	prod = fMPY32SU(VuV.w[i],fGETUHALF(0,VvV.w[i]));
1408	VddV.v[1].w[i] = prod >> 16;
1409	VddV.v[0].w[i] = prod << 16)
1410
1411ITERATOR_INSN_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_64_acc,"Vxx32+=vmpyo(Vu32.w,Vv32.h)",
1412"Word times Halfword Multiply, 64-bit result",
1413	fHIDE(size8s_t prod;)
1414	prod = fMPY32SS(VuV.w[i],fGETHALF(1,VvV.w[i]))  + fSE32_64(VxxV.v[1].w[i]);
1415	VxxV.v[1].w[i] = prod >> 16;
1416	fSETHALF(0, VxxV.v[0].w[i], VxxV.v[0].w[i] >> 16);
1417	fSETHALF(1, VxxV.v[0].w[i], prod & 0x0000ffff))
1418
1419ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:sat:shift",
1420"Vector by Vector Halfword Multiply",
1421IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 0) >> 1)))
1422
1423ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyowh_rnd_sacc,"Vx32+=vmpyowh(Vu32,Vv32):<<1:rnd:sat:shift","Vx32.w+=vmpyo(Vu32.w,Vv32.h):<<1:rnd:sat:shift",
1424"Vector by Vector Halfword Multiply",
1425IV1DEAD() VxV.w[i] = fVSATW(((((VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i]))) >> 14) + 1) >> 1)))
1426
1427/* For 32x32 integer / low half */
1428
1429ITERATOR_INSN_MPY_SLOT(32,vmpyieoh,"Vd32.w=vmpyieo(Vu32.h,Vv32.h)","Odd/Even multiply for 32x32 low half",
1430	VdV.w[i] = (fGETHALF(0,VuV.w[i])*fGETHALF(1,VvV.w[i])) << 16)
1431
1432ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh,"Vd32=vmpyiewuh(Vu32,Vv32)","Vd32.w=vmpyie(Vu32.w,Vv32.uh)",
1433"Vector by Vector Word by Halfword Multiply",
1434IV1DEAD()    VdV.w[i] = fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1435
1436ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiowh,"Vd32=vmpyiowh(Vu32,Vv32)","Vd32.w=vmpyio(Vu32.w,Vv32.h)",
1437"Vector by Vector Word by Halfword Multiply",
1438IV1DEAD()    VdV.w[i] = fMPY3216SS(VuV.w[i], fGETHALF(1, VvV.w[i])) )
1439
1440/* Add back these... */
1441
1442ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewh_acc,"Vx32+=vmpyiewh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.h)",
1443"Vector by Vector Word by Halfword Multiply",
1444VxV.w[i] = VxV.w[i] + fMPY3216SS(VuV.w[i], fGETHALF(0, VvV.w[i])) )
1445
1446ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiewuh_acc,"Vx32+=vmpyiewuh(Vu32,Vv32)","Vx32.w+=vmpyie(Vu32.w,Vv32.uh)",
1447"Vector by Vector Word by Halfword Multiply",
1448VxV.w[i] = VxV.w[i] + fMPY3216SU(VuV.w[i], fGETUHALF(0, VvV.w[i])) )
1449
1450
1451
1452
1453
1454
1455
1456/* Vector by Scalar */
1457ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub,"Vdd32=vmpyub(Vu32,Rt32)","Vdd32.uh=vmpy(Vu32.ub,Rt32.ub)",
1458"Vector absolute value of words",
1459    VddV.v[0].uh[i]  = fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1460    VddV.v[1].uh[i]  = fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1461
1462ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpyub_acc,"Vxx32+=vmpyub(Vu32,Rt32)","Vxx32.uh+=vmpy(Vu32.ub,Rt32.ub)",
1463"Vector absolute value of words",
1464    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuV.uh[i]), fGETUBYTE((2*i+0)%4, RtV));
1465    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuV.uh[i]), fGETUBYTE((2*i+1)%4, RtV)))
1466
1467
1468ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus,"Vdd32=vmpybus(Vu32,Rt32)","Vdd32.h=vmpy(Vu32.ub,Rt32.b)",
1469"Vector absolute value of words",
1470    VddV.v[0].h[i]  = fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1471    VddV.v[1].h[i]  = fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1472
1473ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpybus_acc,"Vxx32+=vmpybus(Vu32,Rt32)","Vxx32.h+=vmpy(Vu32.ub,Rt32.b)",
1474"Vector absolute value of words",
1475    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuV.uh[i]), fGETBYTE((2*i+0)%4, RtV));
1476    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuV.uh[i]), fGETBYTE((2*i+1)%4, RtV)))
1477
1478
1479ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus,"Vdd32=vmpabus(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.b)",
1480"Vertical Byte Multiply",
1481    VddV.v[0].h[i] = fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1482    VddV.v[1].h[i] = fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1483
1484ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(16,vmpabus_acc,"Vxx32+=vmpabus(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.b)",
1485"Vertical Byte Multiply",
1486    VxxV.v[0].h[i] += fMPY8US(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETBYTE(0, RtV)) + fMPY16SS(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETBYTE(1, RtV));
1487    VxxV.v[1].h[i] += fMPY8US(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETBYTE(2, RtV)) + fMPY16SS(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETBYTE(3, RtV)))
1488
1489// V65
1490
1491ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu,"Vdd32=vmpabuu(Vuu32,Rt32)","Vdd32.h=vmpa(Vuu32.ub,Rt32.ub)",
1492"Vertical Byte Multiply",
1493    VddV.v[0].uh[i] = fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1494    VddV.v[1].uh[i] = fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1495
1496ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(16,vmpabuu_acc,"Vxx32+=vmpabuu(Vuu32,Rt32)","Vxx32.h+=vmpa(Vuu32.ub,Rt32.ub)",
1497"Vertical Byte Multiply",
1498    VxxV.v[0].uh[i] += fMPY8UU(fGETUBYTE(0, VuuV.v[0].uh[i]), fGETUBYTE(0, RtV)) + fMPY8UU(fGETUBYTE(0, VuuV.v[1].uh[i]), fGETUBYTE(1, RtV));
1499    VxxV.v[1].uh[i] += fMPY8UU(fGETUBYTE(1, VuuV.v[0].uh[i]), fGETUBYTE(2, RtV)) + fMPY8UU(fGETUBYTE(1, VuuV.v[1].uh[i]), fGETUBYTE(3, RtV)))
1500
1501
1502
1503
1504/* Half by Byte */
1505ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb,"Vdd32=vmpahb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.h,Rt32.b)",
1506"Vertical Byte Multiply",
1507    VddV.v[0].w[i] = fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1508    VddV.v[1].w[i] = fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1509
1510ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpahb_acc,"Vxx32+=vmpahb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.h,Rt32.b)",
1511"Vertical Byte Multiply",
1512    VxxV.v[0].w[i] += fMPY16SS(fGETHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16SS(fGETHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1513    VxxV.v[1].w[i] += fMPY16SS(fGETHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16SS(fGETHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1514
1515/* Half by Byte */
1516ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb,"Vdd32=vmpauhb(Vuu32,Rt32)","Vdd32.w=vmpa(Vuu32.uh,Rt32.b)",
1517"Vertical Byte Multiply",
1518    VddV.v[0].w[i] = fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1519    VddV.v[1].w[i] = fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1520
1521ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpauhb_acc,"Vxx32+=vmpauhb(Vuu32,Rt32)","Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)",
1522"Vertical Byte Multiply",
1523    VxxV.v[0].w[i] += fMPY16US(fGETUHALF(0, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(0, RtV))) + fMPY16US(fGETUHALF(0, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(1, RtV)));
1524    VxxV.v[1].w[i] += fMPY16US(fGETUHALF(1, VuuV.v[0].w[i]), fSE8_16(fGETBYTE(2, RtV))) + fMPY16US(fGETUHALF(1, VuuV.v[1].w[i]), fSE8_16(fGETBYTE(3, RtV))))
1525
1526
1527
1528
1529
1530
1531
1532/* Half by Half */
1533ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyh,"Vdd32=vmpyh(Vu32,Rt32)","Vdd32.w=vmpy(Vu32.h,Rt32.h)",
1534"Vector absolute value of words",
1535    VddV.v[0].w[i] =  fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1536    VddV.v[1].w[i] =  fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1537
1538ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC_NOV1(32,vmpyh_acc,"Vxx32+=vmpyh(Vu32,Rt32)","Vxx32.w+=vmpy(Vu32.h,Rt32.h)",
1539"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1540    VxxV.v[0].w[i] =  fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV));
1541    VxxV.v[1].w[i] =  fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV)))
1542
1543
1544ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsat_acc,"Vxx32+=vmpyh(Vu32,Rt32):sat","Vxx32.w+=vmpy(Vu32.h,Rt32.h):sat",
1545"Vector even halfwords with scalar lower halfword multiply with shift and sat32",
1546    VxxV.v[0].w[i] =  fVSATW(fCAST8s(VxxV.v[0].w[i]) + fMPY16SS(fGETHALF(0, VuV.w[i]), fGETHALF(0, RtV)));
1547    VxxV.v[1].w[i] =  fVSATW(fCAST8s(VxxV.v[1].w[i]) + fMPY16SS(fGETHALF(1, VuV.w[i]), fGETHALF(1, RtV))))
1548
1549
1550
1551ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhss,"Vd32=vmpyh(Vu32,Rt32):<<1:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:sat",
1552"Vector halfword by halfword multiply, shift by 1, and take upper 16 msb",
1553          fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1)))));
1554          fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1)))));
1555)
1556
1557ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyhsrs,"Vd32=vmpyh(Vu32,Rt32):<<1:rnd:sat","Vd32.h=vmpy(Vu32.h,Rt32.h):<<1:rnd:sat",
1558"Vector halfword with scalar halfword multiply with round, shift, and sat16",
1559       fSETHALF(0,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(0,VuV.w[i]),fGETHALF(0,RtV))<<1))))));
1560       fSETHALF(1,VdV.w[i],fVSATH(fGETHALF(1,fVSAT(fROUND((fMPY16SS(fGETHALF(1,VuV.w[i]),fGETHALF(1,RtV))<<1))))));
1561)
1562
1563
1564ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh,"Vdd32=vmpyuh(Vu32,Rt32)","Vdd32.uw=vmpy(Vu32.uh,Rt32.uh)",
1565"Vector even halfword unsigned multiply by scalar",
1566    VddV.v[0].uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1567    VddV.v[1].uw[i] = fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1568
1569
1570ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyuh_acc,"Vxx32+=vmpyuh(Vu32,Rt32)","Vxx32.uw+=vmpy(Vu32.uh,Rt32.uh)",
1571"Vector even halfword unsigned multiply by scalar",
1572    VxxV.v[0].uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV));
1573    VxxV.v[1].uw[i] += fMPY16UU(fGETUHALF(1, VuV.uw[i]),fGETUHALF(1,RtV)))
1574
1575
1576
1577
1578/********************************************
1579*  HALF BY BYTE
1580********************************************/
1581ITERATOR_INSN2_MPY_SLOT(16,vmpyihb,"Vd32=vmpyihb(Vu32,Rt32)","Vd32.h=vmpyi(Vu32.h,Rt32.b)",
1582"Vector word by byte multiply, keep lower result",
1583VdV.h[i]  = fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1584
1585ITERATOR_INSN2_MPY_SLOT(16,vmpyihb_acc,"Vx32+=vmpyihb(Vu32,Rt32)","Vx32.h+=vmpyi(Vu32.h,Rt32.b)",
1586"Vector word by byte multiply, keep lower result",
1587VxV.h[i] += fMPY16SS(VuV.h[i], fGETBYTE(i % 4, RtV) ))
1588
1589
1590/********************************************
1591*  WORD BY BYTE
1592********************************************/
1593ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb,"Vd32=vmpyiwb(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.b)",
1594"Vector word by byte multiply, keep lower result",
1595VdV.w[i]  = fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1596
1597ITERATOR_INSN2_MPY_SLOT(32,vmpyiwb_acc,"Vx32+=vmpyiwb(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.b)",
1598"Vector word by byte multiply, keep lower result",
1599VxV.w[i] += fMPY32SS(VuV.w[i], fGETBYTE(i % 4, RtV) ))
1600
1601ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub,"Vd32=vmpyiwub(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.ub)",
1602"Vector word by byte multiply, keep lower result",
1603VdV.w[i]  = fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1604
1605ITERATOR_INSN2_MPY_SLOT(32,vmpyiwub_acc,"Vx32+=vmpyiwub(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.ub)",
1606"Vector word by byte multiply, keep lower result",
1607VxV.w[i] += fMPY32SS(VuV.w[i], fGETUBYTE(i % 4, RtV) ))
1608
1609
1610/********************************************
1611*  WORD BY HALF
1612********************************************/
1613ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh,"Vd32=vmpyiwh(Vu32,Rt32)","Vd32.w=vmpyi(Vu32.w,Rt32.h)",
1614"Vector word by byte multiply, keep lower result",
1615VdV.w[i]  = fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1616
1617ITERATOR_INSN2_MPY_SLOT_DOUBLE_VEC(32,vmpyiwh_acc,"Vx32+=vmpyiwh(Vu32,Rt32)","Vx32.w+=vmpyi(Vu32.w,Rt32.h)",
1618"Vector word by byte multiply, keep lower result",
1619VxV.w[i] += fMPY32SS(VuV.w[i], fGETHALF(i % 2, RtV)))
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639/**************************************************************************
1640 * MMVECTOR LOGICAL OPERATIONS
1641 * ************************************************************************/
1642ITERATOR_INSN_ANY_SLOT(16,vand,"Vd32=vand(Vu32,Vv32)", "Vector Logical And", VdV.uh[i] = VuV.uh[i] & VvV.h[i])
1643ITERATOR_INSN_ANY_SLOT(16,vor, "Vd32=vor(Vu32,Vv32)",  "Vector Logical Or", VdV.uh[i] = VuV.uh[i] | VvV.h[i])
1644ITERATOR_INSN_ANY_SLOT(16,vxor,"Vd32=vxor(Vu32,Vv32)", "Vector Logical XOR",    VdV.uh[i] = VuV.uh[i] ^ VvV.h[i])
1645ITERATOR_INSN_ANY_SLOT(16,vnot,"Vd32=vnot(Vu32)",     "Vector Logical NOT", VdV.uh[i] = ~VuV.uh[i])
1646
1647
1648
1649
1650
1651ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt,
1652"Vd32.ub=vand(Qu4.ub,Rt32.ub)", "Vd32=vand(Qu4,Rt32)", "Insert Predicate into Vector",
1653    VdV.ub[i] = fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1654
1655ITERATOR_INSN2_MPY_SLOT_LATE(8, vandqrt_acc,
1656"Vx32.ub|=vand(Qu4.ub,Rt32.ub)", "Vx32|=vand(Qu4,Rt32)",  "Insert Predicate into Vector",
1657    VxV.ub[i] |= (fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1658
1659ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt,
1660"Vd32.ub=vand(!Qu4.ub,Rt32.ub)", "Vd32=vand(!Qu4,Rt32)", "Insert Predicate into Vector",
1661    VdV.ub[i] = !fGETQBIT(QuV,i) ? fGETUBYTE(i % 4, RtV) : 0)
1662
1663ITERATOR_INSN2_MPY_SLOT_LATE(8, vandnqrt_acc,
1664"Vx32.ub|=vand(!Qu4.ub,Rt32.ub)", "Vx32|=vand(!Qu4,Rt32)",  "Insert Predicate into Vector",
1665    VxV.ub[i] |= !(fGETQBIT(QuV,i)) ? fGETUBYTE(i % 4, RtV) : 0)
1666
1667
1668ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt,
1669"Qd4.ub=vand(Vu32.ub,Rt32.ub)", "Qd4=vand(Vu32,Rt32)", "Insert into Predicate",
1670    fSETQBIT(QdV,i,((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0))
1671
1672ITERATOR_INSN2_MPY_SLOT_LATE(8, vandvrt_acc,
1673"Qx4.ub|=vand(Vu32.ub,Rt32.ub)", "Qx4|=vand(Vu32,Rt32)", "Insert into Predicate ",
1674    fSETQBIT(QxV,i,fGETQBIT(QxV,i)|(((VuV.ub[i] & fGETUBYTE(i % 4, RtV)) != 0) ? 1 : 0)))
1675
1676ITERATOR_INSN_ANY_SLOT(8,vandvqv,"Vd32=vand(Qv4,Vu32)","Mask off bytes",
1677VdV.b[i] = fGETQBIT(QvV,i) ? VuV.b[i] : 0)
1678ITERATOR_INSN_ANY_SLOT(8,vandvnqv,"Vd32=vand(!Qv4,Vu32)","Mask off bytes",
1679VdV.b[i] = !fGETQBIT(QvV,i) ? VuV.b[i] : 0)
1680
1681
1682 /***************************************************
1683 * Compare Vector with Vector
1684 ***************************************************/
1685#define VCMP(DEST, ASRC, ASRCOP, CMP, N, SRC, MASK, WIDTH)        \
1686{ \
1687       for(fHIDE(int) i = 0; i < fVBYTES(); i += WIDTH) { \
1688		fSETQBITS(DEST,WIDTH,MASK,i,ASRC ASRCOP ((VuV.SRC[i/WIDTH] CMP VvV.SRC[i/WIDTH]) ? MASK : 0)); \
1689    } \
1690       }
1691
1692
1693#define MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1694EXTINSN(V6_vgt##TYPE,       "Qd4=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than", \
1695	VCMP(QdV, , , >, N, SRC, MASK, WIDTH)) \
1696EXTINSN(V6_vgt##TYPE##_and, "Qx4&=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-and", \
1697	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, >, N, SRC, MASK, WIDTH)) \
1698EXTINSN(V6_vgt##TYPE##_or,  "Qx4|=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-or", \
1699	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, >, N, SRC, MASK, WIDTH)) \
1700EXTINSN(V6_vgt##TYPE##_xor, "Qx4^=vcmp.gt(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" greater than with predicate-xor", \
1701	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, >, N, SRC, MASK, WIDTH))
1702
1703#define MMVEC_CMP(TYPE,TYPE2,TYPE3,DESCR,N,MASK, WIDTH, SRC)\
1704MMVEC_CMPGT(TYPE,TYPE2,TYPE3,DESCR,N,MASK,WIDTH,SRC) \
1705EXTINSN(V6_veq##TYPE,       "Qd4=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equal to", \
1706	VCMP(QdV, , , ==, N, SRC, MASK, WIDTH)) \
1707EXTINSN(V6_veq##TYPE##_and, "Qx4&=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-and", \
1708	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), &, ==, N, SRC, MASK, WIDTH)) \
1709EXTINSN(V6_veq##TYPE##_or,  "Qx4|=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-or", \
1710	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), |, ==, N, SRC, MASK, WIDTH)) \
1711EXTINSN(V6_veq##TYPE##_xor, "Qx4^=vcmp.eq(Vu32." TYPE2 ",Vv32." TYPE2 ")", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA), DESCR" equalto with predicate-xor", \
1712	VCMP(QxV, fGETQBITS(QxV,WIDTH,MASK,i), ^, ==, N, SRC, MASK, WIDTH))
1713
1714
1715MMVEC_CMP(w,"w","","Vector Word Compare ", fVELEM(32), 0xF, 4, w)
1716MMVEC_CMP(h,"h","","Vector Half Compare ", fVELEM(16), 0x3, 2, h)
1717MMVEC_CMP(b,"b","","Vector Half Compare ", fVELEM(8),  0x1, 1, b)
1718MMVEC_CMPGT(uw,"uw","","Vector Unsigned Half Compare ", fVELEM(32), 0xF, 4,uw)
1719MMVEC_CMPGT(uh,"uh","","Vector Unsigned Half Compare ", fVELEM(16), 0x3, 2,uh)
1720MMVEC_CMPGT(ub,"ub","","Vector Unsigned Byte Compare ", fVELEM(8),  0x1, 1,ub)
1721
1722/***************************************************
1723* Predicate Operations
1724***************************************************/
1725
1726EXTINSN(V6_pred_scalar2, "Qd4=vsetq(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1727{
1728    fHIDE(int i;)
1729    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i < (RtV & (fVBYTES()-1))) ? 1 : 0);
1730})
1731
1732EXTINSN(V6_pred_scalar2v2, "Qd4=vsetq2(Rt32)",         ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),   "Set Vector Predicate ",
1733{
1734    fHIDE(int i;)
1735    for(i = 0; i < fVBYTES(); i++) fSETQBIT(QdV,i,(i <= ((RtV-1) & (fVBYTES()-1))) ? 1 : 0);
1736})
1737
1738
1739ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqw, "Qd4.h=vshuffe(Qs4.w,Qt4.w)","Shrink Predicate", fSETQBIT(QdV,i, (i & 2) ? fGETQBIT(QsV,i-2) : fGETQBIT(QtV,i) ) )
1740ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, shuffeqh, "Qd4.b=vshuffe(Qs4.h,Qt4.h)","Shrink Predicate", fSETQBIT(QdV,i, (i & 1) ? fGETQBIT(QsV,i-1) : fGETQBIT(QtV,i) ) )
1741ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or, "Qd4=or(Qs4,Qt4)","Vector Predicate Or", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || fGETQBIT(QtV,i) ) )
1742ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and, "Qd4=and(Qs4,Qt4)","Vector Predicate And", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && fGETQBIT(QtV,i) ) )
1743ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_xor, "Qd4=xor(Qs4,Qt4)","Vector Predicate Xor", fSETQBIT(QdV,i,fGETQBIT(QsV,i) ^ fGETQBIT(QtV,i) ) )
1744ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_or_n, "Qd4=or(Qs4,!Qt4)","Vector Predicate Or with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) || !fGETQBIT(QtV,i) ) )
1745ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8, pred_and_n, "Qd4=and(Qs4,!Qt4)","Vector Predicate And  with not", fSETQBIT(QdV,i,fGETQBIT(QsV,i) && !fGETQBIT(QtV,i) ) )
1746ITERATOR_INSN_ANY_SLOT(8, pred_not, "Qd4=not(Qs4)","Vector Predicate Not", fSETQBIT(QdV,i,!fGETQBIT(QsV,i) ) )
1747
1748
1749
1750EXTINSN(V6_vcmov,  "if (Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1751{
1752if (fLSBOLD(PsV))	{
1753	fHIDE(int i;)
1754	fVFOREACH(8, i) {
1755		VdV.ub[i] = VuV.ub[i];
1756	}
1757	} else {CANCEL;}
1758})
1759
1760EXTINSN(V6_vncmov,  "if (!Ps4) Vd32=Vu32",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA),   "Conditional Mov",
1761{
1762if (fLSBOLDNOT(PsV))	{
1763	fHIDE(int i;)
1764	fVFOREACH(8, i) {
1765		VdV.ub[i] = VuV.ub[i];
1766	}
1767	} else {CANCEL;}
1768})
1769
1770EXTINSN(V6_vccombine,  "if (Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1771{
1772if (fLSBOLD(PsV))	{
1773	fHIDE(int i;)
1774	fVFOREACH(8, i) {
1775		VddV.v[0].ub[i] = VvV.ub[i];
1776		VddV.v[1].ub[i] = VuV.ub[i];
1777	}
1778	} else {CANCEL;}
1779})
1780
1781EXTINSN(V6_vnccombine,  "if (!Ps4) Vdd32=vcombine(Vu32,Vv32)",	ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA_DV),   "Conditional Combine",
1782{
1783if (fLSBOLDNOT(PsV))	{
1784	fHIDE(int i;)
1785	fVFOREACH(8, i) {
1786		VddV.v[0].ub[i] = VvV.ub[i];
1787		VddV.v[1].ub[i] = VuV.ub[i];
1788	}
1789	} else {CANCEL;}
1790})
1791
1792
1793
1794ITERATOR_INSN_ANY_SLOT(8,vmux,"Vd32=vmux(Qt4,Vu32,Vv32)",
1795"Vector Select Element 8-bit",
1796    VdV.ub[i] = fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1797
1798ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vswap,"Vdd32=vswap(Qt4,Vu32,Vv32)",
1799"Vector Swap Element 8-bit",
1800    VddV.v[0].ub[i] =  fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i];
1801	VddV.v[1].ub[i] = !fGETQBIT(QtV,i) ? VuV.ub[i] : VvV.ub[i])
1802
1803
1804/***************************************************************************
1805*
1806*   MMVECTOR SORTING
1807*
1808****************************************************************************/
1809
1810#define MMVEC_SORT(TYPE,TYPE2,DESCR,ELEMENTSIZE,SRC)\
1811ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmax##TYPE, "Vd32=vmax" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmax(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " max", VdV.SRC[i] = (VuV.SRC[i] > VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])  \
1812ITERATOR_INSN2_ANY_SLOT(ELEMENTSIZE,vmin##TYPE, "Vd32=vmin" TYPE2 "(Vu32,Vv32)", "Vd32."#SRC"=vmin(Vu32."#SRC",Vv32."#SRC")", "Vector " DESCR " min", VdV.SRC[i] = (VuV.SRC[i] < VvV.SRC[i]) ? VuV.SRC[i] :  VvV.SRC[i])
1813
1814MMVEC_SORT(b,"b", "signed byte",    8,  b)
1815MMVEC_SORT(ub,"ub", "unsigned byte",    8,  ub)
1816MMVEC_SORT(uh,"uh", "unsigned halfword",16, uh)
1817MMVEC_SORT(h,   "h",    "halfword",         16, h)
1818MMVEC_SORT(w,   "w",    "word",             32, w)
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828/*************************************************************
1829* SHUFFLES
1830****************************************************************/
1831
1832ITERATOR_INSN2_ANY_SLOT(16,vsathub,"Vd32=vsathub(Vu32,Vv32)","Vd32.ub=vsat(Vu32.h,Vv32.h)",
1833"Saturate and pack 32 halfwords to 32 unsigned bytes, and interleave them",
1834    fSETBYTE(0, VdV.uh[i], fVSATUB(VvV.h[i]));
1835    fSETBYTE(1, VdV.uh[i], fVSATUB(VuV.h[i])))
1836
1837ITERATOR_INSN2_ANY_SLOT(32,vsatwh,"Vd32=vsatwh(Vu32,Vv32)","Vd32.h=vsat(Vu32.w,Vv32.w)",
1838"Saturate and pack 16 words to 16 halfwords, and interleave them",
1839    fSETHALF(0, VdV.w[i], fVSATH(VvV.w[i]));
1840    fSETHALF(1, VdV.w[i], fVSATH(VuV.w[i])))
1841
1842ITERATOR_INSN2_ANY_SLOT(32,vsatuwuh,"Vd32=vsatuwuh(Vu32,Vv32)","Vd32.uh=vsat(Vu32.uw,Vv32.uw)",
1843"Saturate and pack 16 words to 16 halfwords, and interleave them",
1844    fSETHALF(0, VdV.w[i], fVSATUH(VvV.uw[i]));
1845    fSETHALF(1, VdV.w[i], fVSATUH(VuV.uw[i])))
1846
1847ITERATOR_INSN2_ANY_SLOT(16,vshuffeb,"Vd32=vshuffeb(Vu32,Vv32)","Vd32.b=vshuffe(Vu32.b,Vv32.b)",
1848"Shuffle half words with in a lane",
1849    fSETBYTE(0, VdV.uh[i], fGETUBYTE(0, VvV.uh[i]));
1850    fSETBYTE(1, VdV.uh[i], fGETUBYTE(0, VuV.uh[i])))
1851
1852ITERATOR_INSN2_ANY_SLOT(16,vshuffob,"Vd32=vshuffob(Vu32,Vv32)","Vd32.b=vshuffo(Vu32.b,Vv32.b)",
1853"Shuffle half words with in a lane",
1854    fSETBYTE(0, VdV.uh[i], fGETUBYTE(1, VvV.uh[i]));
1855    fSETBYTE(1, VdV.uh[i], fGETUBYTE(1, VuV.uh[i])))
1856
1857ITERATOR_INSN2_ANY_SLOT(32,vshufeh,"Vd32=vshuffeh(Vu32,Vv32)","Vd32.h=vshuffe(Vu32.h,Vv32.h)",
1858"Shuffle half words with in a lane",
1859    fSETHALF(0, VdV.uw[i], fGETUHALF(0, VvV.uw[i]));
1860    fSETHALF(1, VdV.uw[i], fGETUHALF(0, VuV.uw[i])))
1861
1862ITERATOR_INSN2_ANY_SLOT(32,vshufoh,"Vd32=vshuffoh(Vu32,Vv32)","Vd32.h=vshuffo(Vu32.h,Vv32.h)",
1863"Shuffle half words with in a lane",
1864    fSETHALF(0, VdV.uw[i], fGETUHALF(1, VvV.uw[i]));
1865    fSETHALF(1, VdV.uw[i], fGETUHALF(1, VuV.uw[i])))
1866
1867
1868
1869
1870/**************************************************************************
1871* Double Vector Shuffles
1872**************************************************************************/
1873
1874EXTINSN(V6_vshuff, "vshuff(Vy32,Vx32,Rt32)",
1875ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1876"2x2->2x2 transpose, for multiple data sizes, inplace",
1877{
1878	fHIDE(int offset;)
1879	for (offset=1; offset<fVBYTES(); offset<<=1) {
1880		if ( RtV & offset) {
1881			    fHIDE(int k;) \
1882				fVFOREACH(8, k) {\
1883				if (!( k & offset)) {
1884					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1885				}
1886			}
1887		}
1888	}
1889	})
1890
1891EXTINSN(V6_vshuffvdd, "Vdd32=vshuff(Vu32,Vv32,Rt8)",
1892ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1893"2x2->2x2 transpose for multiple data sizes",
1894{
1895	fHIDE(int offset;)
1896	VddV.v[0] = VvV;
1897	VddV.v[1] = VuV;
1898	for (offset=1; offset<fVBYTES(); offset<<=1) {
1899		if ( RtV & offset) {
1900			    fHIDE(int k;) \
1901				fVFOREACH(8, k) {\
1902				if (!( k & offset)) {
1903					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1904				}
1905			}
1906		}
1907	}
1908	})
1909
1910EXTINSN(V6_vdeal, "vdeal(Vy32,Vx32,Rt32)",
1911ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1912" vector - vector deal - or deinterleave, for multiple data sizes, inplace",
1913{
1914	fHIDE(int offset;)
1915	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1916		if ( RtV & offset) {
1917			    fHIDE(int k;) \
1918				fVFOREACH(8, k) {\
1919				if (!( k & offset)) {
1920					fSWAPB(VyV.ub[k], VxV.ub[k+offset]);
1921				}
1922			}
1923		}
1924	}
1925	})
1926
1927EXTINSN(V6_vdealvdd, "Vdd32=vdeal(Vu32,Vv32,Rt8)",
1928ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP_VS),
1929" vector - vector deal - or deinterleave, for multiple data sizes",
1930{
1931	fHIDE(int offset;)
1932	VddV.v[0] = VvV;
1933	VddV.v[1] = VuV;
1934	for (offset=fVBYTES()>>1; offset>0; offset>>=1) {
1935		if ( RtV & offset) {
1936			    fHIDE(int k;) \
1937				fVFOREACH(8, k) {\
1938				if (!( k & offset)) {
1939					fSWAPB(VddV.v[1].ub[k], VddV.v[0].ub[k+offset]);
1940				}
1941			}
1942		}
1943	}
1944	})
1945
1946/**************************************************************************/
1947
1948
1949
1950ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(32,vshufoeh,"Vdd32=vshuffoeh(Vu32,Vv32)","Vdd32.h=vshuffoe(Vu32.h,Vv32.h)",
1951"Vector Shuffle half words",
1952    fSETHALF(0, VddV.v[0].uw[i], fGETUHALF(0, VvV.uw[i]));
1953    fSETHALF(1, VddV.v[0].uw[i], fGETUHALF(0, VuV.uw[i]));
1954    fSETHALF(0, VddV.v[1].uw[i], fGETUHALF(1, VvV.uw[i]));
1955    fSETHALF(1, VddV.v[1].uw[i], fGETUHALF(1, VuV.uw[i])))
1956
1957ITERATOR_INSN2_ANY_SLOT_DOUBLE_VEC(16,vshufoeb,"Vdd32=vshuffoeb(Vu32,Vv32)","Vdd32.b=vshuffoe(Vu32.b,Vv32.b)",
1958"Vector Shuffle bytes",
1959    fSETBYTE(0, VddV.v[0].uh[i], fGETUBYTE(0, VvV.uh[i]));
1960    fSETBYTE(1, VddV.v[0].uh[i], fGETUBYTE(0, VuV.uh[i]));
1961    fSETBYTE(0, VddV.v[1].uh[i], fGETUBYTE(1, VvV.uh[i]));
1962    fSETBYTE(1, VddV.v[1].uh[i], fGETUBYTE(1, VuV.uh[i])))
1963
1964
1965/***************************************************************
1966* Deal
1967***************************************************************/
1968
1969ITERATOR_INSN2_PERMUTE_SLOT(32, vdealh, "Vd32=vdealh(Vu32)", "Vd32.h=vdeal(Vu32.h)",
1970"Deal Halfwords",
1971    VdV.uh[i  ] = fGETUHALF(0, VuV.uw[i]);
1972    VdV.uh[i+fVELEM(32)] = fGETUHALF(1, VuV.uw[i]))
1973
1974ITERATOR_INSN2_PERMUTE_SLOT(16, vdealb, "Vd32=vdealb(Vu32)", "Vd32.b=vdeal(Vu32.b)",
1975"Deal Halfwords",
1976    VdV.ub[i   ] = fGETUBYTE(0, VuV.uh[i]);
1977    VdV.ub[i+fVELEM(16)] = fGETUBYTE(1, VuV.uh[i]))
1978
1979ITERATOR_INSN2_PERMUTE_SLOT(32, vdealb4w,  "Vd32=vdealb4w(Vu32,Vv32)", "Vd32.b=vdeale(Vu32.b,Vv32.b)",
1980"Deal Two Vectors Bytes",
1981    VdV.ub[0+i ] = fGETUBYTE(0, VvV.uw[i]);
1982    VdV.ub[fVELEM(32)+i ] = fGETUBYTE(2, VvV.uw[i]);
1983    VdV.ub[2*fVELEM(32)+i] = fGETUBYTE(0, VuV.uw[i]);
1984    VdV.ub[3*fVELEM(32)+i] = fGETUBYTE(2, VuV.uw[i]))
1985
1986/***************************************************************
1987* shuffle
1988***************************************************************/
1989
1990ITERATOR_INSN2_PERMUTE_SLOT(32, vshuffh, "Vd32=vshuffh(Vu32)", "Vd32.h=vshuff(Vu32.h)",
1991"Deal Halfwords",
1992    fSETHALF(0, VdV.uw[i], VuV.uh[i]);
1993    fSETHALF(1, VdV.uw[i], VuV.uh[i+fVELEM(32)]))
1994
1995ITERATOR_INSN2_PERMUTE_SLOT(16, vshuffb, "Vd32=vshuffb(Vu32)", "Vd32.b=vshuff(Vu32.b)",
1996"Deal Halfwords",
1997    fSETBYTE(0, VdV.uh[i], VuV.ub[i]);
1998    fSETBYTE(1, VdV.uh[i], VuV.ub[i+fVELEM(16)]))
1999
2000
2001
2002
2003
2004/***********************************************************
2005* INSERT AND EXTRACT
2006*********************************************************/
2007EXTINSN(V6_extractw, "Rd32=vextract(Vu32,Rs32)",
2008ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VA,A_MEMLIKE,A_RESTRICT_SLOT0ONLY),
2009"Extract an element from a vector to scalar",
2010fHIDE(warn("RdN=%d VuN=%d RsN=%d RsV=0x%08x widx=%d",RdN,VuN,RsN,RsV,((RsV & (fVBYTES()-1)) >> 2));)
2011RdV = VuV.uw[ (RsV & (fVBYTES()-1)) >> 2];
2012fHIDE(warn("RdV=0x%08x",RdV);))
2013
2014EXTINSN(V6_vinsertwr, "Vx32.w=vinsert(Rt32)",
2015ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VX),
2016"Insert Word Scalar into Vector",
2017VxV.uw[0] = RtV;)
2018
2019
2020
2021
2022ITERATOR_INSN_MPY_SLOT_LATE(32,lvsplatw, "Vd32=vsplat(Rt32)", "Replicates scalar accross words in vector", VdV.uw[i] = RtV)
2023
2024ITERATOR_INSN_MPY_SLOT_LATE(16,lvsplath, "Vd32.h=vsplat(Rt32)", "Replicates scalar accross halves in vector", VdV.uh[i] = RtV)
2025
2026ITERATOR_INSN_MPY_SLOT_LATE(8,lvsplatb, "Vd32.b=vsplat(Rt32)", "Replicates scalar accross bytes in vector", VdV.ub[i] = RtV)
2027
2028
2029ITERATOR_INSN_ANY_SLOT(32,vassign,"Vd32=Vu32","Copy a vector",VdV.w[i]=VuV.w[i])
2030
2031
2032ITERATOR_INSN_ANY_SLOT_DOUBLE_VEC(8,vcombine,"Vdd32=vcombine(Vu32,Vv32)",
2033"Vector assign, Any two to Vector Pair",
2034    VddV.v[0].ub[i] = VvV.ub[i];
2035    VddV.v[1].ub[i] = VuV.ub[i])
2036
2037
2038
2039///////////////////////////////////////////////////////////////////////////
2040
2041
2042/*********************************************************
2043* GENERAL PERMUTE NETWORKS
2044*********************************************************/
2045
2046
2047EXTINSN(V6_vdelta, "Vd32=vdelta(Vu32,Vv32)",    ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2048"Reverse Benes Butterfly network ",
2049{
2050    fHIDE(int offset;)
2051    fHIDE(int k;)
2052    fHIDE(mmvector_t tmp;)
2053    tmp = VuV;
2054    for (offset=fVBYTES(); (offset>>=1)>0; ) {
2055        for (k = 0; k<fVBYTES(); k++) {
2056            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2057        }
2058        for (k = 0; k<fVBYTES(); k++) {
2059            tmp.ub[k] = VdV.ub[k];
2060        }
2061    }
2062})
2063
2064
2065EXTINSN(V6_vrdelta, "Vd32=vrdelta(Vu32,Vv32)",  ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VP),
2066"Forward Benes Butterfly network ",
2067{
2068	fHIDE(int offset;)
2069    fHIDE(int k;)
2070    fHIDE(mmvector_t tmp;)
2071    tmp = VuV;
2072    for (offset=1; offset<fVBYTES(); offset<<=1){
2073        for (k = 0; k<fVBYTES(); k++) {
2074            VdV.ub[k] = (VvV.ub[k]&offset) ? tmp.ub[k^offset] : tmp.ub[k];
2075        }
2076        for (k = 0; k<fVBYTES(); k++) {
2077            tmp.ub[k] = VdV.ub[k];
2078        }
2079    }
2080})
2081
2082
2083
2084
2085
2086ITERATOR_INSN2_SHIFT_SLOT(32,vcl0w,"Vd32=vcl0w(Vu32)","Vd32.uw=vcl0(Vu32.uw)",         "Count Leading Zeros in Word",     VdV.uw[i]=fCL1_4(~VuV.uw[i]))
2087ITERATOR_INSN2_SHIFT_SLOT(16,vcl0h,"Vd32=vcl0h(Vu32)","Vd32.uh=vcl0(Vu32.uh)",         "Count Leading Zeros in Word",    VdV.uh[i]=fCL1_2(~VuV.uh[i]))
2088
2089ITERATOR_INSN2_SHIFT_SLOT(32,vnormamtw,"Vd32=vnormamtw(Vu32)","Vd32.w=vnormamt(Vu32.w)","Norm Amount Word",
2090VdV.w[i]=fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i]))-1; fHIDE(IV1DEAD();))
2091ITERATOR_INSN2_SHIFT_SLOT(16,vnormamth,"Vd32=vnormamth(Vu32)","Vd32.h=vnormamt(Vu32.h)","Norm Amount Halfword",
2092VdV.h[i]=fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i]))-1; fHIDE(IV1DEAD();))
2093
2094ITERATOR_INSN_SHIFT_SLOT_VV_LATE(32,vaddclbw,"Vd32.w=vadd(vclb(Vu32.w),Vv32.w)",
2095"Count leading bits and add",
2096VdV.w[i] = fMAX(fCL1_4(~VuV.w[i]),fCL1_4(VuV.w[i])) + VvV.w[i])
2097
2098ITERATOR_INSN_SHIFT_SLOT_VV_LATE(16,vaddclbh,"Vd32.h=vadd(vclb(Vu32.h),Vv32.h)",
2099"Count leading bits and add",
2100VdV.h[i] = fMAX(fCL1_2(~VuV.h[i]),fCL1_2(VuV.h[i])) + VvV.h[i])
2101
2102
2103ITERATOR_INSN2_SHIFT_SLOT(16,vpopcounth,"Vd32=vpopcounth(Vu32)","Vd32.h=vpopcount(Vu32.h)",   "Count Leading Zeros in Word",  VdV.uh[i]=fCOUNTONES_2(VuV.uh[i]))
2104
2105
2106#define fHIST(INPUTVEC) \
2107	fUARCH_NOTE_PUMP_4X(); \
2108	fHIDE(int lane;) \
2109	fHIDE(mmvector_t tmp;) \
2110	fVFOREACH(128, lane) { \
2111		for (fHIDE(int )i=0; i<128/8; ++i) { \
2112			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2113			unsigned char regno = value>>3; \
2114			unsigned char element = value & 7; \
2115			READ_EXT_VREG(regno,tmp,0); \
2116			tmp.uh[(128/16)*lane+(element)]++; \
2117			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2118		} \
2119	}
2120
2121#define fHISTQ(INPUTVEC,QVAL) \
2122	fUARCH_NOTE_PUMP_4X(); \
2123	fHIDE(int lane;) \
2124	fHIDE(mmvector_t tmp;) \
2125	fVFOREACH(128, lane) { \
2126		for (fHIDE(int )i=0; i<128/8; ++i) { \
2127			unsigned char value = INPUTVEC.ub[(128/8)*lane+i]; \
2128			unsigned char regno = value>>3; \
2129			unsigned char element = value & 7; \
2130			READ_EXT_VREG(regno,tmp,0); \
2131			if (fGETQBIT(QVAL,128/8*lane+i)) tmp.uh[(128/16)*lane+(element)]++; \
2132			WRITE_EXT_VREG(regno,tmp,EXT_NEW); \
2133		} \
2134	}
2135
2136
2137
2138EXTINSN(V6_vhist, "vhist",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHIST(inputVec); })
2139EXTINSN(V6_vhistq, "vhist(Qv4)",ATTRIBS(A_EXTENSION,A_CVI,A_CVI_4SLOT), "vhist instruction",{ fHIDE(mmvector_t inputVec;) inputVec=fTMPVDATA(); fHISTQ(inputVec,QvV); })
2140
2141#undef fHIST
2142#undef fHISTQ
2143
2144
2145/* **** WEIGHTED HISTOGRAM **** */
2146
2147
2148#if 1
2149#define WHIST(EL,MASK,BSHIFT,COND,SATF) \
2150	fHIDE(unsigned int) bucket = fGETUBYTE(0,input.h[i]); \
2151	fHIDE(unsigned int) weight = fGETUBYTE(1,input.h[i]); \
2152	fHIDE(unsigned int) vindex = (bucket >> 3) & 0x1F; \
2153	fHIDE(unsigned int) elindex = ((i>>BSHIFT) & (~MASK)) | ((bucket>>BSHIFT) & MASK); \
2154	fHIDE(mmvector_t tmp;) \
2155	READ_EXT_VREG(vindex,tmp,0); \
2156	COND tmp.EL[elindex] = SATF(tmp.EL[elindex] + weight); \
2157	WRITE_EXT_VREG(vindex,tmp,EXT_NEW); \
2158	fUARCH_NOTE_PUMP_2X();
2159
2160ITERATOR_INSN_VHISTLIKE(16,vwhist256,"vwhist256","vector weighted histogram halfword counters", WHIST(uh,7,0,,))
2161ITERATOR_INSN_VHISTLIKE(16,vwhist256q,"vwhist256(Qv4)","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),))
2162ITERATOR_INSN_VHISTLIKE(16,vwhist256_sat,"vwhist256:sat","vector weighted histogram halfword counters", WHIST(uh,7,0,,fVSATUH))
2163ITERATOR_INSN_VHISTLIKE(16,vwhist256q_sat,"vwhist256(Qv4):sat","vector weighted histogram halfword counters", WHIST(uh,7,0,if (fGETQBIT(QvV,2*i)),fVSATUH))
2164ITERATOR_INSN_VHISTLIKE(16,vwhist128,"vwhist128","vector weighted histogram word counters", WHIST(uw,3,1,,))
2165ITERATOR_INSN_VHISTLIKE(16,vwhist128q,"vwhist128(Qv4)","vector weighted histogram word counters", WHIST(uw,3,1,if (fGETQBIT(QvV,2*i)),))
2166ITERATOR_INSN_VHISTLIKE(16,vwhist128m,"vwhist128(#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if ((bucket & 1) == uiV),))
2167ITERATOR_INSN_VHISTLIKE(16,vwhist128qm,"vwhist128(Qv4,#u1)","vector weighted histogram word counters", WHIST(uw,3,1,if (((bucket & 1) == uiV) && fGETQBIT(QvV,2*i)),))
2168
2169
2170#endif
2171
2172
2173
2174/* ******   lookup table instructions                          ***********  */
2175
2176/* Use low bits from idx to choose next-bigger elements from vector, then use LSB from idx to choose odd or even element */
2177
2178ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2179fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2180matchval = RtV & 0x7;
2181oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2182idx = VuV.ub[i];
2183VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2184
2185
2186ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracc,"Vx32.b|=vlut32(Vu32.b,Vv32.b,Rt8)","vector-vector table lookup",
2187fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2188matchval = RtV & 0x7;
2189oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2190idx = VuV.ub[i];
2191VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2192
2193ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2194fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2195matchval = RtV & 0xF;
2196oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2197idx = fGETUBYTE(0,VuV.uh[i]);
2198VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2199idx = fGETUBYTE(1,VuV.uh[i]);
2200VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2201
2202ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracc,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,Rt8)","vector-vector table lookup",
2203fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2204matchval = fGETUBYTE(0,RtV) & 0xF;
2205oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2206idx = fGETUBYTE(0,VuV.uh[i]);
2207VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2208idx = fGETUBYTE(1,VuV.uh[i]);
2209VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2210
2211ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvbi,"Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2212fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2213matchval = uiV & 0x7;
2214oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2215idx = VuV.ub[i];
2216VdV.b[i] = ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2217
2218
2219ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(8,vlutvvb_oracci,"Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)","vector-vector table lookup",
2220fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2221matchval = uiV & 0x7;
2222oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2223idx = VuV.ub[i];
2224VxV.b[i] |= ((idx & 0xE0) == (matchval << 5)) ? fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]) : 0)
2225
2226ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwhi,"Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2227fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2228matchval = uiV & 0xF;
2229oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2230idx = fGETUBYTE(0,VuV.uh[i]);
2231VddV.v[0].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2232idx = fGETUBYTE(1,VuV.uh[i]);
2233VddV.v[1].h[i] = ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2234
2235ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_oracci,"Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)","vector-vector table lookup",
2236fHIDE(unsigned int idx;) fHIDE(int matchval;) fHIDE(int oddhalf;)
2237matchval = uiV & 0xF;
2238oddhalf = (uiV >> (fVECLOGSIZE()-6)) & 0x1;
2239idx = fGETUBYTE(0,VuV.uh[i]);
2240VxxV.v[0].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0;
2241idx = fGETUBYTE(1,VuV.uh[i]);
2242VxxV.v[1].h[i] |= ((idx & 0xF0) == (matchval << 4)) ? fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]) : 0)
2243
2244ITERATOR_INSN_PERMUTE_SLOT(8,vlutvvb_nm,"Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch","vector-vector table lookup",
2245fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2246    matchval = RtV & 0x7;
2247    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2248    idx = VuV.ub[i];
2249    idx = (idx&0x1F) | (matchval<<5);
2250    VdV.b[i] = fGETBYTE(oddhalf,VvV.h[idx % fVELEM(16)]))
2251
2252ITERATOR_INSN_PERMUTE_SLOT_DOUBLE_VEC(16,vlutvwh_nm,"Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch","vector-vector table lookup",
2253fHIDE(unsigned int idx;) fHIDE(int oddhalf;) fHIDE(int matchval;)
2254    matchval = RtV & 0xF;
2255    oddhalf = (RtV >> (fVECLOGSIZE()-6)) & 0x1;
2256    idx = fGETUBYTE(0,VuV.uh[i]);
2257    idx = (idx&0x0F) | (matchval<<4);
2258    VddV.v[0].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]);
2259    idx = fGETUBYTE(1,VuV.uh[i]);
2260    idx = (idx&0x0F) | (matchval<<4);
2261    VddV.v[1].h[i] = fGETHALF(oddhalf,VvV.w[idx % fVELEM(32)]))
2262
2263
2264
2265
2266/******************************************************************************
2267NON LINEAR - V65
2268 ******************************************************************************/
2269
2270ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpahhsat,"Vx32.h=vmpa(Vx32.h,Vu32.h,Rtt32.h):sat","piecewise linear approximation",
2271    VxV.h[i]= fVSATH( ( ( fMPY16SS(VxV.h[i],VuV.h[i])<<1) + (fGETHALF(( (VuV.h[i]>>14)&0x3), RttV )<<15))>>16))
2272
2273
2274ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpauhuhsat,"Vx32.h=vmpa(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2275    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) + (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2276
2277ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vmpsuhuhsat,"Vx32.h=vmps(Vx32.h,Vu32.uh,Rtt32.uh):sat","piecewise linear approximation",
2278    VxV.h[i]= fVSATH( (  fMPY16SU(VxV.h[i],VuV.uh[i]) - (fGETUHALF(((VuV.uh[i]>>14)&0x3), RttV )<<15))>>16))
2279
2280
2281ITERATOR_INSN_SLOT2_DOUBLE_VEC(16,vlut4,"Vd32.h=vlut4(Vu32.uh,Rtt32.h)","4 entry lookup table",
2282    VdV.h[i]= fGETHALF(  ((VuV.h[i]>>14)&0x3), RttV ))
2283
2284
2285
2286/******************************************************************************
2287V65
2288 ******************************************************************************/
2289
2290ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe,"Vd32.uw=vmpye(Vu32.uh,Rt32.uh)",
2291"Vector even halfword unsigned multiply by scalar",
2292    VdV.uw[i] = fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2293
2294
2295ITERATOR_INSN_MPY_SLOT_NOV1(32,vmpyuhe_acc,"Vx32.uw+=vmpye(Vu32.uh,Rt32.uh)",
2296"Vector even halfword unsigned multiply by scalar",
2297    VxV.uw[i] += fMPY16UU(fGETUHALF(0, VuV.uw[i]),fGETUHALF(0,RtV)))
2298
2299
2300
2301
2302EXTINSN(V6_vgathermw,  "vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2303{
2304    fHIDE(int i;)
2305	fHIDE(int element_size = 4;)
2306    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2307    fVLASTBYTE(MuV, element_size);
2308    fVALIGN(RtV, element_size);
2309    fVFOREACH(32, i) {
2310        EA = RtV+VvV.uw[i];
2311        fVLOG_VTCM_GATHER_WORD(EA, VvV.uw[i], i,MuV);
2312    }
2313    fGATHER_FINISH()
2314})
2315EXTINSN(V6_vgathermh,  "vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2316{
2317    fHIDE(int i;)
2318	fHIDE(int element_size = 2;)
2319    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2320    fVLASTBYTE(MuV, element_size);
2321    fVALIGN(RtV, element_size);
2322    fVFOREACH(16, i) {
2323        EA = RtV+VvV.uh[i];
2324        fVLOG_VTCM_GATHER_HALFWORD(EA, VvV.uh[i], i,MuV);
2325    }
2326    fGATHER_FINISH()
2327})
2328
2329
2330
2331EXTINSN(V6_vgathermhw,  "vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2332{
2333    fHIDE(int i;)
2334    fHIDE(int j;)
2335	fHIDE(int element_size = 2;)
2336    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2337    fVLASTBYTE(MuV, element_size);
2338    fVALIGN(RtV, element_size);
2339    fVFOREACH(32, i) {
2340       for(j = 0; j < 2; j++) {
2341            EA = RtV+VvvV.v[j].uw[i];
2342            fVLOG_VTCM_GATHER_HALFWORD_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,MuV);
2343        }
2344    }
2345     fGATHER_FINISH()
2346})
2347
2348
2349EXTINSN(V6_vgathermwq,  "if (Qs4) vtmp.w=vgather(Rt32,Mu2,Vv32.w).w", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather Words",
2350{
2351    fHIDE(int i;)
2352	fHIDE(int element_size = 4;)
2353    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2354    fVLASTBYTE(MuV, element_size);
2355    fVALIGN(RtV, element_size);
2356    fVFOREACH(32, i) {
2357        EA = RtV+VvV.uw[i];
2358        fVLOG_VTCM_GATHER_WORDQ(EA, VvV.uw[i], i,QsV,MuV);
2359    }
2360    fGATHER_FINISH()
2361})
2362EXTINSN(V6_vgathermhq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vv32.h).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2363{
2364    fHIDE(int i;)
2365	fHIDE(int element_size = 2;)
2366    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2367    fVLASTBYTE(MuV, element_size);
2368    fVALIGN(RtV, element_size);
2369    fVFOREACH(16, i) {
2370        EA = RtV+VvV.uh[i];
2371        fVLOG_VTCM_GATHER_HALFWORDQ(EA, VvV.uh[i], i,QsV,MuV);
2372    }
2373    fGATHER_FINISH()
2374})
2375
2376
2377
2378EXTINSN(V6_vgathermhwq,  "if (Qs4) vtmp.h=vgather(Rt32,Mu2,Vvv32.w).h", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_GATHER,A_CVI_VA_DV,A_CVI_VM,A_CVI_TMP_DST,A_MEMLIKE), "Gather halfwords",
2379{
2380    fHIDE(int i;)
2381    fHIDE(int j;)
2382	fHIDE(int element_size = 2;)
2383    fHIDE(fGATHER_INIT( RtV, MuV, element_size);)
2384    fVLASTBYTE(MuV, element_size);
2385    fVALIGN(RtV, element_size);
2386    fVFOREACH(32, i) {
2387       for(j = 0; j < 2; j++) {
2388            EA = RtV+VvvV.v[j].uw[i];
2389            fVLOG_VTCM_GATHER_HALFWORDQ_DV(EA, VvvV.v[j].uw[i], (2*i+j),i,j,QsV,MuV);
2390       }
2391    }
2392    fGATHER_FINISH()
2393})
2394
2395
2396
2397EXTINSN(V6_vscattermw , "vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words",
2398{
2399    fHIDE(int i;)
2400	fHIDE(int element_size = 4;)
2401    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2402    fVLASTBYTE(MuV, element_size);
2403    fVALIGN(RtV, element_size);
2404    fVFOREACH(32, i) {
2405        EA = RtV+VvV.uw[i];
2406        fVLOG_VTCM_WORD(EA, VvV.uw[i], VwV,i,MuV);
2407    }
2408    fSCATTER_FINISH(0)
2409})
2410
2411
2412
2413EXTINSN(V6_vscattermh , "vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfWords",
2414{
2415    fHIDE(int i;)
2416	fHIDE(int element_size = 2;)
2417    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2418    fVLASTBYTE(MuV, element_size);
2419    fVALIGN(RtV, element_size);
2420    fVFOREACH(16, i) {
2421        EA = RtV+VvV.uh[i];
2422        fVLOG_VTCM_HALFWORD(EA,VvV.uh[i],VwV,i,MuV);
2423    }
2424    fSCATTER_FINISH(0)
2425})
2426
2427
2428EXTINSN(V6_vscattermw_add,  "vscatter(Rt32,Mu2,Vv32.w).w+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words-Add",
2429{
2430    fHIDE(int i;)
2431    fHIDE(int ALIGNMENT=4;)
2432	fHIDE(int element_size = 4;)
2433    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2434    fVLASTBYTE(MuV, element_size);
2435    fVALIGN(RtV, element_size);
2436    fVFOREACH(32, i) {
2437        EA = (RtV+fVALIGN(VvV.uw[i],ALIGNMENT));
2438        fVLOG_VTCM_WORD_INCREMENT(EA,VvV.uw[i],VwV,i,ALIGNMENT,MuV);
2439    }
2440    fHIDE(fLOG_SCATTER_OP(4);)
2441    fSCATTER_FINISH(1)
2442})
2443
2444EXTINSN(V6_vscattermh_add,  "vscatter(Rt32,Mu2,Vv32.h).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter halfword-Add",
2445{
2446    fHIDE(int i;)
2447    fHIDE(int ALIGNMENT=2;)
2448	fHIDE(int element_size = 2;)
2449    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2450    fVLASTBYTE(MuV, element_size);
2451    fVALIGN(RtV, element_size);
2452    fVFOREACH(16, i) {
2453        EA = (RtV+fVALIGN(VvV.uh[i],ALIGNMENT));
2454        fVLOG_VTCM_HALFWORD_INCREMENT(EA,VvV.uh[i],VwV,i,ALIGNMENT,MuV);
2455    }
2456    fHIDE(fLOG_SCATTER_OP(2);)
2457    fSCATTER_FINISH(1)
2458})
2459
2460
2461EXTINSN(V6_vscattermwq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.w).w=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter Words conditional",
2462{
2463    fHIDE(int i;)
2464	fHIDE(int element_size = 4;)
2465    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2466    fVLASTBYTE(MuV, element_size);
2467    fVALIGN(RtV, element_size);
2468    fVFOREACH(32, i) {
2469        EA = RtV+VvV.uw[i];
2470        fVLOG_VTCM_WORDQ(EA,VvV.uw[i], VwV,i,QsV,MuV);
2471    }
2472    fSCATTER_FINISH(0)
2473})
2474
2475EXTINSN(V6_vscattermhq,  "if (Qs4) vscatter(Rt32,Mu2,Vv32.h).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA,A_CVI_VM,A_MEMLIKE), "Scatter HalfWords conditional",
2476{
2477    fHIDE(int i;)
2478	fHIDE(int element_size = 2;)
2479    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2480    fVLASTBYTE(MuV, element_size);
2481    fVALIGN(RtV, element_size);
2482    fVFOREACH(16, i) {
2483        EA = RtV+VvV.uh[i];
2484        fVLOG_VTCM_HALFWORDQ(EA,VvV.uh[i],VwV,i,QsV,MuV);
2485    }
2486    fSCATTER_FINISH(0)
2487})
2488
2489
2490
2491
2492EXTINSN(V6_vscattermhw , "vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter Words",
2493{
2494    fHIDE(int i;)
2495    fHIDE(int j;)
2496	fHIDE(int element_size = 2;)
2497    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2498    fVLASTBYTE(MuV, element_size);
2499    fVALIGN(RtV, element_size);
2500    fVFOREACH(32, i) {
2501        for(j = 0; j < 2; j++) {
2502            EA = RtV+VvvV.v[j].uw[i];
2503            fVLOG_VTCM_HALFWORD_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,MuV);
2504        }
2505    }
2506    fSCATTER_FINISH(0)
2507})
2508
2509
2510
2511EXTINSN(V6_vscattermhwq,  "if (Qs4) vscatter(Rt32,Mu2,Vvv32.w).h=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords conditional",
2512{
2513    fHIDE(int i;)
2514    fHIDE(int j;)
2515	fHIDE(int element_size = 2;)
2516    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2517    fVLASTBYTE(MuV, element_size);
2518    fVALIGN(RtV, element_size);
2519    fVFOREACH(32, i) {
2520        for(j = 0; j < 2; j++) {
2521            EA = RtV+VvvV.v[j].uw[i];
2522            fVLOG_VTCM_HALFWORDQ_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),QsV,i,j,MuV);
2523        }
2524    }
2525    fSCATTER_FINISH(0)
2526})
2527
2528EXTINSN(V6_vscattermhw_add,  "vscatter(Rt32,Mu2,Vvv32.w).h+=Vw32", ATTRIBS(A_EXTENSION,A_CVI,A_CVI_SCATTER,A_CVI_VA_DV,A_CVI_VM,A_MEMLIKE), "Scatter halfwords-add",
2529{
2530    fHIDE(int i;)
2531    fHIDE(int j;)
2532    fHIDE(int ALIGNMENT=2;)
2533	fHIDE(int element_size = 2;)
2534    fHIDE(fSCATTER_INIT( RtV, MuV, element_size);)
2535    fVLASTBYTE(MuV, element_size);
2536    fVALIGN(RtV, element_size);
2537    fVFOREACH(32, i) {
2538        for(j = 0; j < 2; j++) {
2539             EA =  RtV + fVALIGN(VvvV.v[j].uw[i],ALIGNMENT);;
2540             fVLOG_VTCM_HALFWORD_INCREMENT_DV(EA,VvvV.v[j].uw[i],VwV,(2*i+j),i,j,ALIGNMENT,MuV);
2541        }
2542    }
2543    fHIDE(fLOG_SCATTER_OP(2);)
2544    fSCATTER_FINISH(1)
2545})
2546
2547EXTINSN(V6_vprefixqb,"Vd32.b=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into byte",
2548{
2549    fHIDE(int i;)
2550    fHIDE(size1u_t acc = 0;)
2551    fVFOREACH(8, i) {
2552        acc += fGETQBIT(QvV,i);
2553        VdV.ub[i] = acc;
2554    }
2555    } )
2556EXTINSN(V6_vprefixqh,"Vd32.h=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into halfwords",
2557{
2558    fHIDE(int i;)
2559    fHIDE(size2u_t acc = 0;)
2560    fVFOREACH(16, i) {
2561        acc += fGETQBIT(QvV,i*2+0);
2562        acc += fGETQBIT(QvV,i*2+1);
2563        VdV.uh[i] = acc;
2564    }
2565    } )
2566EXTINSN(V6_vprefixqw,"Vd32.w=prefixsum(Qv4)",   ATTRIBS(A_EXTENSION,A_CVI,A_CVI_VS),  "parallel prefix sum of Q into words",
2567{
2568    fHIDE(int i;)
2569    fHIDE(size4u_t acc = 0;)
2570    fVFOREACH(32, i) {
2571        acc += fGETQBIT(QvV,i*4+0);
2572        acc += fGETQBIT(QvV,i*4+1);
2573        acc += fGETQBIT(QvV,i*4+2);
2574        acc += fGETQBIT(QvV,i*4+3);
2575        VdV.uw[i] = acc;
2576    }
2577    } )
2578
2579
2580
2581
2582
2583/******************************************************************************
2584 DEBUG Vector/Register Printing
2585 ******************************************************************************/
2586
2587#define PRINT_VU(TYPE, TYPE2, COUNT)\
2588    int i;  \
2589    size4u_t vec_len = fVBYTES();\
2590    fprintf(stdout,"V%2d: ",VuN);  \
2591    for (i=0;i<vec_len>>COUNT;i++) {         \
2592        fprintf(stdout,TYPE2 " ", VuV.TYPE[i]); \
2593    };  \
2594    fprintf(stdout,"\\n");  \
2595	fflush(stdout);\
2596
2597#undef ATTR_VMEM
2598#undef ATTR_VMEMU
2599#undef ATTR_VMEM_NT
2600
2601#endif /* NO_MMVEC */
2602
2603#ifdef __SELF_DEF_EXTINSN
2604#undef EXTINSN
2605#undef __SELF_DEF_EXTINSN
2606#endif
2607