xref: /openbmc/qemu/target/hexagon/imported/mpy.idef (revision aa09b3d5)
1/*
2 *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3 *
4 *  This program is free software; you can redistribute it and/or modify
5 *  it under the terms of the GNU General Public License as published by
6 *  the Free Software Foundation; either version 2 of the License, or
7 *  (at your option) any later version.
8 *
9 *  This program is distributed in the hope that it will be useful,
10 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 *  GNU General Public License for more details.
13 *
14 *  You should have received a copy of the GNU General Public License
15 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18/*
19 * Multiply Instructions
20 */
21
22
/*
 * STD_SP_MODES: generator macro that expands to eight Q6INSN entries named
 * M2_<TAG>_{hh,hl,lh,ll}_{s0,s1}.
 *
 *   TAG     - pasted into the instruction tag
 *   OPER    - leading part of the syntax string (e.g. "Rd32=mpy")
 *   ATR     - forwarded unchanged as the Q6INSN attribute argument
 *   DST     - destination the body assigns to
 *   ACCSEM  - optional accumulate prefix placed before the product
 *             (e.g. "RxV+"); may be empty
 *   SEM     - two-operand multiply helper applied to the selected halves
 *   OSEM    - string suffix appended to the syntax (e.g. ":sat"); may be empty
 *   SATSEM  - outermost wrapper applied to the result
 *   RNDSEM  - wrapper applied inside SATSEM
 *
 * Half selection: an 'h' in the tag reads fGETHALF(1,...) and an 'l' reads
 * fGETHALF(0,...); the first letter applies to RsV, the second to RtV.
 * The _s1 forms add ":<<1" to the syntax and wrap the product in
 * fSCALE(1,...); the _s0 forms use the product directly.
 * The help-string argument is always "".
 */
23#define STD_SP_MODES(TAG,OPER,ATR,DST,ACCSEM,SEM,OSEM,SATSEM,RNDSEM)\
24Q6INSN(M2_##TAG##_hh_s0, OPER"(Rs.H32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(1,RsV),fGETHALF(1,RtV))));})\
25Q6INSN(M2_##TAG##_hh_s1, OPER"(Rs.H32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(1,RsV),fGETHALF(1,RtV)))));})\
26Q6INSN(M2_##TAG##_hl_s0, OPER"(Rs.H32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(1,RsV),fGETHALF(0,RtV))));})\
27Q6INSN(M2_##TAG##_hl_s1, OPER"(Rs.H32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(1,RsV),fGETHALF(0,RtV)))));})\
28Q6INSN(M2_##TAG##_lh_s0, OPER"(Rs.L32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(0,RsV),fGETHALF(1,RtV))));})\
29Q6INSN(M2_##TAG##_lh_s1, OPER"(Rs.L32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(0,RsV),fGETHALF(1,RtV)))));})\
30Q6INSN(M2_##TAG##_ll_s0, OPER"(Rs.L32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETHALF(0,RsV),fGETHALF(0,RtV))));})\
31Q6INSN(M2_##TAG##_ll_s1, OPER"(Rs.L32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETHALF(0,RsV),fGETHALF(0,RtV)))));})
32
33/*****************************************************/
34/* multiply 16x16->32 signed instructions            */
35/*****************************************************/
/*
 * Instantiations of STD_SP_MODES; every row uses fMPY16SS as the multiply
 * helper and leaves the ATR argument empty.
 *   *_acc / *_nac : ACCSEM is "<dst>+" / "<dst>-", so the product is added to
 *                   or subtracted from the destination's current value.
 *   *_sat         : SATSEM is fSAT and the syntax gains ":sat"
 *                   (fPASS is used otherwise).
 *   *_rnd         : RNDSEM is fROUND and the syntax gains ":rnd"
 *                   (fPASS is used otherwise).
 *   mpyd*         : same pattern, but the destination is RddV / RxxV.
 * Empty argument slots correspond to an empty ATR, ACCSEM or OSEM.
 */
36STD_SP_MODES(mpy_acc,    "Rx32+=mpy", ,RxV,RxV+    ,fMPY16SS,          ,fPASS,fPASS)
37STD_SP_MODES(mpy_nac,    "Rx32-=mpy", ,RxV,RxV-    ,fMPY16SS,          ,fPASS,fPASS)
38STD_SP_MODES(mpy_acc_sat,"Rx32+=mpy", ,RxV,RxV+    ,fMPY16SS,":sat"    ,fSAT, fPASS)
39STD_SP_MODES(mpy_nac_sat,"Rx32-=mpy", ,RxV,RxV-    ,fMPY16SS,":sat"    ,fSAT, fPASS)
40STD_SP_MODES(mpy,        "Rd32=mpy",  ,RdV,        ,fMPY16SS,          ,fPASS,fPASS)
41STD_SP_MODES(mpy_sat,    "Rd32=mpy",  ,RdV,        ,fMPY16SS,":sat"    ,fSAT, fPASS)
42STD_SP_MODES(mpy_rnd,    "Rd32=mpy",  ,RdV,        ,fMPY16SS,":rnd"    ,fPASS,fROUND)
43STD_SP_MODES(mpy_sat_rnd,"Rd32=mpy",  ,RdV,        ,fMPY16SS,":rnd:sat",fSAT, fROUND)
44STD_SP_MODES(mpyd_acc,   "Rxx32+=mpy",,RxxV,RxxV+  ,fMPY16SS,          ,fPASS,fPASS)
45STD_SP_MODES(mpyd_nac,   "Rxx32-=mpy",,RxxV,RxxV-  ,fMPY16SS,          ,fPASS,fPASS)
46STD_SP_MODES(mpyd,       "Rdd32=mpy", ,RddV,       ,fMPY16SS,          ,fPASS,fPASS)
47STD_SP_MODES(mpyd_rnd,   "Rdd32=mpy", ,RddV,       ,fMPY16SS,":rnd"    ,fPASS,fROUND)
48
49
50/*****************************************************/
51/* multiply 16x16->32 unsigned instructions          */
52/*****************************************************/
/*
 * STD_USP_MODES: same shape and parameter list as the signed generator
 * (eight Q6INSN entries M2_<TAG>_{hh,hl,lh,ll}_{s0,s1}), except that the
 * operand halves are read with fGETUHALF instead of fGETHALF.
 * 'h' selects half index 1, 'l' selects half index 0; the _s1 forms wrap the
 * product in fSCALE(1,...) and add ":<<1" to the syntax string.
 */
53#define STD_USP_MODES(TAG,OPER,ATR,DST,ACCSEM,SEM,OSEM,SATSEM,RNDSEM)\
54Q6INSN(M2_##TAG##_hh_s0, OPER"(Rs.H32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(1,RsV),fGETUHALF(1,RtV))));})\
55Q6INSN(M2_##TAG##_hh_s1, OPER"(Rs.H32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(1,RsV),fGETUHALF(1,RtV)))));})\
56Q6INSN(M2_##TAG##_hl_s0, OPER"(Rs.H32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(1,RsV),fGETUHALF(0,RtV))));})\
57Q6INSN(M2_##TAG##_hl_s1, OPER"(Rs.H32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(1,RsV),fGETUHALF(0,RtV)))));})\
58Q6INSN(M2_##TAG##_lh_s0, OPER"(Rs.L32,Rt.H32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(0,RsV),fGETUHALF(1,RtV))));})\
59Q6INSN(M2_##TAG##_lh_s1, OPER"(Rs.L32,Rt.H32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(0,RsV),fGETUHALF(1,RtV)))));})\
60Q6INSN(M2_##TAG##_ll_s0, OPER"(Rs.L32,Rt.L32)"OSEM,        ATR,"",{DST=SATSEM(RNDSEM(ACCSEM SEM(         fGETUHALF(0,RsV),fGETUHALF(0,RtV))));})\
61Q6INSN(M2_##TAG##_ll_s1, OPER"(Rs.L32,Rt.L32):<<1"OSEM,    ATR,"",{DST=SATSEM(RNDSEM(ACCSEM fSCALE(1,SEM(fGETUHALF(0,RsV),fGETUHALF(0,RtV)))));})
62
/*
 * Instantiations of STD_USP_MODES; every row uses fMPY16UU with fPASS for
 * both the saturate and round wrappers and an empty syntax suffix.
 * The accumulate/negate-accumulate rows leave ATR empty, while the two plain
 * assignment rows (mpyu, mpyud) pass ATTRIBS() explicitly.
 */
63STD_USP_MODES(mpyu_acc,    "Rx32+=mpyu", ,RxV,RxV+  ,fMPY16UU,          ,fPASS,fPASS)
64STD_USP_MODES(mpyu_nac,    "Rx32-=mpyu", ,RxV,RxV-  ,fMPY16UU,          ,fPASS,fPASS)
65STD_USP_MODES(mpyu,        "Rd32=mpyu",  ATTRIBS() ,RdV,  ,fMPY16UU, ,fPASS,fPASS)
66STD_USP_MODES(mpyud_acc,   "Rxx32+=mpyu",,RxxV,RxxV+,fMPY16UU,          ,fPASS,fPASS)
67STD_USP_MODES(mpyud_nac,   "Rxx32-=mpyu",,RxxV,RxxV-,fMPY16UU,          ,fPASS,fPASS)
68STD_USP_MODES(mpyud,       "Rdd32=mpyu", ATTRIBS() ,RddV, ,fMPY16UU, ,fPASS,fPASS)
69
70/**********************************************/
71/* mpyi 32x#u8->32                            */
72/**********************************************/
73
/*
 * Multiply / multiply-accumulate of RsV by the immediate uiV (syntax #u8).
 *   M2_mpysip : RdV = RsV * uiV
 *   M2_mpysin : RdV = RsV * -uiV
 *   M2_macsip : RxV = RxV + RsV * uiV
 *   M2_macsin : RxV = RxV - RsV * uiV
 * Three of the four bodies call fIMMEXT(uiV) before using the immediate;
 * M2_mpysin does not.
 * NOTE(review): presumably the negated form is intentionally not
 * immediate-extendable - confirm against the ISA manual before changing.
 */
74Q6INSN(M2_mpysip,"Rd32=+mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
75"32-bit Multiply by unsigned immediate",
76{ fIMMEXT(uiV); RdV=RsV*uiV; })
77
78Q6INSN(M2_mpysin,"Rd32=-mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
79"32-bit Multiply by unsigned immediate, negate result",
80{ RdV=RsV*-uiV; })
81
82Q6INSN(M2_macsip,"Rx32+=mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
83"32-bit Multiply-Add by unsigned immediate",
84{ fIMMEXT(uiV); RxV=RxV + (RsV*uiV);})
85
86Q6INSN(M2_macsin,"Rx32-=mpyi(Rs32,#u8)",ATTRIBS(A_ARCHV2),
87"32-bit Multiply-Subtract by unsigned immediate",
88{ fIMMEXT(uiV); RxV=RxV - (RsV*uiV);})
89
90
91/**********************************************/
92/* multiply/mac  32x32->64 instructions       */
93/**********************************************/
/*
 * Register-by-register multiplies that write RddV or accumulate into RxxV.
 *   dpmpyss_*  : product is fMPY32SS(RsV,RtV)
 *   dpmpyuu_*  : both operands go through fCAST4u and the product is
 *                fMPY32UU(...)
 *   *_s0       : RddV = product
 *   *_acc_s0   : RxxV = RxxV + product
 *   *_nac_s0   : RxxV = RxxV - product
 */
94Q6INSN(M2_dpmpyss_s0,    "Rdd32=mpy(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RddV=fMPY32SS(RsV,RtV);})
95Q6INSN(M2_dpmpyss_acc_s0,"Rxx32+=mpy(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV + fMPY32SS(RsV,RtV);})
96Q6INSN(M2_dpmpyss_nac_s0,"Rxx32-=mpy(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV - fMPY32SS(RsV,RtV);})
97
98Q6INSN(M2_dpmpyuu_s0,    "Rdd32=mpyu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RddV=fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
99Q6INSN(M2_dpmpyuu_acc_s0,"Rxx32+=mpyu(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV + fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
100Q6INSN(M2_dpmpyuu_nac_s0,"Rxx32-=mpyu(Rs32,Rt32)",ATTRIBS(),"Multiply 32x32",{RxxV= RxxV - fMPY32UU(fCAST4u(RsV),fCAST4u(RtV));})
101
102
103/******************************************************/
104/* multiply/mac  32x32->32 (upper) instructions       */
105/******************************************************/
/*
 * Multiplies that keep a shifted-down slice of the product in RdV / RxV.
 *   M2_mpy_up          : fMPY32SS(RsV,RtV) >> 32
 *   M2_mpy_up_s1       : fMPY32SS(RsV,RtV) >> 31       (syntax ":<<1")
 *   M2_mpy_up_s1_sat   : fSAT(fMPY32SS(RsV,RtV) >> 31) (syntax ":<<1:sat")
 *   M2_mpyu_up         : fMPY32UU(fCAST4u(RsV),fCAST4u(RtV)) >> 32
 *   M2_mpysu_up        : fMPY32SU(RsV,fCAST4u(RtV)) >> 32
 *   M2_dpmpyss_rnd_s0  : (fMPY32SS(RsV,RtV) + fCONSTLL(0x80000000)) >> 32
 *   M4_mac_up_s1_sat   : fSAT(fSE32_64(RxV) + (fMPY32SS(RsV,RtV) >> 31))
 *   M4_nac_up_s1_sat   : fSAT(fSE32_64(RxV) - (fMPY32SS(RsV,RtV) >> 31))
 * In the ":<<1" forms the scale-by-two is expressed by shifting right by 31
 * instead of 32.
 */
106Q6INSN(M2_mpy_up,        "Rd32=mpy(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SS(RsV,RtV)>>32;})
107Q6INSN(M2_mpy_up_s1,     "Rd32=mpy(Rs32,Rt32):<<1", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SS(RsV,RtV)>>31;})
108Q6INSN(M2_mpy_up_s1_sat, "Rd32=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RdV=fSAT(fMPY32SS(RsV,RtV)>>31);})
109Q6INSN(M2_mpyu_up,       "Rd32=mpyu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32UU(fCAST4u(RsV),fCAST4u(RtV))>>32;})
110Q6INSN(M2_mpysu_up,      "Rd32=mpysu(Rs32,Rt32)", ATTRIBS(),"Multiply 32x32",{RdV=fMPY32SU(RsV,fCAST4u(RtV))>>32;})
111Q6INSN(M2_dpmpyss_rnd_s0,"Rd32=mpy(Rs32,Rt32):rnd", ATTRIBS(),"Multiply 32x32",{RdV=(fMPY32SS(RsV,RtV)+fCONSTLL(0x80000000))>>32;})
112
113Q6INSN(M4_mac_up_s1_sat, "Rx32+=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RxV=fSAT(  (fSE32_64(RxV)) + (fMPY32SS(RsV,RtV)>>31));})
114Q6INSN(M4_nac_up_s1_sat, "Rx32-=mpy(Rs32,Rt32):<<1:sat", ATTRIBS(),"Multiply 32x32",{RxV=fSAT(  (fSE32_64(RxV)) - (fMPY32SS(RsV,RtV)>>31));})
115
116
117/**********************************************/
118/* 32x32->32 multiply (lower)                 */
119/**********************************************/
120
/*
 * Plain register multiply written with the C '*' operator (no helper macro).
 *   M2_mpyi  : RdV = RsV * RtV
 *   M2_maci  : RxV = RxV + RsV * RtV
 *   M2_mnaci : RxV = RxV - RsV * RtV
 */
121Q6INSN(M2_mpyi,"Rd32=mpyi(Rs32,Rt32)",ATTRIBS(),
122"Multiply Integer",
123{ RdV=RsV*RtV;})
124
125Q6INSN(M2_maci,"Rx32+=mpyi(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
126"Multiply-Accumulate Integer",
127{ RxV=RxV + RsV*RtV;})
128
129Q6INSN(M2_mnaci,"Rx32-=mpyi(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
130"Multiply-Neg-Accumulate Integer",
131{ RxV=RxV - RsV*RtV;})
132
133/****** WHY ARE THESE IN MPY.IDEF? **********/
134
/*
 * Add/subtract with accumulate; none of these bodies multiplies.
 *   M2_acci   : RxV = RxV + RsV + RtV
 *   M2_accii  : RxV = RxV + RsV + siV     (fIMMEXT(siV) first)
 *   M2_nacci  : RxV = RxV - (RsV + RtV)
 *   M2_naccii : RxV = RxV - (RsV + siV)   (fIMMEXT(siV) first)
 *   M2_subacc : RxV = RxV + RtV - RsV     (syntax lists Rt before Rs)
 */
135Q6INSN(M2_acci,"Rx32+=add(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
136"Add with accumulate",
137{ RxV=RxV + RsV + RtV;})
138
139Q6INSN(M2_accii,"Rx32+=add(Rs32,#s8)",ATTRIBS(A_ARCHV2),
140"Add with accumulate",
141{ fIMMEXT(siV); RxV=RxV + RsV + siV;})
142
143Q6INSN(M2_nacci,"Rx32-=add(Rs32,Rt32)",ATTRIBS(A_ARCHV2),
144"Add with neg accumulate",
145{ RxV=RxV - (RsV + RtV);})
146
147Q6INSN(M2_naccii,"Rx32-=add(Rs32,#s8)",ATTRIBS(A_ARCHV2),
148"Add with neg accumulate",
149{ fIMMEXT(siV); RxV=RxV - (RsV + siV);})
150
151Q6INSN(M2_subacc,"Rx32+=sub(Rt32,Rs32)",ATTRIBS(A_ARCHV2),
152"Sub with accumulate",
153{ RxV=RxV + RtV - RsV;})
154
155
156
157
/*
 * Combined multiply-then-add forms.
 *   M4_mpyrr_addr    : RyV = RuV + RsV * RyV   (RyV is read and written)
 *   M4_mpyri_addr_u2 : RdV = RuV + RsV * uiV   (syntax #u6:2, no fIMMEXT)
 *   M4_mpyri_addr    : RdV = RuV + RsV * uiV   (fIMMEXT(uiV) first)
 *   M4_mpyri_addi    : RdV = uiV + RsV * UiV   (fIMMEXT applied to uiV, the
 *                                               addend, not to UiV)
 *   M4_mpyrr_addi    : RdV = uiV + RsV * RtV   (fIMMEXT(uiV) first)
 * NOTE(review): all five share the help string "Mpy by immed and add immed"
 * even though M4_mpyrr_addr uses no immediate and the others use at most one
 * immediate per role - the string looks copy-pasted; confirm before relying
 * on it.
 */
158Q6INSN(M4_mpyrr_addr,"Ry32=add(Ru32,mpyi(Ry32,Rs32))",ATTRIBS(),
159"Mpy by immed and add immed",
160{ RyV = RuV + RsV*RyV;})
161
162Q6INSN(M4_mpyri_addr_u2,"Rd32=add(Ru32,mpyi(#u6:2,Rs32))",ATTRIBS(),
163"Mpy by immed and add immed",
164{ RdV = RuV + RsV*uiV;})
165
166Q6INSN(M4_mpyri_addr,"Rd32=add(Ru32,mpyi(Rs32,#u6))",ATTRIBS(),
167"Mpy by immed and add immed",
168{ fIMMEXT(uiV); RdV = RuV + RsV*uiV;})
169
170
171
172Q6INSN(M4_mpyri_addi,"Rd32=add(#u6,mpyi(Rs32,#U6))",ATTRIBS(),
173"Mpy by immed and add immed",
174{ fIMMEXT(uiV); RdV = uiV + RsV*UiV;})
175
176
177
178Q6INSN(M4_mpyrr_addi,"Rd32=add(#u6,mpyi(Rs32,Rt32))",ATTRIBS(),
179"Mpy by immed and add immed",
180{ fIMMEXT(uiV); RdV = uiV + RsV*RtV;})
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198/**********************************************/
199/* vector mac  2x[16x16 -> 32]                */
200/**********************************************/
201
/*
 * vmac_sema(N) for M2_vmpy2s_s0/_s1: for i in {0,1}, word i of RddV is set to
 * fSAT(fSCALE(N, fMPY16SS(half i of RsV, half i of RtV))).
 * N is 0 for the plain form and 1 for the ":<<1" form.
 * vmac_sema is #undef'd and redefined for each instruction group in this
 * file, so the definition only applies to the Q6INSN lines directly below.
 */
202#undef vmac_sema
203#define vmac_sema(N)\
204{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)))));\
205  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
206}
207Q6INSN(M2_vmpy2s_s0,"Rdd32=vmpyh(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
208Q6INSN(M2_vmpy2s_s1,"Rdd32=vmpyh(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
209
210
/*
 * vmac_sema(N) for M2_vmac2s_s0/_s1 (accumulating form): for i in {0,1},
 * word i of RxxV is set to
 * fSAT(word i of RxxV + fSCALE(N, fMPY16SS(half i of RsV, half i of RtV))).
 * The addition happens inside the fSAT argument.
 */
211#undef vmac_sema
212#define vmac_sema(N)\
213{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)))));\
214  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
215}
216Q6INSN(M2_vmac2s_s0,"Rxx32+=vmpyh(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
217Q6INSN(M2_vmac2s_s1,"Rxx32+=vmpyh(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
218
/*
 * vmac_sema(N) for M2_vmpy2su_s0/_s1: same layout as the vmpyh form, but the
 * product is fMPY16SU with the RsV half read via fGETHALF and the RtV half
 * read via fGETUHALF.
 */
219#undef vmac_sema
220#define vmac_sema(N)\
221{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SU(fGETHALF(0,RsV),fGETUHALF(0,RtV)))));\
222  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SU(fGETHALF(1,RsV),fGETUHALF(1,RtV)))));\
223}
224Q6INSN(M2_vmpy2su_s0,"Rdd32=vmpyhsu(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
225Q6INSN(M2_vmpy2su_s1,"Rdd32=vmpyhsu(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
226
227
/*
 * vmac_sema(N) for M2_vmac2su_s0/_s1: accumulating counterpart of the
 * vmpyhsu form. Word i of RxxV becomes
 * fSAT(word i of RxxV + fSCALE(N, fMPY16SU(fGETHALF(i,RsV), fGETUHALF(i,RtV)))).
 */
228#undef vmac_sema
229#define vmac_sema(N)\
230{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SU(fGETHALF(0,RsV),fGETUHALF(0,RtV)))));\
231  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SU(fGETHALF(1,RsV),fGETUHALF(1,RtV)))));\
232}
233Q6INSN(M2_vmac2su_s0,"Rxx32+=vmpyhsu(Rs32,Rt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
234Q6INSN(M2_vmac2su_s1,"Rxx32+=vmpyhsu(Rs32,Rt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
235
236
237
/*
 * vmac_sema(N) for M2_vmpy2s_s0pack/_s1pack (":rnd:sat", result packed into
 * the single register RdV): for i in {1,0}, half i of RdV is set to
 * fGETHALF(1, fSAT(fSCALE(N, fMPY16SS(half i of RsV, half i of RtV)) + 0x8000)).
 * The 0x8000 is added inside the fSAT argument, and only half index 1 of the
 * fSAT result is kept. Half 1 of RdV is written before half 0.
 */
238#undef vmac_sema
239#define vmac_sema(N)\
240{ fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))) + 0x8000))));\
241  fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) + 0x8000))));\
242}
243Q6INSN(M2_vmpy2s_s0pack,"Rd32=vmpyh(Rs32,Rt32):rnd:sat",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
244Q6INSN(M2_vmpy2s_s1pack,"Rd32=vmpyh(Rs32,Rt32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(1))
245
246
/*
 * vmac_sema(N) for M2_vmac2: word i of RxxV += fMPY16SS(half i of RsV,
 * half i of RtV) for i in {0,1}, with no fSAT and no fSCALE.
 * The parameter N is not referenced by this definition; the single
 * instantiation passes 0.
 */
247#undef vmac_sema
248#define vmac_sema(N)\
249{ fSETWORD(0,RxxV,fGETWORD(0,RxxV) + fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)));\
250  fSETWORD(1,RxxV,fGETWORD(1,RxxV) + fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)));\
251}
252Q6INSN(M2_vmac2,"Rxx32+=vmpyh(Rs32,Rt32)",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
253
/*
 * vmac_sema(N) for M2_vmpy2es_s0/_s1 ("vmpyeh"): operands are RssV / RttV and
 * only half indices 0 and 2 are read.
 *   word 0 of RddV = fSAT(fSCALE(N, fMPY16SS(half 0 of RssV, half 0 of RttV)))
 *   word 1 of RddV = fSAT(fSCALE(N, fMPY16SS(half 2 of RssV, half 2 of RttV)))
 */
254#undef vmac_sema
255#define vmac_sema(N)\
256{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)))));\
257  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)))));\
258}
259Q6INSN(M2_vmpy2es_s0,"Rdd32=vmpyeh(Rss32,Rtt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
260Q6INSN(M2_vmpy2es_s1,"Rdd32=vmpyeh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
261
/*
 * vmac_sema(N) for M2_vmac2es_s0/_s1: accumulating "vmpyeh" with fSAT.
 *   word 0 of RxxV = fSAT(word 0 of RxxV + fSCALE(N, half0(Rss) * half0(Rtt)))
 *   word 1 of RxxV = fSAT(word 1 of RxxV + fSCALE(N, half2(Rss) * half2(Rtt)))
 * where the products are fMPY16SS of fGETHALF values.
 */
262#undef vmac_sema
263#define vmac_sema(N)\
264{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)))));\
265  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)))));\
266}
267Q6INSN(M2_vmac2es_s0,"Rxx32+=vmpyeh(Rss32,Rtt32):sat",ATTRIBS(),"Vector Multiply",vmac_sema(0))
268Q6INSN(M2_vmac2es_s1,"Rxx32+=vmpyeh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Vector Multiply",vmac_sema(1))
269
/*
 * vmac_sema(N) for M2_vmac2es: accumulating "vmpyeh" without fSAT or fSCALE.
 * Word 0 of RxxV gains the fMPY16SS product of half 0 of RssV/RttV, word 1
 * gains the product of half 2. N is not referenced by this definition.
 */
270#undef vmac_sema
271#define vmac_sema(N)\
272{ fSETWORD(0,RxxV,fGETWORD(0,RxxV) + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)));\
273  fSETWORD(1,RxxV,fGETWORD(1,RxxV) + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)));\
274}
275Q6INSN(M2_vmac2es,"Rxx32+=vmpyeh(Rss32,Rtt32)",ATTRIBS(A_ARCHV2),"Vector Multiply",vmac_sema(0))
276
277
278
279
280/********************************************************/
281/* vrmpyh, aka Big Mac, aka Mac Daddy, aka Mac-ac-ac-ac */
282/* vector mac  4x[16x16] + 64 ->64                      */
283/********************************************************/
284
285
/*
 * vmac_sema(N) for M2_vrmac_s0: RxxV = RxxV + the sum over i = 0..3 of
 * fMPY16SS(half i of RssV, half i of RttV). No fSAT or fSCALE is applied and
 * N is not referenced by this definition.
 */
286#undef vmac_sema
287#define vmac_sema(N)\
288{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))\
289              + fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))\
290              + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))\
291              + fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
292}
293Q6INSN(M2_vrmac_s0,"Rxx32+=vrmpyh(Rss32,Rtt32)",ATTRIBS(),"Vector Multiply",vmac_sema(0))
294
/*
 * vmac_sema(N) for M2_vrmpy_s0: RddV = the sum over i = 0..3 of
 * fMPY16SS(half i of RssV, half i of RttV). Non-accumulating counterpart of
 * the "Rxx32+=vrmpyh" form; N is not referenced by this definition.
 */
295#undef vmac_sema
296#define vmac_sema(N)\
297{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))\
298       + fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))\
299       + fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))\
300       + fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
301}
302Q6INSN(M2_vrmpy_s0,"Rdd32=vrmpyh(Rss32,Rtt32)",ATTRIBS(),"Vector Multiply",vmac_sema(0))
303
304
305
306/******************************************************/
307/* vector dual macs. just like complex                */
308/******************************************************/
309
310
311/* With round&pack */
/*
 * dmpy_sema(N) for M2_vdmpyrs_s0/_s1 (":rnd:sat", packed into RdV).
 * With p(i) = fSCALE(N, fMPY16SS(half i of RssV, half i of RttV)):
 *   half 0 of RdV = fGETHALF(1, fSAT(p(0) + p(1) + 0x8000))
 *   half 1 of RdV = fGETHALF(1, fSAT(p(2) + p(3) + 0x8000))
 * Each product is scaled individually before the pair is summed, and the
 * 0x8000 is added inside the fSAT argument.
 */
312#undef dmpy_sema
313#define dmpy_sema(N)\
314{ fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
315                                  fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))) + 0x8000))));\
316  fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
317                                  fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV))) + 0x8000))));\
318}
319Q6INSN(M2_vdmpyrs_s0,"Rd32=vdmpy(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "vector dual mac w/ round&pack",dmpy_sema(0))
320Q6INSN(M2_vdmpyrs_s1,"Rd32=vdmpy(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"vector dual mac w/ round&pack",dmpy_sema(1))
321
322
323
324
325
326/******************************************************/
327/* vector byte multiplies                             */
328/******************************************************/
329
330
/*
 * M5_vrmpybuu: word 0 of RddV is the sum of the four products of bytes 0..3
 * of RssV and RttV; word 1 is the same sum over bytes 4..7.
 * Both operands are read with fGETUBYTE and each product is formed with
 * fMPY16SS. No accumulate and no saturation wrapper.
 */
331Q6INSN(M5_vrmpybuu,"Rdd32=vrmpybu(Rss32,Rtt32)",ATTRIBS(),
332 "vector dual mpy bytes",
333{
334  fSETWORD(0,RddV,(fMPY16SS(fGETUBYTE(0,RssV),fGETUBYTE(0,RttV)) +
335                   fMPY16SS(fGETUBYTE(1,RssV),fGETUBYTE(1,RttV)) +
336                   fMPY16SS(fGETUBYTE(2,RssV),fGETUBYTE(2,RttV)) +
337                   fMPY16SS(fGETUBYTE(3,RssV),fGETUBYTE(3,RttV))));
338  fSETWORD(1,RddV,(fMPY16SS(fGETUBYTE(4,RssV),fGETUBYTE(4,RttV)) +
339                   fMPY16SS(fGETUBYTE(5,RssV),fGETUBYTE(5,RttV)) +
340                   fMPY16SS(fGETUBYTE(6,RssV),fGETUBYTE(6,RttV)) +
341                   fMPY16SS(fGETUBYTE(7,RssV),fGETUBYTE(7,RttV))));
342 })
343
/*
 * M5_vrmacbuu: accumulating counterpart of "Rdd32=vrmpybu".
 * Word 0 of RxxV gains the sum of the fMPY16SS products of bytes 0..3 of
 * RssV and RttV (both read with fGETUBYTE); word 1 gains the sum over
 * bytes 4..7. No saturation wrapper is applied.
 */
344Q6INSN(M5_vrmacbuu,"Rxx32+=vrmpybu(Rss32,Rtt32)",ATTRIBS(),
345 "vector dual mac bytes",
346{
347  fSETWORD(0,RxxV,(fGETWORD(0,RxxV) +
348                   fMPY16SS(fGETUBYTE(0,RssV),fGETUBYTE(0,RttV)) +
349                   fMPY16SS(fGETUBYTE(1,RssV),fGETUBYTE(1,RttV)) +
350                   fMPY16SS(fGETUBYTE(2,RssV),fGETUBYTE(2,RttV)) +
351                   fMPY16SS(fGETUBYTE(3,RssV),fGETUBYTE(3,RttV))));
352  fSETWORD(1,RxxV,(fGETWORD(1,RxxV) +
353                   fMPY16SS(fGETUBYTE(4,RssV),fGETUBYTE(4,RttV)) +
354                   fMPY16SS(fGETUBYTE(5,RssV),fGETUBYTE(5,RttV)) +
355                   fMPY16SS(fGETUBYTE(6,RssV),fGETUBYTE(6,RttV)) +
356                   fMPY16SS(fGETUBYTE(7,RssV),fGETUBYTE(7,RttV))));
357 })
358
359
/*
 * M5_vrmpybsu: same reduction as "vrmpybu", but the RssV bytes are read with
 * fGETBYTE while the RttV bytes are read with fGETUBYTE.
 * Word 0 of RddV sums the products of bytes 0..3, word 1 those of bytes 4..7.
 */
360Q6INSN(M5_vrmpybsu,"Rdd32=vrmpybsu(Rss32,Rtt32)",ATTRIBS(),
361 "vector dual mpy bytes",
362{
363  fSETWORD(0,RddV,(fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
364                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)) +
365                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
366                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV))));
367  fSETWORD(1,RddV,(fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
368                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)) +
369                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
370                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV))));
371 })
372
/*
 * M5_vrmacbsu: accumulating counterpart of "Rdd32=vrmpybsu".
 * Each word of RxxV gains the sum of four fMPY16SS products, with RssV bytes
 * read via fGETBYTE and RttV bytes via fGETUBYTE (bytes 0..3 for word 0,
 * bytes 4..7 for word 1).
 */
373Q6INSN(M5_vrmacbsu,"Rxx32+=vrmpybsu(Rss32,Rtt32)",ATTRIBS(),
374 "vector dual mac bytes",
375{
376  fSETWORD(0,RxxV,(fGETWORD(0,RxxV) +
377                   fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
378                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)) +
379                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
380                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV))));
381  fSETWORD(1,RxxV,(fGETWORD(1,RxxV) +
382                   fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
383                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)) +
384                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
385                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV))));
386 })
387
388
/*
 * M5_vmpybuu: element-wise byte multiply. For i = 0..3, half i of RddV is set
 * to fMPY16SS(fGETUBYTE(i,RsV), fGETUBYTE(i,RtV)).
 */
389Q6INSN(M5_vmpybuu,"Rdd32=vmpybu(Rs32,Rt32)",ATTRIBS(),
390 "vector mpy bytes",
391{
392  fSETHALF(0,RddV,(fMPY16SS(fGETUBYTE(0,RsV),fGETUBYTE(0,RtV))));
393  fSETHALF(1,RddV,(fMPY16SS(fGETUBYTE(1,RsV),fGETUBYTE(1,RtV))));
394  fSETHALF(2,RddV,(fMPY16SS(fGETUBYTE(2,RsV),fGETUBYTE(2,RtV))));
395  fSETHALF(3,RddV,(fMPY16SS(fGETUBYTE(3,RsV),fGETUBYTE(3,RtV))));
396 })
397
/*
 * M5_vmpybsu: element-wise byte multiply. For i = 0..3, half i of RddV is set
 * to fMPY16SS(fGETBYTE(i,RsV), fGETUBYTE(i,RtV)); only the RsV byte uses the
 * fGETBYTE accessor.
 */
398Q6INSN(M5_vmpybsu,"Rdd32=vmpybsu(Rs32,Rt32)",ATTRIBS(),
399 "vector mpy bytes",
400{
401  fSETHALF(0,RddV,(fMPY16SS(fGETBYTE(0,RsV),fGETUBYTE(0,RtV))));
402  fSETHALF(1,RddV,(fMPY16SS(fGETBYTE(1,RsV),fGETUBYTE(1,RtV))));
403  fSETHALF(2,RddV,(fMPY16SS(fGETBYTE(2,RsV),fGETUBYTE(2,RtV))));
404  fSETHALF(3,RddV,(fMPY16SS(fGETBYTE(3,RsV),fGETUBYTE(3,RtV))));
405 })
406
407
/*
 * M5_vmacbuu: accumulating element-wise byte multiply. For i = 0..3, half i
 * of RxxV is set to fGETHALF(i,RxxV) + fMPY16SS(fGETUBYTE(i,RsV),
 * fGETUBYTE(i,RtV)). No saturation wrapper is applied to the sum.
 */
408Q6INSN(M5_vmacbuu,"Rxx32+=vmpybu(Rs32,Rt32)",ATTRIBS(),
409 "vector mac bytes",
410{
411  fSETHALF(0,RxxV,(fGETHALF(0,RxxV)+fMPY16SS(fGETUBYTE(0,RsV),fGETUBYTE(0,RtV))));
412  fSETHALF(1,RxxV,(fGETHALF(1,RxxV)+fMPY16SS(fGETUBYTE(1,RsV),fGETUBYTE(1,RtV))));
413  fSETHALF(2,RxxV,(fGETHALF(2,RxxV)+fMPY16SS(fGETUBYTE(2,RsV),fGETUBYTE(2,RtV))));
414  fSETHALF(3,RxxV,(fGETHALF(3,RxxV)+fMPY16SS(fGETUBYTE(3,RsV),fGETUBYTE(3,RtV))));
415 })
416
/*
 * M5_vmacbsu: accumulating element-wise byte multiply. For i = 0..3, half i
 * of RxxV is set to fGETHALF(i,RxxV) + fMPY16SS(fGETBYTE(i,RsV),
 * fGETUBYTE(i,RtV)). No saturation wrapper is applied to the sum.
 */
417Q6INSN(M5_vmacbsu,"Rxx32+=vmpybsu(Rs32,Rt32)",ATTRIBS(),
418 "vector mac bytes",
419{
420  fSETHALF(0,RxxV,(fGETHALF(0,RxxV)+fMPY16SS(fGETBYTE(0,RsV),fGETUBYTE(0,RtV))));
421  fSETHALF(1,RxxV,(fGETHALF(1,RxxV)+fMPY16SS(fGETBYTE(1,RsV),fGETUBYTE(1,RtV))));
422  fSETHALF(2,RxxV,(fGETHALF(2,RxxV)+fMPY16SS(fGETBYTE(2,RsV),fGETUBYTE(2,RtV))));
423  fSETHALF(3,RxxV,(fGETHALF(3,RxxV)+fMPY16SS(fGETBYTE(3,RsV),fGETUBYTE(3,RtV))));
424 })
425
426
427
/*
 * M5_vdmpybsu: for i = 0..3, half i of RddV is set to
 * fSATN(16, product(2i) + product(2i+1)), where
 * product(k) = fMPY16SS(fGETBYTE(k,RssV), fGETUBYTE(k,RttV)).
 * Adjacent byte pairs are thus reduced into one half each.
 */
428Q6INSN(M5_vdmpybsu,"Rdd32=vdmpybsu(Rss32,Rtt32):sat",ATTRIBS(),
429 "vector quad mpy bytes",
430{
431  fSETHALF(0,RddV,fSATN(16,(fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
432                            fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)))));
433  fSETHALF(1,RddV,fSATN(16,(fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
434                            fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV)))));
435  fSETHALF(2,RddV,fSATN(16,(fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
436                            fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)))));
437  fSETHALF(3,RddV,fSATN(16,(fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
438                            fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV)))));
439 })
440
441
/*
 * M5_vdmacbsu: accumulating counterpart of "Rdd32=vdmpybsu". For i = 0..3,
 * half i of RxxV is set to
 * fSATN(16, fGETHALF(i,RxxV) + product(2i) + product(2i+1)), where
 * product(k) = fMPY16SS(fGETBYTE(k,RssV), fGETUBYTE(k,RttV)).
 * The previous half value is added inside the fSATN argument.
 */
442Q6INSN(M5_vdmacbsu,"Rxx32+=vdmpybsu(Rss32,Rtt32):sat",ATTRIBS(),
443 "vector quad mac bytes",
444{
445  fSETHALF(0,RxxV,fSATN(16,(fGETHALF(0,RxxV) +
446                   fMPY16SS(fGETBYTE(0,RssV),fGETUBYTE(0,RttV)) +
447                   fMPY16SS(fGETBYTE(1,RssV),fGETUBYTE(1,RttV)))));
448  fSETHALF(1,RxxV,fSATN(16,(fGETHALF(1,RxxV) +
449                   fMPY16SS(fGETBYTE(2,RssV),fGETUBYTE(2,RttV)) +
450                   fMPY16SS(fGETBYTE(3,RssV),fGETUBYTE(3,RttV)))));
451  fSETHALF(2,RxxV,fSATN(16,(fGETHALF(2,RxxV) +
452                   fMPY16SS(fGETBYTE(4,RssV),fGETUBYTE(4,RttV)) +
453                   fMPY16SS(fGETBYTE(5,RssV),fGETUBYTE(5,RttV)))));
454  fSETHALF(3,RxxV,fSATN(16,(fGETHALF(3,RxxV) +
455                   fMPY16SS(fGETBYTE(6,RssV),fGETUBYTE(6,RttV)) +
456                   fMPY16SS(fGETBYTE(7,RssV),fGETUBYTE(7,RttV)))));
457 })
458
459
460
461/* Full version */
/*
 * dmpy_sema(N) for M2_vdmacs_s0/_s1 (accumulating, result kept in words).
 * With p(i) = fSCALE(N, fMPY16SS(half i of RssV, half i of RttV)):
 *   word 0 of RxxV = fSAT(word 0 of RxxV + p(0) + p(1))
 *   word 1 of RxxV = fSAT(word 1 of RxxV + p(2) + p(3))
 * Both instructions are registered with an empty help string.
 */
462#undef dmpy_sema
463#define dmpy_sema(N)\
464{ fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
465                     fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)))));\
466  fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
467                     fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV)))));\
468}
469Q6INSN(M2_vdmacs_s0,"Rxx32+=vdmpy(Rss32,Rtt32):sat",ATTRIBS(),    "",dmpy_sema(0))
470Q6INSN(M2_vdmacs_s1,"Rxx32+=vdmpy(Rss32,Rtt32):<<1:sat",ATTRIBS(),"",dmpy_sema(1))
471
/*
 * dmpy_sema(N) for M2_vdmpys_s0/_s1 (non-accumulating).
 * With p(i) = fSCALE(N, fMPY16SS(half i of RssV, half i of RttV)):
 *   word 0 of RddV = fSAT(p(0) + p(1))
 *   word 1 of RddV = fSAT(p(2) + p(3))
 * Both instructions are registered with an empty help string.
 */
472#undef dmpy_sema
473#define dmpy_sema(N)\
474{ fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV))) + \
475              fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)))));\
476  fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV))) + \
477              fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV)))));\
478}
479
480Q6INSN(M2_vdmpys_s0,"Rdd32=vdmpy(Rss32,Rtt32):sat",ATTRIBS(),    "",dmpy_sema(0))
481Q6INSN(M2_vdmpys_s1,"Rdd32=vdmpy(Rss32,Rtt32):<<1:sat",ATTRIBS(),"",dmpy_sema(1))
482
483
484
485/******************************************************/
486/* complex multiply/mac with                          */
487/* real&imag are packed together and always saturated */
488/* to protect against overflow.                       */
489/******************************************************/
490
/*
 * cmpy_sema(N,CONJMINUS,CONJPLUS) for the ":rnd:sat" forms packed into RdV.
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RsV, half b of RtV)):
 *   half 1 of RdV = fGETHALF(1, fSAT(m(1,0) CONJMINUS m(0,1) + 0x8000))
 *   half 0 of RdV = fGETHALF(1, fSAT(m(0,0) CONJPLUS  m(1,1) + 0x8000))
 * CONJMINUS / CONJPLUS are operator tokens spliced into the expression:
 * the plain forms pass (+,-) and the "Rt32*" forms pass (-,+). The
 * parameter names therefore describe the sign used in the "Rt32*" case.
 */
491#undef cmpy_sema
492#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
493{ fSETHALF(1,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
494                                  fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV))) + 0x8000))));\
495  fSETHALF(0,RdV,fGETHALF(1,(fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
496                                  fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))) + 0x8000))));\
497}
498Q6INSN(M2_cmpyrs_s0,"Rd32=cmpy(Rs32,Rt32):rnd:sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
499Q6INSN(M2_cmpyrs_s1,"Rd32=cmpy(Rs32,Rt32):<<1:rnd:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
500
501
502Q6INSN(M2_cmpyrsc_s0,"Rd32=cmpy(Rs32,Rt32*):rnd:sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
503Q6INSN(M2_cmpyrsc_s1,"Rd32=cmpy(Rs32,Rt32*):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
504
505
/*
 * cmpy_sema(N,CONJMINUS,CONJPLUS) for the accumulating ":sat" forms.
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RsV, half b of RtV)):
 *   word 1 of RxxV = fSAT(word 1 of RxxV + m(1,0) CONJMINUS m(0,1))
 *   word 0 of RxxV = fSAT(word 0 of RxxV + m(0,0) CONJPLUS  m(1,1))
 * The plain forms pass (+,-) for the two operator tokens and the "Rt32*"
 * forms pass (-,+).
 */
506#undef cmpy_sema
507#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
508{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
509                                          fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV)))));\
510  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
511                                          fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
512}
513Q6INSN(M2_cmacs_s0,"Rxx32+=cmpy(Rs32,Rt32):sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
514Q6INSN(M2_cmacs_s1,"Rxx32+=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
515
516/* EJP: Need mac versions w/ CONJ T? */
517Q6INSN(M2_cmacsc_s0,"Rxx32+=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
518Q6INSN(M2_cmacsc_s1,"Rxx32+=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
519
520
/*
 * cmpy_sema(N,CONJMINUS,CONJPLUS) for the non-accumulating ":sat" forms that
 * write both words of RddV.
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RsV, half b of RtV)):
 *   word 1 of RddV = fSAT(m(1,0) CONJMINUS m(0,1))
 *   word 0 of RddV = fSAT(m(0,0) CONJPLUS  m(1,1))
 * The plain forms pass (+,-) and the "Rt32*" forms pass (-,+).
 */
521#undef cmpy_sema
522#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
523{ fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
524                       fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV)))));\
525  fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
526                       fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV)))));\
527}
528
529Q6INSN(M2_cmpys_s0,"Rdd32=cmpy(Rs32,Rt32):sat",ATTRIBS(),    "Complex Multiply",cmpy_sema(0,+,-))
530Q6INSN(M2_cmpys_s1,"Rdd32=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(),"Complex Multiply",cmpy_sema(1,+,-))
531
532Q6INSN(M2_cmpysc_s0,"Rdd32=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
533Q6INSN(M2_cmpysc_s1,"Rdd32=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
534
535
536
/*
 * cmpy_sema(N,CONJMINUS,CONJPLUS) for the negate-accumulate ":sat" forms.
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RsV, half b of RtV)):
 *   word 1 of RxxV = fSAT(word 1 of RxxV - (m(1,0) CONJMINUS m(0,1)))
 *   word 0 of RxxV = fSAT(word 0 of RxxV - (m(0,0) CONJPLUS  m(1,1)))
 * The two-product term is parenthesised so the whole term is subtracted.
 * The plain forms pass (+,-) and the "Rt32*" forms pass (-,+).
 */
537#undef cmpy_sema
538#define cmpy_sema(N,CONJMINUS,CONJPLUS)\
539{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) - (fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV))) CONJMINUS \
540                                           fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV))))));\
541  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) - (fSCALE(N,fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV))) CONJPLUS \
542                                           fSCALE(N,fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV))))));\
543}
544Q6INSN(M2_cnacs_s0,"Rxx32-=cmpy(Rs32,Rt32):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,+,-))
545Q6INSN(M2_cnacs_s1,"Rxx32-=cmpy(Rs32,Rt32):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,+,-))
546
547/* EJP: need CONJ versions? */
548Q6INSN(M2_cnacsc_s0,"Rxx32-=cmpy(Rs32,Rt32*):sat",ATTRIBS(A_ARCHV2),    "Complex Multiply",cmpy_sema(0,-,+))
549Q6INSN(M2_cnacsc_s1,"Rxx32-=cmpy(Rs32,Rt32*):<<1:sat",ATTRIBS(A_ARCHV2),"Complex Multiply",cmpy_sema(1,-,+))
550
551
552/******************************************************/
553/* complex interpolation                              */
554/* Given a pair of complex values, scale by a,b, sum  */
555/* Saturate/shift1 and round/pack                     */
556/******************************************************/
557
/*
 * vrcmpys_sema(N,INWORD) for M2_vrcmpys_s1_h/_l (result in RddV).
 * INWORD is an expression whose halves 0 and 1 supply the two multipliers.
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RssV, half b of INWORD)):
 *   word 1 of RddV = fSAT(m(1,0) + m(3,1))
 *   word 0 of RddV = fSAT(m(0,0) + m(2,1))
 * The ":hi" form passes fGETWORD(1,RttV) as INWORD and the ":lo" form passes
 * fGETWORD(0,RttV). Only N = 1 is instantiated.
 */
558#undef vrcmpys_sema
559#define vrcmpys_sema(N,INWORD) \
560{ fSETWORD(1,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
561                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD)))));\
562  fSETWORD(0,RddV,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
563                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD)))));\
564}
565
566
567
568Q6INSN(M2_vrcmpys_s1_h,"Rdd32=vrcmpys(Rss32,Rtt32):<<1:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
569Q6INSN(M2_vrcmpys_s1_l,"Rdd32=vrcmpys(Rss32,Rtt32):<<1:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
570
/*
 * vrcmpys_sema(N,INWORD) for M2_vrcmpys_acc_s1_h/_l (accumulating into RxxV).
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RssV, half b of INWORD)):
 *   word 1 of RxxV = fSAT(word 1 of RxxV + m(1,0) + m(3,1))
 *   word 0 of RxxV = fSAT(word 0 of RxxV + m(0,0) + m(2,1))
 * The ":hi" form passes fGETWORD(1,RttV) as INWORD and the ":lo" form passes
 * fGETWORD(0,RttV). Only N = 1 is instantiated.
 */
571#undef vrcmpys_sema
572#define vrcmpys_sema(N,INWORD) \
573{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
574                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD)))));\
575  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
576                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD)))));\
577}
578
579
580
581Q6INSN(M2_vrcmpys_acc_s1_h,"Rxx32+=vrcmpys(Rss32,Rtt32):<<1:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
582Q6INSN(M2_vrcmpys_acc_s1_l,"Rxx32+=vrcmpys(Rss32,Rtt32):<<1:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
583
/*
 * vrcmpys_sema(N,INWORD) for M2_vrcmpys_s1rp_h/_l (":rnd:sat", packed in RdV).
 * With m(a,b) = fSCALE(N, fMPY16SS(half a of RssV, half b of INWORD)):
 *   half 1 of RdV = fGETHALF(1, fSAT(m(1,0) + m(3,1) + 0x8000))
 *   half 0 of RdV = fGETHALF(1, fSAT(m(0,0) + m(2,1) + 0x8000))
 * The 0x8000 is added inside the fSAT argument and only half index 1 of the
 * fSAT result is kept. The ":hi" form passes fGETWORD(1,RttV) as INWORD and
 * the ":lo" form passes fGETWORD(0,RttV).
 */
584#undef vrcmpys_sema
585#define vrcmpys_sema(N,INWORD) \
586{ fSETHALF(1,RdV,fGETHALF(1,fSAT(fSCALE(N,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,INWORD))) + \
587                       fSCALE(N,fMPY16SS(fGETHALF(3,RssV),fGETHALF(1,INWORD))) + 0x8000)));\
588  fSETHALF(0,RdV,fGETHALF(1,fSAT(fSCALE(N,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,INWORD))) + \
589                       fSCALE(N,fMPY16SS(fGETHALF(2,RssV),fGETHALF(1,INWORD))) + 0x8000)));\
590}
591
592Q6INSN(M2_vrcmpys_s1rp_h,"Rd32=vrcmpys(Rss32,Rtt32):<<1:rnd:sat:raw:hi",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(1,RttV)))
593Q6INSN(M2_vrcmpys_s1rp_l,"Rd32=vrcmpys(Rss32,Rtt32):<<1:rnd:sat:raw:lo",ATTRIBS(A_ARCHV3), "Vector Reduce Complex Multiply by Scalar",vrcmpys_sema(1,fGETWORD(0,RttV)))
594
595/**************************************************************/
596/* mixed mode 32x16 vector dual multiplies                    */
597/*                                                            */
598/**************************************************************/
599
600/* SIGNED 32 x SIGNED 16 */
601
602
/*
 * mixmpy_sema(N) for M2_mmacls_s0/_s1 ("Rxx32+=vmpyweh ... :sat").
 *   word 1 of RxxV = fSAT(word 1 of RxxV +
 *       (fSCALE(N, fMPY3216SS(word 1 of RssV, half 2 of RttV)) >> 16))
 *   word 0 of RxxV = fSAT(word 0 of RxxV +
 *       (fSCALE(N, fMPY3216SS(word 0 of RssV, half 0 of RttV)) >> 16))
 * The scaled product is shifted right by 16 before it is accumulated.
 */
603#undef mixmpy_sema
604#define mixmpy_sema(N)\
605{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))))>>16)) ); \
606  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV))))>>16)) ); \
607}
608Q6INSN(M2_mmacls_s0,"Rxx32+=vmpyweh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
609Q6INSN(M2_mmacls_s1,"Rxx32+=vmpyweh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
610
/*
 * mixmpy_sema(N) for M2_mmachs_s0/_s1 ("Rxx32+=vmpywoh ... :sat").
 * Same shape as the "vmpyweh" accumulate form, but the RttV multipliers are
 * half 3 (for word 1) and half 1 (for word 0):
 *   word i of RxxV = fSAT(word i of RxxV +
 *       (fSCALE(N, fMPY3216SS(word i of RssV, half 2i+1 of RttV)) >> 16))
 */
611#undef mixmpy_sema
612#define mixmpy_sema(N)\
613{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))))>>16) )); \
614  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV))))>>16 ))); \
615}
616Q6INSN(M2_mmachs_s0,"Rxx32+=vmpywoh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
617Q6INSN(M2_mmachs_s1,"Rxx32+=vmpywoh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
618
/*
 * mixmpy_sema(N) for M2_mmpyl_s0/_s1 ("Rdd32=vmpyweh ... :sat").
 *   word 1 of RddV = fSAT(fSCALE(N, fMPY3216SS(word 1 of RssV,
 *                                              half 2 of RttV)) >> 16)
 *   word 0 of RddV = fSAT(fSCALE(N, fMPY3216SS(word 0 of RssV,
 *                                              half 0 of RttV)) >> 16)
 */
619#undef mixmpy_sema
620#define mixmpy_sema(N)\
621{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))))>>16)); \
622  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV))))>>16)); \
623}
624Q6INSN(M2_mmpyl_s0,"Rdd32=vmpyweh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
625Q6INSN(M2_mmpyl_s1,"Rdd32=vmpyweh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
626
/*
 * Rdd = vmpywoh(Rss,Rtt)[:<<1]:sat  (non-accumulating)
 * As the vmpyweh form, but using the odd halves of Rtt:
 * fGETHALF(3,RttV) with word 1 of Rss, fGETHALF(1,RttV) with word 0.
 */
627#undef mixmpy_sema
628#define mixmpy_sema(N)\
629{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))))>>16)); \
630  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV))))>>16)); \
631}
632Q6INSN(M2_mmpyh_s0,"Rdd32=vmpywoh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
633Q6INSN(M2_mmpyh_s1,"Rdd32=vmpywoh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
634
635
636/* With rounding */
637
/*
 * Rxx += vmpyweh(Rss,Rtt)[:<<1]:rnd:sat
 * Rounding variant of the accumulate form: 0x8000 is added to the scaled
 * product before the >>16, then the shifted value is added to
 * fGETWORD(i,RxxV) and the sum goes through fSAT.  Even halves (2, 0) of Rtt.
 */
638#undef mixmpy_sema
639#define mixmpy_sema(N)\
640{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV)))+0x8000)>>16)) ); \
641  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)))+0x8000)>>16)) ); \
642}
643Q6INSN(M2_mmacls_rs0,"Rxx32+=vmpyweh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
644Q6INSN(M2_mmacls_rs1,"Rxx32+=vmpyweh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
645
/*
 * Rxx += vmpywoh(Rss,Rtt)[:<<1]:rnd:sat
 * Rounding accumulate form using the odd halves (3, 1) of Rtt; 0x8000 is
 * added to the scaled product before the >>16.
 */
646#undef mixmpy_sema
647#define mixmpy_sema(N)\
648{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV)))+0x8000)>>16) )); \
649  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)))+0x8000)>>16 ))); \
650}
651Q6INSN(M2_mmachs_rs0,"Rxx32+=vmpywoh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
652Q6INSN(M2_mmachs_rs1,"Rxx32+=vmpywoh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
653
/*
 * Rdd = vmpyweh(Rss,Rtt)[:<<1]:rnd:sat
 * Non-accumulating rounding form: fSAT((P + 0x8000) >> 16) per word, with
 * P built from word i of Rss and the even half 2*i of Rtt.
 */
654#undef mixmpy_sema
655#define mixmpy_sema(N)\
656{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV)))+0x8000)>>16)); \
657  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)))+0x8000)>>16)); \
658}
659Q6INSN(M2_mmpyl_rs0,"Rdd32=vmpyweh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
660Q6INSN(M2_mmpyl_rs1,"Rdd32=vmpyweh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
661
/*
 * Rdd = vmpywoh(Rss,Rtt)[:<<1]:rnd:sat
 * Non-accumulating rounding form using the odd halves (3, 1) of Rtt.
 */
662#undef mixmpy_sema
663#define mixmpy_sema(N)\
664{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV)))+0x8000)>>16)); \
665  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)))+0x8000)>>16)); \
666}
667Q6INSN(M2_mmpyh_rs0,"Rdd32=vmpywoh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
668Q6INSN(M2_mmpyh_rs1,"Rdd32=vmpywoh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
669
670
/*
 * vrmpyweh: reducing form.  The two scaled products
 *   fSCALE(N, fMPY3216SS(fGETWORD(1,RssV), fGETHALF(2,RttV))) and
 *   fSCALE(N, fMPY3216SS(fGETWORD(0,RssV), fGETHALF(0,RttV)))
 * are summed and applied to DEST with the operator EQUALS.  No >>16 and no
 * fSAT is applied here.  Instantiated as (RddV, =) for the plain forms and
 * (RxxV, +=) for the accumulating forms; N is 0 or 1 (":<<1").
 */
671#undef mixmpy_sema
672#define mixmpy_sema(DEST,EQUALS,N)\
673{ DEST EQUALS fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(2,RttV))) + fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RttV)));}
674
675Q6INSN(M4_vrmpyeh_s0,"Rdd32=vrmpyweh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RddV,=,0))
676Q6INSN(M4_vrmpyeh_s1,"Rdd32=vrmpyweh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RddV,=,1))
677Q6INSN(M4_vrmpyeh_acc_s0,"Rxx32+=vrmpyweh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RxxV,+=,0))
678Q6INSN(M4_vrmpyeh_acc_s1,"Rxx32+=vrmpyweh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RxxV,+=,1))
679
/*
 * vrmpywoh: same reducing form as vrmpyweh, but with the odd halves of Rtt:
 * fGETHALF(3,RttV) multiplies word 1 of Rss and fGETHALF(1,RttV) word 0.
 * DEST/EQUALS select plain assignment (RddV, =) or accumulation (RxxV, +=).
 */
680#undef mixmpy_sema
681#define mixmpy_sema(DEST,EQUALS,N)\
682{ DEST EQUALS fSCALE(N,fMPY3216SS(fGETWORD(1,RssV),fGETHALF(3,RttV))) + fSCALE(N,fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RttV)));}
683
684Q6INSN(M4_vrmpyoh_s0,"Rdd32=vrmpywoh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RddV,=,0))
685Q6INSN(M4_vrmpyoh_s1,"Rdd32=vrmpywoh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RddV,=,1))
686Q6INSN(M4_vrmpyoh_acc_s0,"Rxx32+=vrmpywoh(Rss32,Rtt32)",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(RxxV,+=,0))
687Q6INSN(M4_vrmpyoh_acc_s1,"Rxx32+=vrmpywoh(Rss32,Rtt32):<<1",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(RxxV,+=,1))
688
689
690
691
692
693
/*
 * Scalar 32x16 multiply: Rd = mpy(Rs, Rt.L/H):<<1[:rnd]:sat
 *   N   - fSCALE argument (always 1 in the instantiations below)
 *   H   - halfword index passed to fGETHALF on RtV (0 for Rt.L, 1 for Rt.H)
 *   RND - token sequence pasted after the scaled product, before the >>16:
 *         "+0x8000" for the :rnd forms, left empty for the non-rounding forms
 * The shifted value goes through fSAT before being assigned to RdV.
 */
694#undef mixmpy_sema
695#define mixmpy_sema(N,H,RND)\
696{  RdV = fSAT((fSCALE(N,fMPY3216SS(RsV,fGETHALF(H,RtV)))RND)>>16); \
697}
698Q6INSN(M2_hmmpyl_rs1,"Rd32=mpy(Rs32,Rt.L32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,0,+0x8000))
699Q6INSN(M2_hmmpyh_rs1,"Rd32=mpy(Rs32,Rt.H32):<<1:rnd:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,1,+0x8000))
700Q6INSN(M2_hmmpyl_s1,"Rd32=mpy(Rs32,Rt.L32):<<1:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,0,))
701Q6INSN(M2_hmmpyh_s1,"Rd32=mpy(Rs32,Rt.H32):<<1:sat",ATTRIBS(A_ARCHV2),"Mixed Precision Multiply",mixmpy_sema(1,1,))
702
703
704
705
706
707
708
709
710
711/* SIGNED 32 x UNSIGNED 16 */
712
/*
 * Rxx += vmpyweuh(Rss,Rtt)[:<<1]:sat
 * Signed-by-unsigned counterpart of the vmpyweh accumulate form: the product
 * is fMPY3216SU and the 16-bit operand is read with fGETUHALF (even halves
 * 2 and 0 of Rtt).  N is the fSCALE argument (0, or 1 for ":<<1").
 */
713#undef mixmpy_sema
714#define mixmpy_sema(N)\
715{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV))))>>16)) ); \
716  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV))))>>16)) ); \
717}
718Q6INSN(M2_mmaculs_s0,"Rxx32+=vmpyweuh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
719Q6INSN(M2_mmaculs_s1,"Rxx32+=vmpyweuh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
720
/*
 * Rxx += vmpywouh(Rss,Rtt)[:<<1]:sat
 * Signed-by-unsigned accumulate form using the odd halves of Rtt
 * (fGETUHALF(3,RttV) with word 1 of Rss, fGETUHALF(1,RttV) with word 0).
 */
721#undef mixmpy_sema
722#define mixmpy_sema(N)\
723{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV))))>>16) )); \
724  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV))))>>16 ))); \
725}
726Q6INSN(M2_mmacuhs_s0,"Rxx32+=vmpywouh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
727Q6INSN(M2_mmacuhs_s1,"Rxx32+=vmpywouh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
728
/*
 * Rdd = vmpyweuh(Rss,Rtt)[:<<1]:sat
 * Non-accumulating signed-by-unsigned form: fSAT(P >> 16) per word, with
 * P = fSCALE(N, fMPY3216SU(fGETWORD(i,RssV), fGETUHALF(2*i,RttV))).
 */
729#undef mixmpy_sema
730#define mixmpy_sema(N)\
731{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV))))>>16)); \
732  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV))))>>16)); \
733}
734Q6INSN(M2_mmpyul_s0,"Rdd32=vmpyweuh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
735Q6INSN(M2_mmpyul_s1,"Rdd32=vmpyweuh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
736
/*
 * Rdd = vmpywouh(Rss,Rtt)[:<<1]:sat
 * Non-accumulating signed-by-unsigned form using the odd halves (3, 1)
 * of Rtt read with fGETUHALF.
 */
737#undef mixmpy_sema
738#define mixmpy_sema(N)\
739{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV))))>>16)); \
740  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV))))>>16)); \
741}
742Q6INSN(M2_mmpyuh_s0,"Rdd32=vmpywouh(Rss32,Rtt32):sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
743Q6INSN(M2_mmpyuh_s1,"Rdd32=vmpywouh(Rss32,Rtt32):<<1:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
744
745
746/* With rounding */
747
/*
 * Rxx += vmpyweuh(Rss,Rtt)[:<<1]:rnd:sat
 * Rounding signed-by-unsigned accumulate form: 0x8000 is added to the scaled
 * fMPY3216SU product before the >>16; even halves (2, 0) of Rtt.
 */
748#undef mixmpy_sema
749#define mixmpy_sema(N)\
750{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV)))+0x8000)>>16)) ); \
751  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV)))+0x8000)>>16)) ); \
752}
753Q6INSN(M2_mmaculs_rs0,"Rxx32+=vmpyweuh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
754Q6INSN(M2_mmaculs_rs1,"Rxx32+=vmpyweuh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
755
/*
 * Rxx += vmpywouh(Rss,Rtt)[:<<1]:rnd:sat
 * Rounding signed-by-unsigned accumulate form using the odd halves (3, 1)
 * of Rtt; 0x8000 is added before the >>16.
 */
756#undef mixmpy_sema
757#define mixmpy_sema(N)\
758{ fSETWORD(1,RxxV,fSAT(fGETWORD(1,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV)))+0x8000)>>16) )); \
759  fSETWORD(0,RxxV,fSAT(fGETWORD(0,RxxV) + ((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV)))+0x8000)>>16 ))); \
760}
761Q6INSN(M2_mmacuhs_rs0,"Rxx32+=vmpywouh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
762Q6INSN(M2_mmacuhs_rs1,"Rxx32+=vmpywouh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
763
/*
 * Rdd = vmpyweuh(Rss,Rtt)[:<<1]:rnd:sat
 * Non-accumulating rounding signed-by-unsigned form:
 * fSAT((P + 0x8000) >> 16) per word, even halves (2, 0) of Rtt.
 */
764#undef mixmpy_sema
765#define mixmpy_sema(N)\
766{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(2,RttV)))+0x8000)>>16)); \
767  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(0,RttV)))+0x8000)>>16)); \
768}
769Q6INSN(M2_mmpyul_rs0,"Rdd32=vmpyweuh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
770Q6INSN(M2_mmpyul_rs1,"Rdd32=vmpyweuh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
771
/*
 * Rdd = vmpywouh(Rss,Rtt)[:<<1]:rnd:sat
 * Non-accumulating rounding signed-by-unsigned form using the odd halves
 * (3, 1) of Rtt.
 */
772#undef mixmpy_sema
773#define mixmpy_sema(N)\
774{ fSETWORD(1,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(1,RssV),fGETUHALF(3,RttV)))+0x8000)>>16)); \
775  fSETWORD(0,RddV,fSAT((fSCALE(N,fMPY3216SU(fGETWORD(0,RssV),fGETUHALF(1,RttV)))+0x8000)>>16)); \
776}
777Q6INSN(M2_mmpyuh_rs0,"Rdd32=vmpywouh(Rss32,Rtt32):rnd:sat",ATTRIBS(),    "Mixed Precision Multiply",mixmpy_sema(0))
778Q6INSN(M2_mmpyuh_rs1,"Rdd32=vmpywouh(Rss32,Rtt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Multiply",mixmpy_sema(1))
779
780
781/**************************************************************/
782/* complex mac with full 64-bit accum - no sat, no shift      */
783/* either do real or imag, never both                         */
784/**************************************************************/
785
/*
 * Rxx += vrcmpyi(Rss,Rtt): adds four cross products to RxxV, pairing halves
 * (1,0), (0,1), (3,2) and (2,3) of Rss and Rtt.  All four terms are added.
 */
786Q6INSN(M2_vrcmaci_s0,"Rxx32+=vrcmpyi(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mac Imaginary",
787{
788RxxV = RxxV + fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
789              fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
790              fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
791              fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
792})
793
/*
 * Rxx += vrcmpyr(Rss,Rtt): adds same-index products to RxxV, with the
 * products of halves 0 and 2 added and those of halves 1 and 3 subtracted.
 */
794Q6INSN(M2_vrcmacr_s0,"Rxx32+=vrcmpyr(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mac Real",
795{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
796                fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
797                fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
798                fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
799})
800
/*
 * Rxx += vrcmpyi(Rss,Rtt*): the "Rtt*" form of vrcmpyi.  Same cross products
 * as M2_vrcmaci_s0, but the (0,1) and (2,3) terms are subtracted rather
 * than added.
 */
801Q6INSN(M2_vrcmaci_s0c,"Rxx32+=vrcmpyi(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mac Imaginary",
802{
803RxxV = RxxV + fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) - \
804              fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
805              fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) - \
806              fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
807})
808
/*
 * Rxx += vrcmpyr(Rss,Rtt*): the "Rtt*" form of vrcmpyr.  All four same-index
 * products (halves 0..3) are added to RxxV.
 */
809Q6INSN(M2_vrcmacr_s0c,"Rxx32+=vrcmpyr(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mac Real",
810{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) + \
811                fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
812                fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) + \
813                fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
814})
815
/*
 * Rxx += cmpyi(Rs,Rt): adds the two cross products
 * half1(Rs)*half0(Rt) + half0(Rs)*half1(Rt) to RxxV.
 */
816Q6INSN(M2_cmaci_s0,"Rxx32+=cmpyi(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mac Imaginary",
817{
818RxxV = RxxV + fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV)) + \
819              fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV));
820})
821
/*
 * Rxx += cmpyr(Rs,Rt): adds half0(Rs)*half0(Rt) - half1(Rs)*half1(Rt)
 * to RxxV.
 */
822Q6INSN(M2_cmacr_s0,"Rxx32+=cmpyr(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mac Real",
823{ RxxV = RxxV + fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)) - \
824                fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV));
825})
826
827
/*
 * Rdd = vrcmpyi(Rss,Rtt): non-accumulating counterpart of M2_vrcmaci_s0.
 * RddV is the sum of the cross products (1,0), (0,1), (3,2) and (2,3).
 */
828Q6INSN(M2_vrcmpyi_s0,"Rdd32=vrcmpyi(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mpy Imaginary",
829{
830RddV = fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
831       fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
832       fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
833       fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
834})
835
/*
 * Rdd = vrcmpyr(Rss,Rtt): non-accumulating counterpart of M2_vrcmacr_s0.
 * Same-index products with halves 0 and 2 added, halves 1 and 3 subtracted.
 */
836Q6INSN(M2_vrcmpyr_s0,"Rdd32=vrcmpyr(Rss32,Rtt32)",ATTRIBS(),"Vector Complex Mpy Real",
837{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
838         fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
839         fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
840         fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
841})
842
/*
 * Rdd = vrcmpyi(Rss,Rtt*): "Rtt*" form; the (0,1) and (2,3) cross products
 * are subtracted from the (1,0) and (3,2) cross products.
 */
843Q6INSN(M2_vrcmpyi_s0c,"Rdd32=vrcmpyi(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mpy Imaginary",
844{
845RddV = fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) - \
846       fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV)) + \
847       fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) - \
848       fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV));\
849})
850
/*
 * Rdd = vrcmpyr(Rss,Rtt*): "Rtt*" form; RddV is the sum of all four
 * same-index halfword products.
 */
851Q6INSN(M2_vrcmpyr_s0c,"Rdd32=vrcmpyr(Rss32,Rtt32*)",ATTRIBS(A_ARCHV2),"Vector Complex Mpy Real",
852{ RddV = fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) + \
853         fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV)) + \
854         fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) + \
855         fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV));\
856})
857
/*
 * Rdd = cmpyi(Rs,Rt): RddV = half1(Rs)*half0(Rt) + half0(Rs)*half1(Rt).
 */
858Q6INSN(M2_cmpyi_s0,"Rdd32=cmpyi(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mpy Imaginary",
859{
860RddV = fMPY16SS(fGETHALF(1,RsV),fGETHALF(0,RtV)) + \
861       fMPY16SS(fGETHALF(0,RsV),fGETHALF(1,RtV));
862})
863
/*
 * Rdd = cmpyr(Rs,Rt): RddV = half0(Rs)*half0(Rt) - half1(Rs)*half1(Rt).
 */
864Q6INSN(M2_cmpyr_s0,"Rdd32=cmpyr(Rs32,Rt32)",ATTRIBS(),"Vector Complex Mpy Real",
865{ RddV = fMPY16SS(fGETHALF(0,RsV),fGETHALF(0,RtV)) - \
866         fMPY16SS(fGETHALF(1,RsV),fGETHALF(1,RtV));
867})
868
869
870/**************************************************************/
871/* Complex mpy/mac with 2x32 bit accum, sat, shift            */
872/* 32x16 real or imag                                         */
873/**************************************************************/
874
875#if 1
876
/*
 * Rd = cmpyiwh(Rss,Rt):<<1:rnd:sat
 * Sum of word0(Rss)*half1(Rt) and word1(Rss)*half0(Rt), plus 0x4000, shifted
 * right by 15 and passed through fSAT.
 * NOTE(review): the "+0x4000 ... >>15" pair presumably implements the
 * ":<<1:rnd" of the syntax in one step -- confirm against the ISA manual.
 */
877Q6INSN(M4_cmpyi_wh,"Rd32=cmpyiwh(Rss32,Rt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
878{
879 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RtV))
880               + fMPY3216SS(fGETWORD(1,RssV),fGETHALF(0,RtV))
881               + 0x4000)>>15);
882})
883
884
/*
 * Rd = cmpyrwh(Rss,Rt):<<1:rnd:sat
 * word0(Rss)*half0(Rt) minus word1(Rss)*half1(Rt), plus 0x4000, shifted
 * right by 15 and passed through fSAT.
 */
885Q6INSN(M4_cmpyr_wh,"Rd32=cmpyrwh(Rss32,Rt32):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
886{
887 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RtV))
888               - fMPY3216SS(fGETWORD(1,RssV),fGETHALF(1,RtV))
889               + 0x4000)>>15);
890})
891
/*
 * Rd = cmpyiwh(Rss,Rt*):<<1:rnd:sat
 * "Rt*" form: word1(Rss)*half0(Rt) minus word0(Rss)*half1(Rt), plus 0x4000,
 * shifted right by 15 and passed through fSAT.
 */
892Q6INSN(M4_cmpyi_whc,"Rd32=cmpyiwh(Rss32,Rt32*):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
893{
894 RdV = fSAT(  (  fMPY3216SS(fGETWORD(1,RssV),fGETHALF(0,RtV))
895               - fMPY3216SS(fGETWORD(0,RssV),fGETHALF(1,RtV))
896               + 0x4000)>>15);
897})
898
899
/*
 * Rd = cmpyrwh(Rss,Rt*):<<1:rnd:sat
 * "Rt*" form: word0(Rss)*half0(Rt) plus word1(Rss)*half1(Rt), plus 0x4000,
 * shifted right by 15 and passed through fSAT.
 */
900Q6INSN(M4_cmpyr_whc,"Rd32=cmpyrwh(Rss32,Rt32*):<<1:rnd:sat",ATTRIBS(),"Mixed Precision Complex Multiply",
901{
902 RdV = fSAT(  (  fMPY3216SS(fGETWORD(0,RssV),fGETHALF(0,RtV))
903               + fMPY3216SS(fGETWORD(1,RssV),fGETHALF(1,RtV))
904               + 0x4000)>>15);
905})
906
907
908#endif
909
910/**************************************************************/
911/* Vector mpy/mac with 2x32 bit accum, sat, shift             */
912/* either do real or imag,  never both                        */
913/**************************************************************/
914
/*
 * VCMPYSEMI: semantics body for the vcmpyi instructions.
 *   DST        - destination register pair written with fSETWORD
 *   ACC0, ACC1 - token sequences pasted in front of the scaled product for
 *                word 0 / word 1 (empty, or an "<accumulator> +" prefix)
 *   SHIFT      - first argument of fSCALE
 *   SAT        - macro applied to each word before it is stored
 * Word 0 uses the cross products of halves (1,0) and (0,1); word 1 uses
 * (3,2) and (2,3).  The definition ends with a line continuation, so the
 * line that follows it must stay empty.
 */
915#undef VCMPYSEMI
916#define VCMPYSEMI(DST,ACC0,ACC1,SHIFT,SAT) \
917    fSETWORD(0,DST,SAT(ACC0 fSCALE(SHIFT,fMPY16SS(fGETHALF(1,RssV),fGETHALF(0,RttV)) + \
918        fMPY16SS(fGETHALF(0,RssV),fGETHALF(1,RttV))))); \
919    fSETWORD(1,DST,SAT(ACC1 fSCALE(SHIFT,fMPY16SS(fGETHALF(3,RssV),fGETHALF(2,RttV)) + \
920        fMPY16SS(fGETHALF(2,RssV),fGETHALF(3,RttV))))); \
921
/*
 * VCMPYSEMR: semantics body for the vcmpyr instructions; parameters as for
 * VCMPYSEMI.  Word 0 is built from half0*half0 - half1*half1 and word 1 from
 * half2*half2 - half3*half3 of Rss/Rtt.  The definition ends with a line
 * continuation, so the line that follows it must stay empty.
 */
922#undef VCMPYSEMR
923#define VCMPYSEMR(DST,ACC0,ACC1,SHIFT,SAT) \
924    fSETWORD(0,DST,SAT(ACC0 fSCALE(SHIFT,fMPY16SS(fGETHALF(0,RssV),fGETHALF(0,RttV)) - \
925        fMPY16SS(fGETHALF(1,RssV),fGETHALF(1,RttV))))); \
926    fSETWORD(1,DST,SAT(ACC1 fSCALE(SHIFT,fMPY16SS(fGETHALF(2,RssV),fGETHALF(2,RttV)) - \
927        fMPY16SS(fGETHALF(3,RssV),fGETHALF(3,RttV))))); \
928
929
/*
 * VCMPYIR: emits a pair of instruction definitions, M2_<TAGBASE>i (vcmpyi,
 * body from VCMPYSEMI) and M2_<TAGBASE>r (vcmpyr, body from VCMPYSEMR).
 *   DSTSYN / DSTVAL        - destination as syntax string / semantic operand
 *   ACCSEM                 - assignment operator string used in the syntax
 *   ACCVAL0 / ACCVAL1      - accumulator prefixes forwarded as ACC0 / ACC1
 *   SHIFTSYN / SHIFTVAL    - shift suffix string / fSCALE argument
 *   SATSYN / SATVAL        - saturation suffix string / saturation macro
 * The vcmpyr definition is described as "Real"; it previously repeated the
 * "Imaginary" description of the vcmpyi definition.
 */
930#undef VCMPYIR
931#define VCMPYIR(TAGBASE,DSTSYN,DSTVAL,ACCSEM,ACCVAL0,ACCVAL1,SHIFTSYN,SHIFTVAL,SATSYN,SATVAL) \
932Q6INSN(M2_##TAGBASE##i,DSTSYN ACCSEM "vcmpyi(Rss32,Rtt32)" SHIFTSYN SATSYN,ATTRIBS(A_ARCHV2), \
933    "Vector Complex Multiply Imaginary", { VCMPYSEMI(DSTVAL,ACCVAL0,ACCVAL1,SHIFTVAL,SATVAL); }) \
934Q6INSN(M2_##TAGBASE##r,DSTSYN ACCSEM "vcmpyr(Rss32,Rtt32)" SHIFTSYN SATSYN,ATTRIBS(A_ARCHV2), \
935    "Vector Complex Multiply Real", { VCMPYSEMR(DSTVAL,ACCVAL0,ACCVAL1,SHIFTVAL,SATVAL); })
936
937
/*
 * Instantiations of VCMPYIR; each line yields an "...i" and an "...r"
 * instruction:
 *   vcmpy_s0_sat_ : Rdd = ...:sat       (no accumulator prefix, SHIFTVAL 0)
 *   vcmpy_s1_sat_ : Rdd = ...:<<1:sat   (no accumulator prefix, SHIFTVAL 1)
 *   vcmac_s0_sat_ : Rxx += ...:sat      (prefixes "fGETWORD(n,RxxV) +")
 * All three pass fSAT as the saturation macro.
 */
938VCMPYIR(vcmpy_s0_sat_,"Rdd32",RddV,"=",,,"",0,":sat",fSAT)
939VCMPYIR(vcmpy_s1_sat_,"Rdd32",RddV,"=",,,":<<1",1,":sat",fSAT)
940VCMPYIR(vcmac_s0_sat_,"Rxx32",RxxV,"+=",fGETWORD(0,RxxV) + ,fGETWORD(1,RxxV) + ,"",0,":sat",fSAT)
941
942
943/**********************************************************************
944 *  Rotation  -- by 0, 90, 180, or 270 means mult by 1, J, -1, -J     *
945 *********************************************************************/
946
/*
 * Rdd = vcrotate(Rss,Rt): two independent 2-bit selectors taken from Rt
 * choose how each halfword pair of Rss is copied to Rdd.  Bits 1:0 of RtV
 * control halfwords 0/1, bits 3:2 control halfwords 2/3.  For a pair
 * (lo, hi) the stored pair is:
 *   0 -> (lo, hi)    1 -> (hi, -lo)    2 -> (-hi, lo)    3 -> (-lo, -hi)
 * Every negated value goes through fSATH before it is stored.  The
 * "tmp != 3" checks in the final branches are wrapped in fHIDE and call
 * fatal() if the selector holds anything other than 0..3.
 */
947Q6INSN(S2_vcrotate,"Rdd32=vcrotate(Rss32,Rt32)",ATTRIBS(A_ARCHV2),"Rotate complex value by multiple of PI/2",
948{
949    fHIDE(size1u_t tmp;)
950    tmp = fEXTRACTU_RANGE(RtV,1,0);
951    if (tmp == 0) { /* No rotation */
952        fSETHALF(0,RddV,fGETHALF(0,RssV));
953        fSETHALF(1,RddV,fGETHALF(1,RssV));
954    } else if (tmp == 1) { /* Multiply by -J */
955        fSETHALF(0,RddV,fGETHALF(1,RssV));
956        fSETHALF(1,RddV,fSATH(-fGETHALF(0,RssV)));
957    } else if (tmp == 2) { /* Multiply by J */
958        fSETHALF(0,RddV,fSATH(-fGETHALF(1,RssV)));
959        fSETHALF(1,RddV,fGETHALF(0,RssV));
960    } else { /* Multiply by -1 */
961        fHIDE(if (tmp != 3) fatal("C is broken");)
962        fSETHALF(0,RddV,fSATH(-fGETHALF(0,RssV)));
963        fSETHALF(1,RddV,fSATH(-fGETHALF(1,RssV)));
964    }
965    tmp = fEXTRACTU_RANGE(RtV,3,2);
966    if (tmp == 0) { /* No rotation */
967        fSETHALF(2,RddV,fGETHALF(2,RssV));
968        fSETHALF(3,RddV,fGETHALF(3,RssV));
969    } else if (tmp == 1) { /* Multiply by -J */
970        fSETHALF(2,RddV,fGETHALF(3,RssV));
971        fSETHALF(3,RddV,fSATH(-fGETHALF(2,RssV)));
972    } else if (tmp == 2) { /* Multiply by J */
973        fSETHALF(2,RddV,fSATH(-fGETHALF(3,RssV)));
974        fSETHALF(3,RddV,fGETHALF(2,RssV));
975    } else { /* Multiply by -1 */
976        fHIDE(if (tmp != 3) fatal("C is broken");)
977        fSETHALF(2,RddV,fSATH(-fGETHALF(2,RssV)));
978        fSETHALF(3,RddV,fSATH(-fGETHALF(3,RssV)));
979    }
980})
981
982
/*
 * Rxx += vrcrotate(Rss,Rt,#u2): rotate-and-reduce over the byte pairs of Rss.
 * control is read with fGETUBYTE(uiV,RtV) (uiV presumably carries the #u2
 * immediate of the syntax -- confirm).  The loop runs four times
 * (i = 0,2,4,6): bytes i and i+1 of Rss are taken as (tmpr, tmpi), the low
 * two bits of control select one of the four cases below, and control is
 * then shifted right by two for the next pair.  The totals sumr / sumi are
 * added to word 0 / word 1 of Rxx; no saturation is applied here.
 */
983Q6INSN(S4_vrcrotate_acc,"Rxx32+=vrcrotate(Rss32,Rt32,#u2)",ATTRIBS(),"Rotate and Reduce Bytes",
984{
985    fHIDE(int i; int tmpr; int tmpi; unsigned int control;)
986    fHIDE(int sumr; int sumi;)
987    sumr = 0;
988    sumi = 0;
989    control = fGETUBYTE(uiV,RtV);
990    for (i = 0; i < 8; i += 2) {
991        tmpr = fGETBYTE(i  ,RssV);
992        tmpi = fGETBYTE(i+1,RssV);
993        switch (control & 3) {
994        case 0: /* No Rotation */
995            sumr += tmpr;
996            sumi += tmpi;
997            break;
998        case 1: /* Multiply by -J */
999            sumr += tmpi;
1000            sumi -= tmpr;
1001            break;
1002        case 2: /* Multiply by J */
1003            sumr -= tmpi;
1004            sumi += tmpr;
1005            break;
1006        case 3: /* Multiply by -1 */
1007            sumr -= tmpr;
1008            sumi -= tmpi;
1009            break;
1010        fHIDE(default: fatal("C is broken!");)
1011        }
1012        control = control >> 2;
1013    }
1014    fSETWORD(0,RxxV,fGETWORD(0,RxxV) + sumr);
1015    fSETWORD(1,RxxV,fGETWORD(1,RxxV) + sumi);
1016})
1017
/*
 * Rdd = vrcrotate(Rss,Rt,#u2): non-accumulating form of the rotate-and-reduce
 * above.  The loop is identical (four byte pairs of Rss, two control bits
 * per pair taken from fGETUBYTE(uiV,RtV)); the totals sumr / sumi are
 * written directly to word 0 / word 1 of Rdd.
 */
1018Q6INSN(S4_vrcrotate,"Rdd32=vrcrotate(Rss32,Rt32,#u2)",ATTRIBS(),"Rotate and Reduce Bytes",
1019{
1020    fHIDE(int i; int tmpr; int tmpi; unsigned int control;)
1021    fHIDE(int sumr; int sumi;)
1022    sumr = 0;
1023    sumi = 0;
1024    control = fGETUBYTE(uiV,RtV);
1025    for (i = 0; i < 8; i += 2) {
1026        tmpr = fGETBYTE(i  ,RssV);
1027        tmpi = fGETBYTE(i+1,RssV);
1028        switch (control & 3) {
1029        case 0: /* No Rotation */
1030            sumr += tmpr;
1031            sumi += tmpi;
1032            break;
1033        case 1: /* Multiply by -J */
1034            sumr += tmpi;
1035            sumi -= tmpr;
1036            break;
1037        case 2: /* Multiply by J */
1038            sumr -= tmpi;
1039            sumi += tmpr;
1040            break;
1041        case 3: /* Multiply by -1 */
1042            sumr -= tmpr;
1043            sumi -= tmpi;
1044            break;
1045        fHIDE(default: fatal("C is broken!");)
1046        }
1047        control = control >> 2;
1048    }
1049    fSETWORD(0,RddV,sumr);
1050    fSETWORD(1,RddV,sumi);
1051})
1052
1053
/*
 * Rdd = vcnegh(Rss,Rt): for each halfword index i in 0..3, bit i of Rt
 * decides whether halfword i of Rss is copied unchanged (bit clear) or
 * negated and passed through fSATH (bit set) before being stored in Rdd.
 */
1054Q6INSN(S2_vcnegh,"Rdd32=vcnegh(Rss32,Rt32)",ATTRIBS(),"Conditional Negate halfwords",
1055{
1056    fHIDE(int i;)
1057    for (i = 0; i < 4; i++) {
1058        if (fGETBIT(i,RtV)) {
1059            fSETHALF(i,RddV,fSATH(-fGETHALF(i,RssV)));
1060        } else {
1061            fSETHALF(i,RddV,fGETHALF(i,RssV));
1062        }
1063    }
1064})
1065
/*
 * Rxx += vrcnegh(Rss,Rt): reducing form.  For each halfword index i in 0..3
 * the halfword of Rss is subtracted from RxxV when bit i of Rt is set and
 * added otherwise.  Unlike S2_vcnegh, no fSATH is applied.
 */
1066Q6INSN(S2_vrcnegh,"Rxx32+=vrcnegh(Rss32,Rt32)",ATTRIBS(),"Vector Reduce Conditional Negate halfwords",
1067{
1068    fHIDE(int i;)
1069    for (i = 0; i < 4; i++) {
1070        if (fGETBIT(i,RtV)) {
1071            RxxV += -fGETHALF(i,RssV);
1072        } else {
1073            RxxV += fGETHALF(i,RssV);
1074        }
1075    }
1076})
1077
1078
1079/**********************************************************************
1080 *  Finite-field multiplies.  Written by David Hoyle                  *
1081 *********************************************************************/
1082
/*
 * Rdd = pmpyw(Rs,Rt): shift-and-XOR (carry-less) multiply.  For every set
 * bit i (0..31) of y, x shifted left by i is XORed into prod.  x and prod
 * are declared unsigned long long, so the shifted terms are formed in a
 * 64-bit type before being assigned to RddV.
 */
1083Q6INSN(M4_pmpyw,"Rdd32=pmpyw(Rs32,Rt32)",ATTRIBS(),"Polynomial 32bit Multiplication with Addition in GF(2)",
1084{
1085        fHIDE(int i; unsigned int y;)
1086        fHIDE(unsigned long long x; unsigned long long prod;)
1087        x = fGETUWORD(0, RsV);
1088        y = fGETUWORD(0, RtV);
1089
1090        prod = 0;
1091        for(i=0; i < 32; i++) {
1092            if((y >> i) & 1) prod ^= (x << i);
1093        }
1094        RddV = prod;
1095})
1096
/*
 * Rdd = vpmpyh(Rs,Rt): two independent shift-and-XOR multiplies of the
 * unsigned halfwords, prod0 from halves 0 of Rs/Rt and prod1 from halves 1.
 * The results are interleaved into Rdd: halfword 0 = low half of prod0,
 * halfword 1 = low half of prod1, halfword 2 = high half of prod0,
 * halfword 3 = high half of prod1.
 */
1097Q6INSN(M4_vpmpyh,"Rdd32=vpmpyh(Rs32,Rt32)",ATTRIBS(),"Dual Polynomial 16bit Multiplication with Addition in GF(2)",
1098{
1099        fHIDE(int i; unsigned int x0; unsigned int x1;)
1100        fHIDE(unsigned int y0; unsigned int y1;)
1101        fHIDE(unsigned int prod0; unsigned int prod1;)
1102
1103        x0 = fGETUHALF(0, RsV);
1104        x1 = fGETUHALF(1, RsV);
1105        y0 = fGETUHALF(0, RtV);
1106        y1 = fGETUHALF(1, RtV);
1107
1108        prod0 = prod1 = 0;
1109        for(i=0; i < 16; i++) {
1110            if((y0 >> i) & 1) prod0 ^= (x0 << i);
1111            if((y1 >> i) & 1) prod1 ^= (x1 << i);
1112        }
1113        fSETHALF(0,RddV,fGETUHALF(0,prod0));
1114        fSETHALF(1,RddV,fGETUHALF(0,prod1));
1115        fSETHALF(2,RddV,fGETUHALF(1,prod0));
1116        fSETHALF(3,RddV,fGETUHALF(1,prod1));
1117})
1118
/*
 * Rxx ^= pmpyw(Rs,Rt): accumulating form of M4_pmpyw.  The same
 * shift-and-XOR product is computed and then XORed into RxxV.
 */
1119Q6INSN(M4_pmpyw_acc,"Rxx32^=pmpyw(Rs32,Rt32)",ATTRIBS(),"Polynomial 32bit Multiplication with Addition in GF(2)",
1120{
1121        fHIDE(int i; unsigned int y;)
1122        fHIDE(unsigned long long x; unsigned long long prod;)
1123        x = fGETUWORD(0, RsV);
1124        y = fGETUWORD(0, RtV);
1125
1126        prod = 0;
1127        for(i=0; i < 32; i++) {
1128            if((y >> i) & 1) prod ^= (x << i);
1129        }
1130        RxxV ^= prod;
1131})
1132
/*
 * Rxx ^= vpmpyh(Rs,Rt): accumulating form of M4_vpmpyh.  prod0 / prod1 are
 * computed the same way, then each halfword of Rxx is XORed with the
 * matching interleaved half (low prod0, low prod1, high prod0, high prod1
 * for halfwords 0..3).
 */
1133Q6INSN(M4_vpmpyh_acc,"Rxx32^=vpmpyh(Rs32,Rt32)",ATTRIBS(),"Dual Polynomial 16bit Multiplication with Addition in GF(2)",
1134{
1135        fHIDE(int i; unsigned int x0; unsigned int x1;)
1136        fHIDE(unsigned int y0; unsigned int y1;)
1137        fHIDE(unsigned int prod0; unsigned int prod1;)
1138
1139        x0 = fGETUHALF(0, RsV);
1140        x1 = fGETUHALF(1, RsV);
1141        y0 = fGETUHALF(0, RtV);
1142        y1 = fGETUHALF(1, RtV);
1143
1144        prod0 = prod1 = 0;
1145        for(i=0; i < 16; i++) {
1146            if((y0 >> i) & 1) prod0 ^= (x0 << i);
1147            if((y1 >> i) & 1) prod1 ^= (x1 << i);
1148        }
1149        fSETHALF(0,RxxV,fGETUHALF(0,RxxV) ^ fGETUHALF(0,prod0));
1150        fSETHALF(1,RxxV,fGETUHALF(1,RxxV) ^ fGETUHALF(0,prod1));
1151        fSETHALF(2,RxxV,fGETUHALF(2,RxxV) ^ fGETUHALF(1,prod0));
1152        fSETHALF(3,RxxV,fGETUHALF(3,RxxV) ^ fGETUHALF(1,prod1));
1153})
1154
1155
1156/* V70: TINY CORE */
1157
/*
 * CMPY64: emits M7_<TAG> (Rdd = ...) and M7_<TAG>_acc (Rxx += ...).
 *   NAME     - mnemonic used in the syntax string
 *   DESC     - suffix appended to the description ("Real" / "Imag")
 *   OPERAND1 - spelling of the second operand ("Rtt32" or "Rtt32*")
 *   OP       - operator joining the two fMPY32SS products (+ or -)
 *   W0..W3   - word indices: product 1 is word W0 of Rss times word W1 of
 *              Rtt, product 2 is word W2 of Rss times word W3 of Rtt
 * Both definitions carry the A_RESTRICT_SLOT3ONLY attribute.
 */
1158#define CMPY64(TAG,NAME,DESC,OPERAND1,OP,W0,W1,W2,W3) \
1159Q6INSN(M7_##TAG,"Rdd32=" NAME "(Rss32," OPERAND1 ")",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 64-bit " DESC,    { RddV  = (fMPY32SS(fGETWORD(W0, RssV), fGETWORD(W1, RttV)) OP fMPY32SS(fGETWORD(W2, RssV), fGETWORD(W3, RttV)));})\
1160Q6INSN(M7_##TAG##_acc,"Rxx32+=" NAME "(Rss32,"OPERAND1")",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply-Accumulate 64-bit " DESC, { RxxV += (fMPY32SS(fGETWORD(W0, RssV), fGETWORD(W1, RttV)) OP fMPY32SS(fGETWORD(W2, RssV), fGETWORD(W3, RttV)));})
1161
1162CMPY64(dcmpyrw, "cmpyrw","Real","Rtt32" ,-,0,0,1,1)
1163CMPY64(dcmpyrwc,"cmpyrw","Real","Rtt32*",+,0,0,1,1)
1164CMPY64(dcmpyiw, "cmpyiw","Imag","Rtt32" ,+,0,1,1,0)
1165CMPY64(dcmpyiwc,"cmpyiw","Imag","Rtt32*",-,1,0,0,1)
1166
/*
 * CMPY128: emits M7_<TAG>, "Rd = NAME(Rss,OPERAND1):<<1:sat".
 * The two fMPY32SS products (word WORD0 of Rss times word WORD1 of Rtt, and
 * word WORD2 of Rss times word WORD3 of Rtt) are widened with fCAST8S_16S,
 * combined as OP(first, second) (fADD128 or fSUB128), shifted right by 31
 * with fSHIFTR128, narrowed with fCAST16S_8S and passed through fSATW.
 * NOTE(review): the description string reads "result real" for every
 * instantiation, including the cmpyiw ones -- probably a copy-paste leftover.
 */
1167#define CMPY128(TAG, NAME, OPERAND1, WORD0, WORD1, WORD2, WORD3, OP) \
1168Q6INSN(M7_##TAG,"Rd32=" NAME "(Rss32,"OPERAND1"):<<1:sat",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 32-bit result real",  \
1169{ \
1170fHIDE(size16s_t acc128;)\
1171fHIDE(size16s_t tmp128;)\
1172fHIDE(size8s_t acc64;)\
1173tmp128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD0, RssV), fGETWORD(WORD1, RttV)));\
1174acc128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD2, RssV), fGETWORD(WORD3, RttV)));\
1175acc128 = OP(tmp128,acc128);\
1176acc128 = fSHIFTR128(acc128, 31);\
1177acc64 =  fCAST16S_8S(acc128);\
1178RdV = fSATW(acc64);\
1179})
1180
1181
1182CMPY128(wcmpyrw, "cmpyrw", "Rtt32", 0, 0, 1, 1, fSUB128)
1183CMPY128(wcmpyrwc, "cmpyrw", "Rtt32*", 0, 0, 1, 1, fADD128)
1184CMPY128(wcmpyiw, "cmpyiw", "Rtt32", 0, 1, 1, 0, fADD128)
1185CMPY128(wcmpyiwc, "cmpyiw", "Rtt32*", 1, 0, 0, 1, fSUB128)
1186
1187
/*
 * CMPY128RND: emits M7_<TAG>_rnd, "Rd = NAME(Rss,OPERAND1):<<1:rnd:sat".
 * Same data flow as CMPY128, with one extra step: after OP combines the two
 * widened products, the constant 0x40000000 (widened via fCAST8S_16S) is
 * added with fADD128 before the right shift by 31.
 * NOTE(review): as with CMPY128, the description string reads "result real"
 * for the cmpyiw instantiations as well.
 */
1188#define CMPY128RND(TAG, NAME, OPERAND1, WORD0, WORD1, WORD2, WORD3, OP) \
1189Q6INSN(M7_##TAG##_rnd,"Rd32=" NAME "(Rss32,"OPERAND1"):<<1:rnd:sat",ATTRIBS(A_RESTRICT_SLOT3ONLY),"Complex Multiply 32-bit result real",  \
1190{ \
1191fHIDE(size16s_t acc128;)\
1192fHIDE(size16s_t tmp128;)\
1193fHIDE(size16s_t const128;)\
1194fHIDE(size8s_t acc64;)\
1195tmp128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD0, RssV), fGETWORD(WORD1, RttV)));\
1196acc128 = fCAST8S_16S(fMPY32SS(fGETWORD(WORD2, RssV), fGETWORD(WORD3, RttV)));\
1197const128 = fCAST8S_16S(fCONSTLL(0x40000000));\
1198acc128 = OP(tmp128,acc128);\
1199acc128 = fADD128(acc128,const128);\
1200acc128 = fSHIFTR128(acc128, 31);\
1201acc64 =  fCAST16S_8S(acc128);\
1202RdV = fSATW(acc64);\
1203})
1204
1205CMPY128RND(wcmpyrw, "cmpyrw", "Rtt32", 0, 0, 1, 1, fSUB128)
1206CMPY128RND(wcmpyrwc, "cmpyrw", "Rtt32*", 0, 0, 1, 1, fADD128)
1207CMPY128RND(wcmpyiw, "cmpyiw", "Rtt32", 0, 1, 1, 0, fADD128)
1208CMPY128RND(wcmpyiwc, "cmpyiw", "Rtt32*", 1, 0, 0, 1, fSUB128)
1209