1*a2b0a27dSPhilippe Mathieu-Daudé /* 2*a2b0a27dSPhilippe Mathieu-Daudé * Loongson Multimedia Instruction emulation helpers for QEMU. 3*a2b0a27dSPhilippe Mathieu-Daudé * 4*a2b0a27dSPhilippe Mathieu-Daudé * Copyright (c) 2011 Richard Henderson <rth@twiddle.net> 5*a2b0a27dSPhilippe Mathieu-Daudé * 6*a2b0a27dSPhilippe Mathieu-Daudé * This library is free software; you can redistribute it and/or 7*a2b0a27dSPhilippe Mathieu-Daudé * modify it under the terms of the GNU Lesser General Public 8*a2b0a27dSPhilippe Mathieu-Daudé * License as published by the Free Software Foundation; either 9*a2b0a27dSPhilippe Mathieu-Daudé * version 2.1 of the License, or (at your option) any later version. 10*a2b0a27dSPhilippe Mathieu-Daudé * 11*a2b0a27dSPhilippe Mathieu-Daudé * This library is distributed in the hope that it will be useful, 12*a2b0a27dSPhilippe Mathieu-Daudé * but WITHOUT ANY WARRANTY; without even the implied warranty of 13*a2b0a27dSPhilippe Mathieu-Daudé * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*a2b0a27dSPhilippe Mathieu-Daudé * Lesser General Public License for more details. 15*a2b0a27dSPhilippe Mathieu-Daudé * 16*a2b0a27dSPhilippe Mathieu-Daudé * You should have received a copy of the GNU Lesser General Public 17*a2b0a27dSPhilippe Mathieu-Daudé * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18*a2b0a27dSPhilippe Mathieu-Daudé */ 19*a2b0a27dSPhilippe Mathieu-Daudé 20*a2b0a27dSPhilippe Mathieu-Daudé #include "qemu/osdep.h" 21*a2b0a27dSPhilippe Mathieu-Daudé #include "cpu.h" 22*a2b0a27dSPhilippe Mathieu-Daudé #include "exec/helper-proto.h" 23*a2b0a27dSPhilippe Mathieu-Daudé 24*a2b0a27dSPhilippe Mathieu-Daudé /* 25*a2b0a27dSPhilippe Mathieu-Daudé * If the byte ordering doesn't matter, i.e. all columns are treated 26*a2b0a27dSPhilippe Mathieu-Daudé * identically, then this union can be used directly. If byte ordering 27*a2b0a27dSPhilippe Mathieu-Daudé * does matter, we generally ignore dumping to memory. 28*a2b0a27dSPhilippe Mathieu-Daudé */ 29*a2b0a27dSPhilippe Mathieu-Daudé typedef union { 30*a2b0a27dSPhilippe Mathieu-Daudé uint8_t ub[8]; 31*a2b0a27dSPhilippe Mathieu-Daudé int8_t sb[8]; 32*a2b0a27dSPhilippe Mathieu-Daudé uint16_t uh[4]; 33*a2b0a27dSPhilippe Mathieu-Daudé int16_t sh[4]; 34*a2b0a27dSPhilippe Mathieu-Daudé uint32_t uw[2]; 35*a2b0a27dSPhilippe Mathieu-Daudé int32_t sw[2]; 36*a2b0a27dSPhilippe Mathieu-Daudé uint64_t d; 37*a2b0a27dSPhilippe Mathieu-Daudé } LMIValue; 38*a2b0a27dSPhilippe Mathieu-Daudé 39*a2b0a27dSPhilippe Mathieu-Daudé /* Some byte ordering issues can be mitigated by XORing in the following. */ 40*a2b0a27dSPhilippe Mathieu-Daudé #ifdef HOST_WORDS_BIGENDIAN 41*a2b0a27dSPhilippe Mathieu-Daudé # define BYTE_ORDER_XOR(N) N 42*a2b0a27dSPhilippe Mathieu-Daudé #else 43*a2b0a27dSPhilippe Mathieu-Daudé # define BYTE_ORDER_XOR(N) 0 44*a2b0a27dSPhilippe Mathieu-Daudé #endif 45*a2b0a27dSPhilippe Mathieu-Daudé 46*a2b0a27dSPhilippe Mathieu-Daudé #define SATSB(x) (x < -0x80 ? -0x80 : x > 0x7f ? 0x7f : x) 47*a2b0a27dSPhilippe Mathieu-Daudé #define SATUB(x) (x > 0xff ? 0xff : x) 48*a2b0a27dSPhilippe Mathieu-Daudé 49*a2b0a27dSPhilippe Mathieu-Daudé #define SATSH(x) (x < -0x8000 ? -0x8000 : x > 0x7fff ? 0x7fff : x) 50*a2b0a27dSPhilippe Mathieu-Daudé #define SATUH(x) (x > 0xffff ? 0xffff : x) 51*a2b0a27dSPhilippe Mathieu-Daudé 52*a2b0a27dSPhilippe Mathieu-Daudé #define SATSW(x) \ 53*a2b0a27dSPhilippe Mathieu-Daudé (x < -0x80000000ll ? -0x80000000ll : x > 0x7fffffff ? 0x7fffffff : x) 54*a2b0a27dSPhilippe Mathieu-Daudé #define SATUW(x) (x > 0xffffffffull ? 0xffffffffull : x) 55*a2b0a27dSPhilippe Mathieu-Daudé 56*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddsb(uint64_t fs, uint64_t ft) 57*a2b0a27dSPhilippe Mathieu-Daudé { 58*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 59*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 60*a2b0a27dSPhilippe Mathieu-Daudé 61*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 62*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 63*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 64*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.sb[i] + vt.sb[i]; 65*a2b0a27dSPhilippe Mathieu-Daudé vs.sb[i] = SATSB(r); 66*a2b0a27dSPhilippe Mathieu-Daudé } 67*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 68*a2b0a27dSPhilippe Mathieu-Daudé } 69*a2b0a27dSPhilippe Mathieu-Daudé 70*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddusb(uint64_t fs, uint64_t ft) 71*a2b0a27dSPhilippe Mathieu-Daudé { 72*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 73*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 74*a2b0a27dSPhilippe Mathieu-Daudé 75*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 76*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 77*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 78*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.ub[i] + vt.ub[i]; 79*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = SATUB(r); 80*a2b0a27dSPhilippe Mathieu-Daudé } 81*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 82*a2b0a27dSPhilippe Mathieu-Daudé } 83*a2b0a27dSPhilippe Mathieu-Daudé 84*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddsh(uint64_t fs, uint64_t ft) 85*a2b0a27dSPhilippe Mathieu-Daudé { 86*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 87*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 88*a2b0a27dSPhilippe Mathieu-Daudé 89*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 90*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 91*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 92*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.sh[i] + vt.sh[i]; 93*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] = SATSH(r); 94*a2b0a27dSPhilippe Mathieu-Daudé } 95*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 96*a2b0a27dSPhilippe Mathieu-Daudé } 97*a2b0a27dSPhilippe Mathieu-Daudé 98*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddush(uint64_t fs, uint64_t ft) 99*a2b0a27dSPhilippe Mathieu-Daudé { 100*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 101*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 102*a2b0a27dSPhilippe Mathieu-Daudé 103*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 104*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 105*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 106*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.uh[i] + vt.uh[i]; 107*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = SATUH(r); 108*a2b0a27dSPhilippe Mathieu-Daudé } 109*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 110*a2b0a27dSPhilippe Mathieu-Daudé } 111*a2b0a27dSPhilippe Mathieu-Daudé 112*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddb(uint64_t fs, uint64_t ft) 113*a2b0a27dSPhilippe Mathieu-Daudé { 114*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 115*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 116*a2b0a27dSPhilippe Mathieu-Daudé 117*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 118*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 119*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 120*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] += vt.ub[i]; 121*a2b0a27dSPhilippe Mathieu-Daudé } 122*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 123*a2b0a27dSPhilippe Mathieu-Daudé } 124*a2b0a27dSPhilippe Mathieu-Daudé 125*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddh(uint64_t fs, uint64_t ft) 126*a2b0a27dSPhilippe Mathieu-Daudé { 127*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 128*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 129*a2b0a27dSPhilippe Mathieu-Daudé 130*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 131*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 132*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 133*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] += vt.uh[i]; 134*a2b0a27dSPhilippe Mathieu-Daudé } 135*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 136*a2b0a27dSPhilippe Mathieu-Daudé } 137*a2b0a27dSPhilippe Mathieu-Daudé 138*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_paddw(uint64_t fs, uint64_t ft) 139*a2b0a27dSPhilippe Mathieu-Daudé { 140*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 141*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 142*a2b0a27dSPhilippe Mathieu-Daudé 143*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 144*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 145*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; ++i) { 146*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] += vt.uw[i]; 147*a2b0a27dSPhilippe Mathieu-Daudé } 148*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 149*a2b0a27dSPhilippe Mathieu-Daudé } 150*a2b0a27dSPhilippe Mathieu-Daudé 151*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubsb(uint64_t fs, uint64_t ft) 152*a2b0a27dSPhilippe Mathieu-Daudé { 153*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 154*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 155*a2b0a27dSPhilippe Mathieu-Daudé 156*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 157*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 158*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 159*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.sb[i] - vt.sb[i]; 160*a2b0a27dSPhilippe Mathieu-Daudé vs.sb[i] = SATSB(r); 161*a2b0a27dSPhilippe Mathieu-Daudé } 162*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 163*a2b0a27dSPhilippe Mathieu-Daudé } 164*a2b0a27dSPhilippe Mathieu-Daudé 165*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubusb(uint64_t fs, uint64_t ft) 166*a2b0a27dSPhilippe Mathieu-Daudé { 167*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 168*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 169*a2b0a27dSPhilippe Mathieu-Daudé 170*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 171*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 172*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 173*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.ub[i] - vt.ub[i]; 174*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = SATUB(r); 175*a2b0a27dSPhilippe Mathieu-Daudé } 176*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 177*a2b0a27dSPhilippe Mathieu-Daudé } 178*a2b0a27dSPhilippe Mathieu-Daudé 179*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubsh(uint64_t fs, uint64_t ft) 180*a2b0a27dSPhilippe Mathieu-Daudé { 181*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 182*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 183*a2b0a27dSPhilippe Mathieu-Daudé 184*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 185*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 186*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 187*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.sh[i] - vt.sh[i]; 188*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] = SATSH(r); 189*a2b0a27dSPhilippe Mathieu-Daudé } 190*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 191*a2b0a27dSPhilippe Mathieu-Daudé } 192*a2b0a27dSPhilippe Mathieu-Daudé 193*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubush(uint64_t fs, uint64_t ft) 194*a2b0a27dSPhilippe Mathieu-Daudé { 195*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 196*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 197*a2b0a27dSPhilippe Mathieu-Daudé 198*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 199*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 200*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 201*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.uh[i] - vt.uh[i]; 202*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = SATUH(r); 203*a2b0a27dSPhilippe Mathieu-Daudé } 204*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 205*a2b0a27dSPhilippe Mathieu-Daudé } 206*a2b0a27dSPhilippe Mathieu-Daudé 207*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubb(uint64_t fs, uint64_t ft) 208*a2b0a27dSPhilippe Mathieu-Daudé { 209*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 210*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 211*a2b0a27dSPhilippe Mathieu-Daudé 212*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 213*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 214*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 215*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] -= vt.ub[i]; 216*a2b0a27dSPhilippe Mathieu-Daudé } 217*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 218*a2b0a27dSPhilippe Mathieu-Daudé } 219*a2b0a27dSPhilippe Mathieu-Daudé 220*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubh(uint64_t fs, uint64_t ft) 221*a2b0a27dSPhilippe Mathieu-Daudé { 222*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 223*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 224*a2b0a27dSPhilippe Mathieu-Daudé 225*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 226*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 227*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 228*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] -= vt.uh[i]; 229*a2b0a27dSPhilippe Mathieu-Daudé } 230*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 231*a2b0a27dSPhilippe Mathieu-Daudé } 232*a2b0a27dSPhilippe Mathieu-Daudé 233*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psubw(uint64_t fs, uint64_t ft) 234*a2b0a27dSPhilippe Mathieu-Daudé { 235*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 236*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 237*a2b0a27dSPhilippe Mathieu-Daudé 238*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 239*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 240*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; ++i) { 241*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] -= vt.uw[i]; 242*a2b0a27dSPhilippe Mathieu-Daudé } 243*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 244*a2b0a27dSPhilippe Mathieu-Daudé } 245*a2b0a27dSPhilippe Mathieu-Daudé 246*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pshufh(uint64_t fs, uint64_t ft) 247*a2b0a27dSPhilippe Mathieu-Daudé { 248*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(3); 249*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vd, vs; 250*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 251*a2b0a27dSPhilippe Mathieu-Daudé 252*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 253*a2b0a27dSPhilippe Mathieu-Daudé vd.d = 0; 254*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++, ft >>= 2) { 255*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host]; 256*a2b0a27dSPhilippe Mathieu-Daudé } 257*a2b0a27dSPhilippe Mathieu-Daudé return vd.d; 258*a2b0a27dSPhilippe Mathieu-Daudé } 259*a2b0a27dSPhilippe Mathieu-Daudé 260*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_packsswh(uint64_t fs, uint64_t ft) 261*a2b0a27dSPhilippe Mathieu-Daudé { 262*a2b0a27dSPhilippe Mathieu-Daudé uint64_t fd = 0; 263*a2b0a27dSPhilippe Mathieu-Daudé int64_t tmp; 264*a2b0a27dSPhilippe Mathieu-Daudé 265*a2b0a27dSPhilippe Mathieu-Daudé tmp = (int32_t)(fs >> 0); 266*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSH(tmp); 267*a2b0a27dSPhilippe Mathieu-Daudé fd |= (tmp & 0xffff) << 0; 268*a2b0a27dSPhilippe Mathieu-Daudé 269*a2b0a27dSPhilippe Mathieu-Daudé tmp = (int32_t)(fs >> 32); 270*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSH(tmp); 271*a2b0a27dSPhilippe Mathieu-Daudé fd |= (tmp & 0xffff) << 16; 272*a2b0a27dSPhilippe Mathieu-Daudé 273*a2b0a27dSPhilippe Mathieu-Daudé tmp = (int32_t)(ft >> 0); 274*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSH(tmp); 275*a2b0a27dSPhilippe Mathieu-Daudé fd |= (tmp & 0xffff) << 32; 276*a2b0a27dSPhilippe Mathieu-Daudé 277*a2b0a27dSPhilippe Mathieu-Daudé tmp = (int32_t)(ft >> 32); 278*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSH(tmp); 279*a2b0a27dSPhilippe Mathieu-Daudé fd |= (tmp & 0xffff) << 48; 280*a2b0a27dSPhilippe Mathieu-Daudé 281*a2b0a27dSPhilippe Mathieu-Daudé return fd; 282*a2b0a27dSPhilippe Mathieu-Daudé } 283*a2b0a27dSPhilippe Mathieu-Daudé 284*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_packsshb(uint64_t fs, uint64_t ft) 285*a2b0a27dSPhilippe Mathieu-Daudé { 286*a2b0a27dSPhilippe Mathieu-Daudé uint64_t fd = 0; 287*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 288*a2b0a27dSPhilippe Mathieu-Daudé 289*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 290*a2b0a27dSPhilippe Mathieu-Daudé int16_t tmp = fs >> (i * 16); 291*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSB(tmp); 292*a2b0a27dSPhilippe Mathieu-Daudé fd |= (uint64_t)(tmp & 0xff) << (i * 8); 293*a2b0a27dSPhilippe Mathieu-Daudé } 294*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 295*a2b0a27dSPhilippe Mathieu-Daudé int16_t tmp = ft >> (i * 16); 296*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATSB(tmp); 297*a2b0a27dSPhilippe Mathieu-Daudé fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32); 298*a2b0a27dSPhilippe Mathieu-Daudé } 299*a2b0a27dSPhilippe Mathieu-Daudé 300*a2b0a27dSPhilippe Mathieu-Daudé return fd; 301*a2b0a27dSPhilippe Mathieu-Daudé } 302*a2b0a27dSPhilippe Mathieu-Daudé 303*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_packushb(uint64_t fs, uint64_t ft) 304*a2b0a27dSPhilippe Mathieu-Daudé { 305*a2b0a27dSPhilippe Mathieu-Daudé uint64_t fd = 0; 306*a2b0a27dSPhilippe Mathieu-Daudé unsigned int i; 307*a2b0a27dSPhilippe Mathieu-Daudé 308*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 309*a2b0a27dSPhilippe Mathieu-Daudé int16_t tmp = fs >> (i * 16); 310*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATUB(tmp); 311*a2b0a27dSPhilippe Mathieu-Daudé fd |= (uint64_t)(tmp & 0xff) << (i * 8); 312*a2b0a27dSPhilippe Mathieu-Daudé } 313*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 314*a2b0a27dSPhilippe Mathieu-Daudé int16_t tmp = ft >> (i * 16); 315*a2b0a27dSPhilippe Mathieu-Daudé tmp = SATUB(tmp); 316*a2b0a27dSPhilippe Mathieu-Daudé fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32); 317*a2b0a27dSPhilippe Mathieu-Daudé } 318*a2b0a27dSPhilippe Mathieu-Daudé 319*a2b0a27dSPhilippe Mathieu-Daudé return fd; 320*a2b0a27dSPhilippe Mathieu-Daudé } 321*a2b0a27dSPhilippe Mathieu-Daudé 322*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft) 323*a2b0a27dSPhilippe Mathieu-Daudé { 324*a2b0a27dSPhilippe Mathieu-Daudé return (fs & 0xffffffff) | (ft << 32); 325*a2b0a27dSPhilippe Mathieu-Daudé } 326*a2b0a27dSPhilippe Mathieu-Daudé 327*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft) 328*a2b0a27dSPhilippe Mathieu-Daudé { 329*a2b0a27dSPhilippe Mathieu-Daudé return (fs >> 32) | (ft & ~0xffffffffull); 330*a2b0a27dSPhilippe Mathieu-Daudé } 331*a2b0a27dSPhilippe Mathieu-Daudé 332*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft) 333*a2b0a27dSPhilippe Mathieu-Daudé { 334*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(3); 335*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vd, vs, vt; 336*a2b0a27dSPhilippe Mathieu-Daudé 337*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 338*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 339*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[0 ^ host] = vs.uh[0 ^ host]; 340*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[1 ^ host] = vt.uh[0 ^ host]; 341*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[2 ^ host] = vs.uh[1 ^ host]; 342*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[3 ^ host] = vt.uh[1 ^ host]; 343*a2b0a27dSPhilippe Mathieu-Daudé 344*a2b0a27dSPhilippe Mathieu-Daudé return vd.d; 345*a2b0a27dSPhilippe Mathieu-Daudé } 346*a2b0a27dSPhilippe Mathieu-Daudé 347*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft) 348*a2b0a27dSPhilippe Mathieu-Daudé { 349*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(3); 350*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vd, vs, vt; 351*a2b0a27dSPhilippe Mathieu-Daudé 352*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 353*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 354*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[0 ^ host] = vs.uh[2 ^ host]; 355*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[1 ^ host] = vt.uh[2 ^ host]; 356*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[2 ^ host] = vs.uh[3 ^ host]; 357*a2b0a27dSPhilippe Mathieu-Daudé vd.uh[3 ^ host] = vt.uh[3 ^ host]; 358*a2b0a27dSPhilippe Mathieu-Daudé 359*a2b0a27dSPhilippe Mathieu-Daudé return vd.d; 360*a2b0a27dSPhilippe Mathieu-Daudé } 361*a2b0a27dSPhilippe Mathieu-Daudé 362*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft) 363*a2b0a27dSPhilippe Mathieu-Daudé { 364*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(7); 365*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vd, vs, vt; 366*a2b0a27dSPhilippe Mathieu-Daudé 367*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 368*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 369*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[0 ^ host] = vs.ub[0 ^ host]; 370*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[1 ^ host] = vt.ub[0 ^ host]; 371*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[2 ^ host] = vs.ub[1 ^ host]; 372*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[3 ^ host] = vt.ub[1 ^ host]; 373*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[4 ^ host] = vs.ub[2 ^ host]; 374*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[5 ^ host] = vt.ub[2 ^ host]; 375*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[6 ^ host] = vs.ub[3 ^ host]; 376*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[7 ^ host] = vt.ub[3 ^ host]; 377*a2b0a27dSPhilippe Mathieu-Daudé 378*a2b0a27dSPhilippe Mathieu-Daudé return vd.d; 379*a2b0a27dSPhilippe Mathieu-Daudé } 380*a2b0a27dSPhilippe Mathieu-Daudé 381*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft) 382*a2b0a27dSPhilippe Mathieu-Daudé { 383*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(7); 384*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vd, vs, vt; 385*a2b0a27dSPhilippe Mathieu-Daudé 386*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 387*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 388*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[0 ^ host] = vs.ub[4 ^ host]; 389*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[1 ^ host] = vt.ub[4 ^ host]; 390*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[2 ^ host] = vs.ub[5 ^ host]; 391*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[3 ^ host] = vt.ub[5 ^ host]; 392*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[4 ^ host] = vs.ub[6 ^ host]; 393*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[5 ^ host] = vt.ub[6 ^ host]; 394*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[6 ^ host] = vs.ub[7 ^ host]; 395*a2b0a27dSPhilippe Mathieu-Daudé vd.ub[7 ^ host] = vt.ub[7 ^ host]; 396*a2b0a27dSPhilippe Mathieu-Daudé 397*a2b0a27dSPhilippe Mathieu-Daudé return vd.d; 398*a2b0a27dSPhilippe Mathieu-Daudé } 399*a2b0a27dSPhilippe Mathieu-Daudé 400*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pavgh(uint64_t fs, uint64_t ft) 401*a2b0a27dSPhilippe Mathieu-Daudé { 402*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 403*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 404*a2b0a27dSPhilippe Mathieu-Daudé 405*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 406*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 407*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 408*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1; 409*a2b0a27dSPhilippe Mathieu-Daudé } 410*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 411*a2b0a27dSPhilippe Mathieu-Daudé } 412*a2b0a27dSPhilippe Mathieu-Daudé 413*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pavgb(uint64_t fs, uint64_t ft) 414*a2b0a27dSPhilippe Mathieu-Daudé { 415*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 416*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 417*a2b0a27dSPhilippe Mathieu-Daudé 418*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 419*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 420*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; i++) { 421*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1; 422*a2b0a27dSPhilippe Mathieu-Daudé } 423*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 424*a2b0a27dSPhilippe Mathieu-Daudé } 425*a2b0a27dSPhilippe Mathieu-Daudé 426*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft) 427*a2b0a27dSPhilippe Mathieu-Daudé { 428*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 429*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 430*a2b0a27dSPhilippe Mathieu-Daudé 431*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 432*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 433*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 434*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]); 435*a2b0a27dSPhilippe Mathieu-Daudé } 436*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 437*a2b0a27dSPhilippe Mathieu-Daudé } 438*a2b0a27dSPhilippe Mathieu-Daudé 439*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pminsh(uint64_t fs, uint64_t ft) 440*a2b0a27dSPhilippe Mathieu-Daudé { 441*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 442*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 443*a2b0a27dSPhilippe Mathieu-Daudé 444*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 445*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 446*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 447*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]); 448*a2b0a27dSPhilippe Mathieu-Daudé } 449*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 450*a2b0a27dSPhilippe Mathieu-Daudé } 451*a2b0a27dSPhilippe Mathieu-Daudé 452*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmaxub(uint64_t fs, uint64_t ft) 453*a2b0a27dSPhilippe Mathieu-Daudé { 454*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 455*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 456*a2b0a27dSPhilippe Mathieu-Daudé 457*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 458*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 459*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 460*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]); 461*a2b0a27dSPhilippe Mathieu-Daudé } 462*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 463*a2b0a27dSPhilippe Mathieu-Daudé } 464*a2b0a27dSPhilippe Mathieu-Daudé 465*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pminub(uint64_t fs, uint64_t ft) 466*a2b0a27dSPhilippe Mathieu-Daudé { 467*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 468*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 469*a2b0a27dSPhilippe Mathieu-Daudé 470*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 471*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 472*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 473*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]); 474*a2b0a27dSPhilippe Mathieu-Daudé } 475*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 476*a2b0a27dSPhilippe Mathieu-Daudé } 477*a2b0a27dSPhilippe Mathieu-Daudé 478*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft) 479*a2b0a27dSPhilippe Mathieu-Daudé { 480*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 481*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 482*a2b0a27dSPhilippe Mathieu-Daudé 483*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 484*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 485*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; i++) { 486*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] = -(vs.uw[i] == vt.uw[i]); 487*a2b0a27dSPhilippe Mathieu-Daudé } 488*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 489*a2b0a27dSPhilippe Mathieu-Daudé } 490*a2b0a27dSPhilippe Mathieu-Daudé 491*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft) 492*a2b0a27dSPhilippe Mathieu-Daudé { 493*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 494*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 495*a2b0a27dSPhilippe Mathieu-Daudé 496*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 497*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 498*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; i++) { 499*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] = -(vs.uw[i] > vt.uw[i]); 500*a2b0a27dSPhilippe Mathieu-Daudé } 501*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 502*a2b0a27dSPhilippe Mathieu-Daudé } 503*a2b0a27dSPhilippe Mathieu-Daudé 504*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft) 505*a2b0a27dSPhilippe Mathieu-Daudé { 506*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 507*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 508*a2b0a27dSPhilippe Mathieu-Daudé 509*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 510*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 511*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 512*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = -(vs.uh[i] == vt.uh[i]); 513*a2b0a27dSPhilippe Mathieu-Daudé } 514*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 515*a2b0a27dSPhilippe Mathieu-Daudé } 516*a2b0a27dSPhilippe Mathieu-Daudé 517*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft) 518*a2b0a27dSPhilippe Mathieu-Daudé { 519*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 520*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 521*a2b0a27dSPhilippe Mathieu-Daudé 522*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 523*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 524*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; i++) { 525*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = -(vs.uh[i] > vt.uh[i]); 526*a2b0a27dSPhilippe Mathieu-Daudé } 527*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 528*a2b0a27dSPhilippe Mathieu-Daudé } 529*a2b0a27dSPhilippe Mathieu-Daudé 530*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft) 531*a2b0a27dSPhilippe Mathieu-Daudé { 532*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 533*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 534*a2b0a27dSPhilippe Mathieu-Daudé 535*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 536*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 537*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; i++) { 538*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = -(vs.ub[i] == vt.ub[i]); 539*a2b0a27dSPhilippe Mathieu-Daudé } 540*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 541*a2b0a27dSPhilippe Mathieu-Daudé } 542*a2b0a27dSPhilippe Mathieu-Daudé 543*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft) 544*a2b0a27dSPhilippe Mathieu-Daudé { 545*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 546*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 547*a2b0a27dSPhilippe Mathieu-Daudé 548*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 549*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 550*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; i++) { 551*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = -(vs.ub[i] > vt.ub[i]); 552*a2b0a27dSPhilippe Mathieu-Daudé } 553*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 554*a2b0a27dSPhilippe Mathieu-Daudé } 555*a2b0a27dSPhilippe Mathieu-Daudé 556*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psllw(uint64_t fs, uint64_t ft) 557*a2b0a27dSPhilippe Mathieu-Daudé { 558*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 559*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 560*a2b0a27dSPhilippe Mathieu-Daudé 561*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 562*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 31) { 563*a2b0a27dSPhilippe Mathieu-Daudé return 0; 564*a2b0a27dSPhilippe Mathieu-Daudé } 565*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 566*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; ++i) { 567*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] <<= ft; 568*a2b0a27dSPhilippe Mathieu-Daudé } 569*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 570*a2b0a27dSPhilippe Mathieu-Daudé } 571*a2b0a27dSPhilippe Mathieu-Daudé 572*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psrlw(uint64_t fs, uint64_t ft) 573*a2b0a27dSPhilippe Mathieu-Daudé { 574*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 575*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 576*a2b0a27dSPhilippe Mathieu-Daudé 577*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 578*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 31) { 579*a2b0a27dSPhilippe Mathieu-Daudé return 0; 580*a2b0a27dSPhilippe Mathieu-Daudé } 581*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 582*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; ++i) { 583*a2b0a27dSPhilippe Mathieu-Daudé vs.uw[i] >>= ft; 584*a2b0a27dSPhilippe Mathieu-Daudé } 585*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 586*a2b0a27dSPhilippe Mathieu-Daudé } 587*a2b0a27dSPhilippe Mathieu-Daudé 588*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psraw(uint64_t fs, uint64_t ft) 589*a2b0a27dSPhilippe Mathieu-Daudé { 590*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 591*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 592*a2b0a27dSPhilippe Mathieu-Daudé 593*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 594*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 31) { 595*a2b0a27dSPhilippe Mathieu-Daudé ft = 31; 596*a2b0a27dSPhilippe Mathieu-Daudé } 597*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 598*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 2; ++i) { 599*a2b0a27dSPhilippe Mathieu-Daudé vs.sw[i] >>= ft; 600*a2b0a27dSPhilippe Mathieu-Daudé } 601*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 602*a2b0a27dSPhilippe Mathieu-Daudé } 603*a2b0a27dSPhilippe Mathieu-Daudé 604*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psllh(uint64_t fs, uint64_t ft) 605*a2b0a27dSPhilippe Mathieu-Daudé { 606*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 607*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 608*a2b0a27dSPhilippe Mathieu-Daudé 609*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 610*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 15) { 611*a2b0a27dSPhilippe Mathieu-Daudé return 0; 612*a2b0a27dSPhilippe Mathieu-Daudé } 613*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 614*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 615*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] <<= ft; 616*a2b0a27dSPhilippe Mathieu-Daudé } 617*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 618*a2b0a27dSPhilippe Mathieu-Daudé } 619*a2b0a27dSPhilippe Mathieu-Daudé 620*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psrlh(uint64_t fs, uint64_t ft) 621*a2b0a27dSPhilippe Mathieu-Daudé { 622*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 623*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 624*a2b0a27dSPhilippe Mathieu-Daudé 625*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 626*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 15) { 627*a2b0a27dSPhilippe Mathieu-Daudé return 0; 628*a2b0a27dSPhilippe Mathieu-Daudé } 629*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 630*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 631*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] >>= ft; 632*a2b0a27dSPhilippe Mathieu-Daudé } 633*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 634*a2b0a27dSPhilippe Mathieu-Daudé } 635*a2b0a27dSPhilippe Mathieu-Daudé 636*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_psrah(uint64_t fs, uint64_t ft) 637*a2b0a27dSPhilippe Mathieu-Daudé { 638*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs; 639*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 640*a2b0a27dSPhilippe Mathieu-Daudé 641*a2b0a27dSPhilippe Mathieu-Daudé ft &= 0x7f; 642*a2b0a27dSPhilippe Mathieu-Daudé if (ft > 15) { 643*a2b0a27dSPhilippe Mathieu-Daudé ft = 15; 644*a2b0a27dSPhilippe Mathieu-Daudé } 645*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 646*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 647*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] >>= ft; 648*a2b0a27dSPhilippe Mathieu-Daudé } 649*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 650*a2b0a27dSPhilippe Mathieu-Daudé } 651*a2b0a27dSPhilippe Mathieu-Daudé 652*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmullh(uint64_t fs, uint64_t ft) 653*a2b0a27dSPhilippe Mathieu-Daudé { 654*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 655*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 656*a2b0a27dSPhilippe Mathieu-Daudé 657*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 658*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 659*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 660*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] *= vt.sh[i]; 661*a2b0a27dSPhilippe Mathieu-Daudé } 662*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 663*a2b0a27dSPhilippe Mathieu-Daudé } 664*a2b0a27dSPhilippe Mathieu-Daudé 665*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmulhh(uint64_t fs, uint64_t ft) 666*a2b0a27dSPhilippe Mathieu-Daudé { 667*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 668*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 669*a2b0a27dSPhilippe Mathieu-Daudé 670*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 671*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 672*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 673*a2b0a27dSPhilippe Mathieu-Daudé int32_t r = vs.sh[i] * vt.sh[i]; 674*a2b0a27dSPhilippe Mathieu-Daudé vs.sh[i] = r >> 16; 675*a2b0a27dSPhilippe Mathieu-Daudé } 676*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 677*a2b0a27dSPhilippe Mathieu-Daudé } 678*a2b0a27dSPhilippe Mathieu-Daudé 679*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft) 680*a2b0a27dSPhilippe Mathieu-Daudé { 681*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 682*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 683*a2b0a27dSPhilippe Mathieu-Daudé 684*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 685*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 686*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 4; ++i) { 687*a2b0a27dSPhilippe Mathieu-Daudé uint32_t r = vs.uh[i] * vt.uh[i]; 688*a2b0a27dSPhilippe Mathieu-Daudé vs.uh[i] = r >> 16; 689*a2b0a27dSPhilippe Mathieu-Daudé } 690*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 691*a2b0a27dSPhilippe Mathieu-Daudé } 692*a2b0a27dSPhilippe Mathieu-Daudé 693*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft) 694*a2b0a27dSPhilippe Mathieu-Daudé { 695*a2b0a27dSPhilippe Mathieu-Daudé unsigned host = BYTE_ORDER_XOR(3); 696*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 697*a2b0a27dSPhilippe Mathieu-Daudé uint32_t p0, p1; 698*a2b0a27dSPhilippe Mathieu-Daudé 699*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 700*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 701*a2b0a27dSPhilippe Mathieu-Daudé p0 = vs.sh[0 ^ host] * vt.sh[0 ^ host]; 702*a2b0a27dSPhilippe Mathieu-Daudé p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host]; 703*a2b0a27dSPhilippe Mathieu-Daudé p1 = vs.sh[2 ^ host] * vt.sh[2 ^ host]; 704*a2b0a27dSPhilippe Mathieu-Daudé p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host]; 705*a2b0a27dSPhilippe Mathieu-Daudé 706*a2b0a27dSPhilippe Mathieu-Daudé return ((uint64_t)p1 << 32) | p0; 707*a2b0a27dSPhilippe Mathieu-Daudé } 708*a2b0a27dSPhilippe Mathieu-Daudé 709*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pasubub(uint64_t fs, uint64_t ft) 710*a2b0a27dSPhilippe Mathieu-Daudé { 711*a2b0a27dSPhilippe Mathieu-Daudé LMIValue vs, vt; 712*a2b0a27dSPhilippe Mathieu-Daudé unsigned i; 713*a2b0a27dSPhilippe Mathieu-Daudé 714*a2b0a27dSPhilippe Mathieu-Daudé vs.d = fs; 715*a2b0a27dSPhilippe Mathieu-Daudé vt.d = ft; 716*a2b0a27dSPhilippe Mathieu-Daudé for (i = 0; i < 8; ++i) { 717*a2b0a27dSPhilippe Mathieu-Daudé int r = vs.ub[i] - vt.ub[i]; 718*a2b0a27dSPhilippe Mathieu-Daudé vs.ub[i] = (r < 0 ? -r : r); 719*a2b0a27dSPhilippe Mathieu-Daudé } 720*a2b0a27dSPhilippe Mathieu-Daudé return vs.d; 721*a2b0a27dSPhilippe Mathieu-Daudé } 722*a2b0a27dSPhilippe Mathieu-Daudé 723*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_biadd(uint64_t fs) 724*a2b0a27dSPhilippe Mathieu-Daudé { 725*a2b0a27dSPhilippe Mathieu-Daudé unsigned i, fd; 726*a2b0a27dSPhilippe Mathieu-Daudé 727*a2b0a27dSPhilippe Mathieu-Daudé for (i = fd = 0; i < 8; ++i) { 728*a2b0a27dSPhilippe Mathieu-Daudé fd += (fs >> (i * 8)) & 0xff; 729*a2b0a27dSPhilippe Mathieu-Daudé } 730*a2b0a27dSPhilippe Mathieu-Daudé return fd & 0xffff; 731*a2b0a27dSPhilippe Mathieu-Daudé } 732*a2b0a27dSPhilippe Mathieu-Daudé 733*a2b0a27dSPhilippe Mathieu-Daudé uint64_t helper_pmovmskb(uint64_t fs) 734*a2b0a27dSPhilippe Mathieu-Daudé { 735*a2b0a27dSPhilippe Mathieu-Daudé unsigned fd = 0; 736*a2b0a27dSPhilippe Mathieu-Daudé 737*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 7) & 1) << 0; 738*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 15) & 1) << 1; 739*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 23) & 1) << 2; 740*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 31) & 1) << 3; 741*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 39) & 1) << 4; 742*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 47) & 1) << 5; 743*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 55) & 1) << 6; 744*a2b0a27dSPhilippe Mathieu-Daudé fd |= ((fs >> 63) & 1) << 7; 745*a2b0a27dSPhilippe Mathieu-Daudé 746*a2b0a27dSPhilippe Mathieu-Daudé return fd & 0xff; 747*a2b0a27dSPhilippe Mathieu-Daudé } 748