1 /* 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support 3 * 4 * Copyright (c) 2005 Fabrice Bellard 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #include "crypto/aes.h" 22 23 #if SHIFT == 0 24 #define Reg MMXReg 25 #define SIZE 8 26 #define XMM_ONLY(...) 27 #define B(n) MMX_B(n) 28 #define W(n) MMX_W(n) 29 #define L(n) MMX_L(n) 30 #define Q(n) MMX_Q(n) 31 #define SUFFIX _mmx 32 #else 33 #define Reg ZMMReg 34 #define SIZE 16 35 #define XMM_ONLY(...) __VA_ARGS__ 36 #define B(n) ZMM_B(n) 37 #define W(n) ZMM_W(n) 38 #define L(n) ZMM_L(n) 39 #define Q(n) ZMM_Q(n) 40 #define SUFFIX _xmm 41 #endif 42 43 /* 44 * Copy the relevant parts of a Reg value around. In the case where 45 * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of 46 * a 64 byte ZMMReg, so we must copy only those and keep the top bytes 47 * untouched in the guest-visible destination destination register. 48 * Note that the "lower bytes" are placed last in memory on big-endian 49 * hosts, which store the vector backwards in memory. In that case the 50 * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of 51 * the little-endian case. 52 */ 53 #if HOST_BIG_ENDIAN 54 #define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE) 55 #else 56 #define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) 57 #endif 58 59 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 60 { 61 int shift; 62 63 if (s->Q(0) > 15) { 64 d->Q(0) = 0; 65 #if SHIFT == 1 66 d->Q(1) = 0; 67 #endif 68 } else { 69 shift = s->B(0); 70 d->W(0) >>= shift; 71 d->W(1) >>= shift; 72 d->W(2) >>= shift; 73 d->W(3) >>= shift; 74 #if SHIFT == 1 75 d->W(4) >>= shift; 76 d->W(5) >>= shift; 77 d->W(6) >>= shift; 78 d->W(7) >>= shift; 79 #endif 80 } 81 } 82 83 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 84 { 85 int shift; 86 87 if (s->Q(0) > 15) { 88 shift = 15; 89 } else { 90 shift = s->B(0); 91 } 92 d->W(0) = (int16_t)d->W(0) >> shift; 93 d->W(1) = (int16_t)d->W(1) >> shift; 94 d->W(2) = (int16_t)d->W(2) >> shift; 95 d->W(3) = (int16_t)d->W(3) >> shift; 96 #if SHIFT == 1 97 d->W(4) = (int16_t)d->W(4) >> shift; 98 d->W(5) = (int16_t)d->W(5) >> shift; 99 d->W(6) = (int16_t)d->W(6) >> shift; 100 d->W(7) = (int16_t)d->W(7) >> shift; 101 #endif 102 } 103 104 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 105 { 106 int shift; 107 108 if (s->Q(0) > 15) { 109 d->Q(0) = 0; 110 #if SHIFT == 1 111 d->Q(1) = 0; 112 #endif 113 } else { 114 shift = s->B(0); 115 d->W(0) <<= shift; 116 d->W(1) <<= shift; 117 d->W(2) <<= shift; 118 d->W(3) <<= shift; 119 #if SHIFT == 1 120 d->W(4) <<= shift; 121 d->W(5) <<= shift; 122 d->W(6) <<= shift; 123 d->W(7) <<= shift; 124 #endif 125 } 126 } 127 128 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 129 { 130 int shift; 131 132 if (s->Q(0) > 31) { 133 d->Q(0) = 0; 134 #if SHIFT == 1 135 d->Q(1) = 0; 136 #endif 137 } else { 138 shift = s->B(0); 139 d->L(0) >>= shift; 140 d->L(1) >>= shift; 141 #if SHIFT == 1 142 d->L(2) >>= shift; 143 d->L(3) >>= shift; 144 #endif 145 } 146 } 147 148 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 149 { 150 int shift; 151 152 if (s->Q(0) > 31) { 153 shift = 31; 154 } else { 155 shift = s->B(0); 156 } 157 d->L(0) = (int32_t)d->L(0) >> shift; 158 d->L(1) = (int32_t)d->L(1) >> shift; 159 #if SHIFT == 1 160 d->L(2) = (int32_t)d->L(2) >> shift; 161 d->L(3) = (int32_t)d->L(3) >> shift; 162 #endif 163 } 164 165 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 166 { 167 int shift; 168 169 if (s->Q(0) > 31) { 170 d->Q(0) = 0; 171 #if SHIFT == 1 172 d->Q(1) = 0; 173 #endif 174 } else { 175 shift = s->B(0); 176 d->L(0) <<= shift; 177 d->L(1) <<= shift; 178 #if SHIFT == 1 179 d->L(2) <<= shift; 180 d->L(3) <<= shift; 181 #endif 182 } 183 } 184 185 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 186 { 187 int shift; 188 189 if (s->Q(0) > 63) { 190 d->Q(0) = 0; 191 #if SHIFT == 1 192 d->Q(1) = 0; 193 #endif 194 } else { 195 shift = s->B(0); 196 d->Q(0) >>= shift; 197 #if SHIFT == 1 198 d->Q(1) >>= shift; 199 #endif 200 } 201 } 202 203 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 204 { 205 int shift; 206 207 if (s->Q(0) > 63) { 208 d->Q(0) = 0; 209 #if SHIFT == 1 210 d->Q(1) = 0; 211 #endif 212 } else { 213 shift = s->B(0); 214 d->Q(0) <<= shift; 215 #if SHIFT == 1 216 d->Q(1) <<= shift; 217 #endif 218 } 219 } 220 221 #if SHIFT == 1 222 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 223 { 224 int shift, i; 225 226 shift = s->L(0); 227 if (shift > 16) { 228 shift = 16; 229 } 230 for (i = 0; i < 16 - shift; i++) { 231 d->B(i) = d->B(i + shift); 232 } 233 for (i = 16 - shift; i < 16; i++) { 234 d->B(i) = 0; 235 } 236 } 237 238 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 239 { 240 int shift, i; 241 242 shift = s->L(0); 243 if (shift > 16) { 244 shift = 16; 245 } 246 for (i = 15; i >= shift; i--) { 247 d->B(i) = d->B(i - shift); 248 } 249 for (i = 0; i < shift; i++) { 250 d->B(i) = 0; 251 } 252 } 253 #endif 254 255 #define SSE_HELPER_B(name, F) \ 256 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 257 { \ 258 d->B(0) = F(d->B(0), s->B(0)); \ 259 d->B(1) = F(d->B(1), s->B(1)); \ 260 d->B(2) = F(d->B(2), s->B(2)); \ 261 d->B(3) = F(d->B(3), s->B(3)); \ 262 d->B(4) = F(d->B(4), s->B(4)); \ 263 d->B(5) = F(d->B(5), s->B(5)); \ 264 d->B(6) = F(d->B(6), s->B(6)); \ 265 d->B(7) = F(d->B(7), s->B(7)); \ 266 XMM_ONLY( \ 267 d->B(8) = F(d->B(8), s->B(8)); \ 268 d->B(9) = F(d->B(9), s->B(9)); \ 269 d->B(10) = F(d->B(10), s->B(10)); \ 270 d->B(11) = F(d->B(11), s->B(11)); \ 271 d->B(12) = F(d->B(12), s->B(12)); \ 272 d->B(13) = F(d->B(13), s->B(13)); \ 273 d->B(14) = F(d->B(14), s->B(14)); \ 274 d->B(15) = F(d->B(15), s->B(15)); \ 275 ) \ 276 } 277 278 #define SSE_HELPER_W(name, F) \ 279 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 280 { \ 281 d->W(0) = F(d->W(0), s->W(0)); \ 282 d->W(1) = F(d->W(1), s->W(1)); \ 283 d->W(2) = F(d->W(2), s->W(2)); \ 284 d->W(3) = F(d->W(3), s->W(3)); \ 285 XMM_ONLY( \ 286 d->W(4) = F(d->W(4), s->W(4)); \ 287 d->W(5) = F(d->W(5), s->W(5)); \ 288 d->W(6) = F(d->W(6), s->W(6)); \ 289 d->W(7) = F(d->W(7), s->W(7)); \ 290 ) \ 291 } 292 293 #define SSE_HELPER_L(name, F) \ 294 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 295 { \ 296 d->L(0) = F(d->L(0), s->L(0)); \ 297 d->L(1) = F(d->L(1), s->L(1)); \ 298 XMM_ONLY( \ 299 d->L(2) = F(d->L(2), s->L(2)); \ 300 d->L(3) = F(d->L(3), s->L(3)); \ 301 ) \ 302 } 303 304 #define SSE_HELPER_Q(name, F) \ 305 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 306 { \ 307 d->Q(0) = F(d->Q(0), s->Q(0)); \ 308 XMM_ONLY( \ 309 d->Q(1) = F(d->Q(1), s->Q(1)); \ 310 ) \ 311 } 312 313 #if SHIFT == 0 314 static inline int satub(int x) 315 { 316 if (x < 0) { 317 return 0; 318 } else if (x > 255) { 319 return 255; 320 } else { 321 return x; 322 } 323 } 324 325 static inline int satuw(int x) 326 { 327 if (x < 0) { 328 return 0; 329 } else if (x > 65535) { 330 return 65535; 331 } else { 332 return x; 333 } 334 } 335 336 static inline int satsb(int x) 337 { 338 if (x < -128) { 339 return -128; 340 } else if (x > 127) { 341 return 127; 342 } else { 343 return x; 344 } 345 } 346 347 static inline int satsw(int x) 348 { 349 if (x < -32768) { 350 return -32768; 351 } else if (x > 32767) { 352 return 32767; 353 } else { 354 return x; 355 } 356 } 357 358 #define FADD(a, b) ((a) + (b)) 359 #define FADDUB(a, b) satub((a) + (b)) 360 #define FADDUW(a, b) satuw((a) + (b)) 361 #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b)) 362 #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b)) 363 364 #define FSUB(a, b) ((a) - (b)) 365 #define FSUBUB(a, b) satub((a) - (b)) 366 #define FSUBUW(a, b) satuw((a) - (b)) 367 #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b)) 368 #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b)) 369 #define FMINUB(a, b) ((a) < (b)) ? (a) : (b) 370 #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b) 371 #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) 372 #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b) 373 374 #define FAND(a, b) ((a) & (b)) 375 #define FANDN(a, b) ((~(a)) & (b)) 376 #define FOR(a, b) ((a) | (b)) 377 #define FXOR(a, b) ((a) ^ (b)) 378 379 #define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0) 380 #define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0) 381 #define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0) 382 #define FCMPEQ(a, b) ((a) == (b) ? -1 : 0) 383 384 #define FMULLW(a, b) ((a) * (b)) 385 #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16) 386 #define FMULHUW(a, b) ((a) * (b) >> 16) 387 #define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16) 388 389 #define FAVG(a, b) (((a) + (b) + 1) >> 1) 390 #endif 391 392 SSE_HELPER_B(helper_paddb, FADD) 393 SSE_HELPER_W(helper_paddw, FADD) 394 SSE_HELPER_L(helper_paddl, FADD) 395 SSE_HELPER_Q(helper_paddq, FADD) 396 397 SSE_HELPER_B(helper_psubb, FSUB) 398 SSE_HELPER_W(helper_psubw, FSUB) 399 SSE_HELPER_L(helper_psubl, FSUB) 400 SSE_HELPER_Q(helper_psubq, FSUB) 401 402 SSE_HELPER_B(helper_paddusb, FADDUB) 403 SSE_HELPER_B(helper_paddsb, FADDSB) 404 SSE_HELPER_B(helper_psubusb, FSUBUB) 405 SSE_HELPER_B(helper_psubsb, FSUBSB) 406 407 SSE_HELPER_W(helper_paddusw, FADDUW) 408 SSE_HELPER_W(helper_paddsw, FADDSW) 409 SSE_HELPER_W(helper_psubusw, FSUBUW) 410 SSE_HELPER_W(helper_psubsw, FSUBSW) 411 412 SSE_HELPER_B(helper_pminub, FMINUB) 413 SSE_HELPER_B(helper_pmaxub, FMAXUB) 414 415 SSE_HELPER_W(helper_pminsw, FMINSW) 416 SSE_HELPER_W(helper_pmaxsw, FMAXSW) 417 418 SSE_HELPER_Q(helper_pand, FAND) 419 SSE_HELPER_Q(helper_pandn, FANDN) 420 SSE_HELPER_Q(helper_por, FOR) 421 SSE_HELPER_Q(helper_pxor, FXOR) 422 423 SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) 424 SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) 425 SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) 426 427 SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) 428 SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) 429 SSE_HELPER_L(helper_pcmpeql, FCMPEQ) 430 431 SSE_HELPER_W(helper_pmullw, FMULLW) 432 #if SHIFT == 0 433 SSE_HELPER_W(helper_pmulhrw, FMULHRW) 434 #endif 435 SSE_HELPER_W(helper_pmulhuw, FMULHUW) 436 SSE_HELPER_W(helper_pmulhw, FMULHW) 437 438 SSE_HELPER_B(helper_pavgb, FAVG) 439 SSE_HELPER_W(helper_pavgw, FAVG) 440 441 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 442 { 443 d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0); 444 #if SHIFT == 1 445 d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2); 446 #endif 447 } 448 449 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 450 { 451 int i; 452 453 for (i = 0; i < (2 << SHIFT); i++) { 454 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) + 455 (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1); 456 } 457 } 458 459 #if SHIFT == 0 460 static inline int abs1(int a) 461 { 462 if (a < 0) { 463 return -a; 464 } else { 465 return a; 466 } 467 } 468 #endif 469 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 470 { 471 unsigned int val; 472 473 val = 0; 474 val += abs1(d->B(0) - s->B(0)); 475 val += abs1(d->B(1) - s->B(1)); 476 val += abs1(d->B(2) - s->B(2)); 477 val += abs1(d->B(3) - s->B(3)); 478 val += abs1(d->B(4) - s->B(4)); 479 val += abs1(d->B(5) - s->B(5)); 480 val += abs1(d->B(6) - s->B(6)); 481 val += abs1(d->B(7) - s->B(7)); 482 d->Q(0) = val; 483 #if SHIFT == 1 484 val = 0; 485 val += abs1(d->B(8) - s->B(8)); 486 val += abs1(d->B(9) - s->B(9)); 487 val += abs1(d->B(10) - s->B(10)); 488 val += abs1(d->B(11) - s->B(11)); 489 val += abs1(d->B(12) - s->B(12)); 490 val += abs1(d->B(13) - s->B(13)); 491 val += abs1(d->B(14) - s->B(14)); 492 val += abs1(d->B(15) - s->B(15)); 493 d->Q(1) = val; 494 #endif 495 } 496 497 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 498 target_ulong a0) 499 { 500 int i; 501 502 for (i = 0; i < (8 << SHIFT); i++) { 503 if (s->B(i) & 0x80) { 504 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC()); 505 } 506 } 507 } 508 509 void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) 510 { 511 d->L(0) = val; 512 d->L(1) = 0; 513 #if SHIFT == 1 514 d->Q(1) = 0; 515 #endif 516 } 517 518 #ifdef TARGET_X86_64 519 void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) 520 { 521 d->Q(0) = val; 522 #if SHIFT == 1 523 d->Q(1) = 0; 524 #endif 525 } 526 #endif 527 528 #if SHIFT == 0 529 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) 530 { 531 Reg r; 532 533 r.W(0) = s->W(order & 3); 534 r.W(1) = s->W((order >> 2) & 3); 535 r.W(2) = s->W((order >> 4) & 3); 536 r.W(3) = s->W((order >> 6) & 3); 537 MOVE(*d, r); 538 } 539 #else 540 void helper_shufps(Reg *d, Reg *s, int order) 541 { 542 Reg r; 543 544 r.L(0) = d->L(order & 3); 545 r.L(1) = d->L((order >> 2) & 3); 546 r.L(2) = s->L((order >> 4) & 3); 547 r.L(3) = s->L((order >> 6) & 3); 548 MOVE(*d, r); 549 } 550 551 void helper_shufpd(Reg *d, Reg *s, int order) 552 { 553 Reg r; 554 555 r.Q(0) = d->Q(order & 1); 556 r.Q(1) = s->Q((order >> 1) & 1); 557 MOVE(*d, r); 558 } 559 560 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) 561 { 562 Reg r; 563 564 r.L(0) = s->L(order & 3); 565 r.L(1) = s->L((order >> 2) & 3); 566 r.L(2) = s->L((order >> 4) & 3); 567 r.L(3) = s->L((order >> 6) & 3); 568 MOVE(*d, r); 569 } 570 571 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) 572 { 573 Reg r; 574 575 r.W(0) = s->W(order & 3); 576 r.W(1) = s->W((order >> 2) & 3); 577 r.W(2) = s->W((order >> 4) & 3); 578 r.W(3) = s->W((order >> 6) & 3); 579 r.Q(1) = s->Q(1); 580 MOVE(*d, r); 581 } 582 583 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) 584 { 585 Reg r; 586 587 r.Q(0) = s->Q(0); 588 r.W(4) = s->W(4 + (order & 3)); 589 r.W(5) = s->W(4 + ((order >> 2) & 3)); 590 r.W(6) = s->W(4 + ((order >> 4) & 3)); 591 r.W(7) = s->W(4 + ((order >> 6) & 3)); 592 MOVE(*d, r); 593 } 594 #endif 595 596 #if SHIFT == 1 597 /* FPU ops */ 598 /* XXX: not accurate */ 599 600 #define SSE_HELPER_S(name, F) \ 601 void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ 602 { \ 603 d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ 604 d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ 605 d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ 606 d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ 607 } \ 608 \ 609 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ 610 { \ 611 d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ 612 } \ 613 \ 614 void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ 615 { \ 616 d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ 617 d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ 618 } \ 619 \ 620 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ 621 { \ 622 d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ 623 } 624 625 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) 626 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) 627 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) 628 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) 629 #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status) 630 631 /* Note that the choice of comparison op here is important to get the 632 * special cases right: for min and max Intel specifies that (-0,0), 633 * (NaN, anything) and (anything, NaN) return the second argument. 634 */ 635 #define FPU_MIN(size, a, b) \ 636 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b)) 637 #define FPU_MAX(size, a, b) \ 638 (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b)) 639 640 SSE_HELPER_S(add, FPU_ADD) 641 SSE_HELPER_S(sub, FPU_SUB) 642 SSE_HELPER_S(mul, FPU_MUL) 643 SSE_HELPER_S(div, FPU_DIV) 644 SSE_HELPER_S(min, FPU_MIN) 645 SSE_HELPER_S(max, FPU_MAX) 646 SSE_HELPER_S(sqrt, FPU_SQRT) 647 648 649 /* float to float conversions */ 650 void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s) 651 { 652 float32 s0, s1; 653 654 s0 = s->ZMM_S(0); 655 s1 = s->ZMM_S(1); 656 d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status); 657 d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status); 658 } 659 660 void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s) 661 { 662 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); 663 d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status); 664 d->Q(1) = 0; 665 } 666 667 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) 668 { 669 d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status); 670 } 671 672 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) 673 { 674 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); 675 } 676 677 /* integer to float */ 678 void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s) 679 { 680 d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), &env->sse_status); 681 d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status); 682 d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status); 683 d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status); 684 } 685 686 void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s) 687 { 688 int32_t l0, l1; 689 690 l0 = (int32_t)s->ZMM_L(0); 691 l1 = (int32_t)s->ZMM_L(1); 692 d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status); 693 d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status); 694 } 695 696 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) 697 { 698 d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); 699 d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status); 700 } 701 702 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s) 703 { 704 d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status); 705 d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status); 706 } 707 708 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val) 709 { 710 d->ZMM_S(0) = int32_to_float32(val, &env->sse_status); 711 } 712 713 void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val) 714 { 715 d->ZMM_D(0) = int32_to_float64(val, &env->sse_status); 716 } 717 718 #ifdef TARGET_X86_64 719 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val) 720 { 721 d->ZMM_S(0) = int64_to_float32(val, &env->sse_status); 722 } 723 724 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val) 725 { 726 d->ZMM_D(0) = int64_to_float64(val, &env->sse_status); 727 } 728 #endif 729 730 /* float to integer */ 731 732 /* 733 * x86 mandates that we return the indefinite integer value for the result 734 * of any float-to-integer conversion that raises the 'invalid' exception. 735 * Wrap the softfloat functions to get this behaviour. 736 */ 737 #define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE) \ 738 static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s) \ 739 { \ 740 int oldflags, newflags; \ 741 RETTYPE r; \ 742 \ 743 oldflags = get_float_exception_flags(s); \ 744 set_float_exception_flags(0, s); \ 745 r = FN(a, s); \ 746 newflags = get_float_exception_flags(s); \ 747 if (newflags & float_flag_invalid) { \ 748 r = INDEFVALUE; \ 749 } \ 750 set_float_exception_flags(newflags | oldflags, s); \ 751 return r; \ 752 } 753 754 WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN) 755 WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN) 756 WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN) 757 WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN) 758 WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN) 759 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) 760 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) 761 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) 762 763 void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) 764 { 765 d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 766 d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); 767 d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status); 768 d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status); 769 } 770 771 void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) 772 { 773 d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 774 d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); 775 d->ZMM_Q(1) = 0; 776 } 777 778 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 779 { 780 d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 781 d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); 782 } 783 784 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 785 { 786 d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 787 d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); 788 } 789 790 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s) 791 { 792 return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 793 } 794 795 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s) 796 { 797 return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 798 } 799 800 #ifdef TARGET_X86_64 801 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s) 802 { 803 return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status); 804 } 805 806 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) 807 { 808 return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status); 809 } 810 #endif 811 812 /* float to integer truncated */ 813 void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) 814 { 815 d->ZMM_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 816 d->ZMM_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status); 817 d->ZMM_L(2) = x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status); 818 d->ZMM_L(3) = x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status); 819 } 820 821 void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) 822 { 823 d->ZMM_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 824 d->ZMM_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status); 825 d->ZMM_Q(1) = 0; 826 } 827 828 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 829 { 830 d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 831 d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status); 832 } 833 834 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 835 { 836 d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 837 d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status); 838 } 839 840 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s) 841 { 842 return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 843 } 844 845 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s) 846 { 847 return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 848 } 849 850 #ifdef TARGET_X86_64 851 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s) 852 { 853 return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status); 854 } 855 856 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) 857 { 858 return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status); 859 } 860 #endif 861 862 void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s) 863 { 864 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 865 d->ZMM_S(0) = float32_div(float32_one, 866 float32_sqrt(s->ZMM_S(0), &env->sse_status), 867 &env->sse_status); 868 d->ZMM_S(1) = float32_div(float32_one, 869 float32_sqrt(s->ZMM_S(1), &env->sse_status), 870 &env->sse_status); 871 d->ZMM_S(2) = float32_div(float32_one, 872 float32_sqrt(s->ZMM_S(2), &env->sse_status), 873 &env->sse_status); 874 d->ZMM_S(3) = float32_div(float32_one, 875 float32_sqrt(s->ZMM_S(3), &env->sse_status), 876 &env->sse_status); 877 set_float_exception_flags(old_flags, &env->sse_status); 878 } 879 880 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) 881 { 882 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 883 d->ZMM_S(0) = float32_div(float32_one, 884 float32_sqrt(s->ZMM_S(0), &env->sse_status), 885 &env->sse_status); 886 set_float_exception_flags(old_flags, &env->sse_status); 887 } 888 889 void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s) 890 { 891 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 892 d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); 893 d->ZMM_S(1) = float32_div(float32_one, s->ZMM_S(1), &env->sse_status); 894 d->ZMM_S(2) = float32_div(float32_one, s->ZMM_S(2), &env->sse_status); 895 d->ZMM_S(3) = float32_div(float32_one, s->ZMM_S(3), &env->sse_status); 896 set_float_exception_flags(old_flags, &env->sse_status); 897 } 898 899 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s) 900 { 901 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 902 d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); 903 set_float_exception_flags(old_flags, &env->sse_status); 904 } 905 906 static inline uint64_t helper_extrq(uint64_t src, int shift, int len) 907 { 908 uint64_t mask; 909 910 if (len == 0) { 911 mask = ~0LL; 912 } else { 913 mask = (1ULL << len) - 1; 914 } 915 return (src >> shift) & mask; 916 } 917 918 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 919 { 920 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0)); 921 } 922 923 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length) 924 { 925 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length); 926 } 927 928 static inline uint64_t helper_insertq(uint64_t src, int shift, int len) 929 { 930 uint64_t mask; 931 932 if (len == 0) { 933 mask = ~0ULL; 934 } else { 935 mask = (1ULL << len) - 1; 936 } 937 return (src & ~(mask << shift)) | ((src & mask) << shift); 938 } 939 940 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 941 { 942 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8)); 943 } 944 945 void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) 946 { 947 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); 948 } 949 950 void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s) 951 { 952 ZMMReg r; 953 954 r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); 955 r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); 956 r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); 957 r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); 958 MOVE(*d, r); 959 } 960 961 void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) 962 { 963 ZMMReg r; 964 965 r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); 966 r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); 967 MOVE(*d, r); 968 } 969 970 void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) 971 { 972 ZMMReg r; 973 974 r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); 975 r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); 976 r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); 977 r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); 978 MOVE(*d, r); 979 } 980 981 void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) 982 { 983 ZMMReg r; 984 985 r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); 986 r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); 987 MOVE(*d, r); 988 } 989 990 void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) 991 { 992 d->ZMM_S(0) = float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status); 993 d->ZMM_S(1) = float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status); 994 d->ZMM_S(2) = float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status); 995 d->ZMM_S(3) = float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status); 996 } 997 998 void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) 999 { 1000 d->ZMM_D(0) = float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status); 1001 d->ZMM_D(1) = float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status); 1002 } 1003 1004 /* XXX: unordered */ 1005 #define SSE_HELPER_CMP(name, F) \ 1006 void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ 1007 { \ 1008 d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ 1009 d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ 1010 d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ 1011 d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ 1012 } \ 1013 \ 1014 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ 1015 { \ 1016 d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ 1017 } \ 1018 \ 1019 void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ 1020 { \ 1021 d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ 1022 d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ 1023 } \ 1024 \ 1025 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ 1026 { \ 1027 d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ 1028 } 1029 1030 #define FPU_CMPEQ(size, a, b) \ 1031 (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0) 1032 #define FPU_CMPLT(size, a, b) \ 1033 (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0) 1034 #define FPU_CMPLE(size, a, b) \ 1035 (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0) 1036 #define FPU_CMPUNORD(size, a, b) \ 1037 (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0) 1038 #define FPU_CMPNEQ(size, a, b) \ 1039 (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1) 1040 #define FPU_CMPNLT(size, a, b) \ 1041 (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1) 1042 #define FPU_CMPNLE(size, a, b) \ 1043 (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1) 1044 #define FPU_CMPORD(size, a, b) \ 1045 (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1) 1046 1047 SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) 1048 SSE_HELPER_CMP(cmplt, FPU_CMPLT) 1049 SSE_HELPER_CMP(cmple, FPU_CMPLE) 1050 SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) 1051 SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) 1052 SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) 1053 SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) 1054 SSE_HELPER_CMP(cmpord, FPU_CMPORD) 1055 1056 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; 1057 1058 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s) 1059 { 1060 FloatRelation ret; 1061 float32 s0, s1; 1062 1063 s0 = d->ZMM_S(0); 1064 s1 = s->ZMM_S(0); 1065 ret = float32_compare_quiet(s0, s1, &env->sse_status); 1066 CC_SRC = comis_eflags[ret + 1]; 1067 } 1068 1069 void helper_comiss(CPUX86State *env, Reg *d, Reg *s) 1070 { 1071 FloatRelation ret; 1072 float32 s0, s1; 1073 1074 s0 = d->ZMM_S(0); 1075 s1 = s->ZMM_S(0); 1076 ret = float32_compare(s0, s1, &env->sse_status); 1077 CC_SRC = comis_eflags[ret + 1]; 1078 } 1079 1080 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s) 1081 { 1082 FloatRelation ret; 1083 float64 d0, d1; 1084 1085 d0 = d->ZMM_D(0); 1086 d1 = s->ZMM_D(0); 1087 ret = float64_compare_quiet(d0, d1, &env->sse_status); 1088 CC_SRC = comis_eflags[ret + 1]; 1089 } 1090 1091 void helper_comisd(CPUX86State *env, Reg *d, Reg *s) 1092 { 1093 FloatRelation ret; 1094 float64 d0, d1; 1095 1096 d0 = d->ZMM_D(0); 1097 d1 = s->ZMM_D(0); 1098 ret = float64_compare(d0, d1, &env->sse_status); 1099 CC_SRC = comis_eflags[ret + 1]; 1100 } 1101 1102 uint32_t helper_movmskps(CPUX86State *env, Reg *s) 1103 { 1104 int b0, b1, b2, b3; 1105 1106 b0 = s->ZMM_L(0) >> 31; 1107 b1 = s->ZMM_L(1) >> 31; 1108 b2 = s->ZMM_L(2) >> 31; 1109 b3 = s->ZMM_L(3) >> 31; 1110 return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); 1111 } 1112 1113 uint32_t helper_movmskpd(CPUX86State *env, Reg *s) 1114 { 1115 int b0, b1; 1116 1117 b0 = s->ZMM_L(1) >> 31; 1118 b1 = s->ZMM_L(3) >> 31; 1119 return b0 | (b1 << 1); 1120 } 1121 1122 #endif 1123 1124 uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) 1125 { 1126 uint32_t val; 1127 1128 val = 0; 1129 val |= (s->B(0) >> 7); 1130 val |= (s->B(1) >> 6) & 0x02; 1131 val |= (s->B(2) >> 5) & 0x04; 1132 val |= (s->B(3) >> 4) & 0x08; 1133 val |= (s->B(4) >> 3) & 0x10; 1134 val |= (s->B(5) >> 2) & 0x20; 1135 val |= (s->B(6) >> 1) & 0x40; 1136 val |= (s->B(7)) & 0x80; 1137 #if SHIFT == 1 1138 val |= (s->B(8) << 1) & 0x0100; 1139 val |= (s->B(9) << 2) & 0x0200; 1140 val |= (s->B(10) << 3) & 0x0400; 1141 val |= (s->B(11) << 4) & 0x0800; 1142 val |= (s->B(12) << 5) & 0x1000; 1143 val |= (s->B(13) << 6) & 0x2000; 1144 val |= (s->B(14) << 7) & 0x4000; 1145 val |= (s->B(15) << 8) & 0x8000; 1146 #endif 1147 return val; 1148 } 1149 1150 void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1151 { 1152 Reg r; 1153 1154 r.B(0) = satsb((int16_t)d->W(0)); 1155 r.B(1) = satsb((int16_t)d->W(1)); 1156 r.B(2) = satsb((int16_t)d->W(2)); 1157 r.B(3) = satsb((int16_t)d->W(3)); 1158 #if SHIFT == 1 1159 r.B(4) = satsb((int16_t)d->W(4)); 1160 r.B(5) = satsb((int16_t)d->W(5)); 1161 r.B(6) = satsb((int16_t)d->W(6)); 1162 r.B(7) = satsb((int16_t)d->W(7)); 1163 #endif 1164 r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); 1165 r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1)); 1166 r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); 1167 r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); 1168 #if SHIFT == 1 1169 r.B(12) = satsb((int16_t)s->W(4)); 1170 r.B(13) = satsb((int16_t)s->W(5)); 1171 r.B(14) = satsb((int16_t)s->W(6)); 1172 r.B(15) = satsb((int16_t)s->W(7)); 1173 #endif 1174 MOVE(*d, r); 1175 } 1176 1177 void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1178 { 1179 Reg r; 1180 1181 r.B(0) = satub((int16_t)d->W(0)); 1182 r.B(1) = satub((int16_t)d->W(1)); 1183 r.B(2) = satub((int16_t)d->W(2)); 1184 r.B(3) = satub((int16_t)d->W(3)); 1185 #if SHIFT == 1 1186 r.B(4) = satub((int16_t)d->W(4)); 1187 r.B(5) = satub((int16_t)d->W(5)); 1188 r.B(6) = satub((int16_t)d->W(6)); 1189 r.B(7) = satub((int16_t)d->W(7)); 1190 #endif 1191 r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0)); 1192 r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1)); 1193 r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2)); 1194 r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3)); 1195 #if SHIFT == 1 1196 r.B(12) = satub((int16_t)s->W(4)); 1197 r.B(13) = satub((int16_t)s->W(5)); 1198 r.B(14) = satub((int16_t)s->W(6)); 1199 r.B(15) = satub((int16_t)s->W(7)); 1200 #endif 1201 MOVE(*d, r); 1202 } 1203 1204 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1205 { 1206 Reg r; 1207 1208 r.W(0) = satsw(d->L(0)); 1209 r.W(1) = satsw(d->L(1)); 1210 #if SHIFT == 1 1211 r.W(2) = satsw(d->L(2)); 1212 r.W(3) = satsw(d->L(3)); 1213 #endif 1214 r.W((2 << SHIFT) + 0) = satsw(s->L(0)); 1215 r.W((2 << SHIFT) + 1) = satsw(s->L(1)); 1216 #if SHIFT == 1 1217 r.W(6) = satsw(s->L(2)); 1218 r.W(7) = satsw(s->L(3)); 1219 #endif 1220 MOVE(*d, r); 1221 } 1222 1223 #define UNPCK_OP(base_name, base) \ 1224 \ 1225 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ 1226 Reg *d, Reg *s) \ 1227 { \ 1228 Reg r; \ 1229 \ 1230 r.B(0) = d->B((base << (SHIFT + 2)) + 0); \ 1231 r.B(1) = s->B((base << (SHIFT + 2)) + 0); \ 1232 r.B(2) = d->B((base << (SHIFT + 2)) + 1); \ 1233 r.B(3) = s->B((base << (SHIFT + 2)) + 1); \ 1234 r.B(4) = d->B((base << (SHIFT + 2)) + 2); \ 1235 r.B(5) = s->B((base << (SHIFT + 2)) + 2); \ 1236 r.B(6) = d->B((base << (SHIFT + 2)) + 3); \ 1237 r.B(7) = s->B((base << (SHIFT + 2)) + 3); \ 1238 XMM_ONLY( \ 1239 r.B(8) = d->B((base << (SHIFT + 2)) + 4); \ 1240 r.B(9) = s->B((base << (SHIFT + 2)) + 4); \ 1241 r.B(10) = d->B((base << (SHIFT + 2)) + 5); \ 1242 r.B(11) = s->B((base << (SHIFT + 2)) + 5); \ 1243 r.B(12) = d->B((base << (SHIFT + 2)) + 6); \ 1244 r.B(13) = s->B((base << (SHIFT + 2)) + 6); \ 1245 r.B(14) = d->B((base << (SHIFT + 2)) + 7); \ 1246 r.B(15) = s->B((base << (SHIFT + 2)) + 7); \ 1247 ) \ 1248 MOVE(*d, r); \ 1249 } \ 1250 \ 1251 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ 1252 Reg *d, Reg *s) \ 1253 { \ 1254 Reg r; \ 1255 \ 1256 r.W(0) = d->W((base << (SHIFT + 1)) + 0); \ 1257 r.W(1) = s->W((base << (SHIFT + 1)) + 0); \ 1258 r.W(2) = d->W((base << (SHIFT + 1)) + 1); \ 1259 r.W(3) = s->W((base << (SHIFT + 1)) + 1); \ 1260 XMM_ONLY( \ 1261 r.W(4) = d->W((base << (SHIFT + 1)) + 2); \ 1262 r.W(5) = s->W((base << (SHIFT + 1)) + 2); \ 1263 r.W(6) = d->W((base << (SHIFT + 1)) + 3); \ 1264 r.W(7) = s->W((base << (SHIFT + 1)) + 3); \ 1265 ) \ 1266 MOVE(*d, r); \ 1267 } \ 1268 \ 1269 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ 1270 Reg *d, Reg *s) \ 1271 { \ 1272 Reg r; \ 1273 \ 1274 r.L(0) = d->L((base << SHIFT) + 0); \ 1275 r.L(1) = s->L((base << SHIFT) + 0); \ 1276 XMM_ONLY( \ 1277 r.L(2) = d->L((base << SHIFT) + 1); \ 1278 r.L(3) = s->L((base << SHIFT) + 1); \ 1279 ) \ 1280 MOVE(*d, r); \ 1281 } \ 1282 \ 1283 XMM_ONLY( \ 1284 void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \ 1285 *env, \ 1286 Reg *d, \ 1287 Reg *s) \ 1288 { \ 1289 Reg r; \ 1290 \ 1291 r.Q(0) = d->Q(base); \ 1292 r.Q(1) = s->Q(base); \ 1293 MOVE(*d, r); \ 1294 } \ 1295 ) 1296 1297 UNPCK_OP(l, 0) 1298 UNPCK_OP(h, 1) 1299 1300 /* 3DNow! float ops */ 1301 #if SHIFT == 0 1302 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) 1303 { 1304 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status); 1305 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status); 1306 } 1307 1308 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s) 1309 { 1310 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status); 1311 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status); 1312 } 1313 1314 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s) 1315 { 1316 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status); 1317 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status); 1318 } 1319 1320 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s) 1321 { 1322 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), 1323 &env->mmx_status)); 1324 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), 1325 &env->mmx_status)); 1326 } 1327 1328 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1329 { 1330 MMXReg r; 1331 1332 r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1333 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1334 MOVE(*d, r); 1335 } 1336 1337 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s) 1338 { 1339 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1340 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1341 } 1342 1343 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s) 1344 { 1345 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0), 1346 &env->mmx_status) ? -1 : 0; 1347 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1), 1348 &env->mmx_status) ? -1 : 0; 1349 } 1350 1351 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s) 1352 { 1353 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), 1354 &env->mmx_status) ? -1 : 0; 1355 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), 1356 &env->mmx_status) ? -1 : 0; 1357 } 1358 1359 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s) 1360 { 1361 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), 1362 &env->mmx_status) ? -1 : 0; 1363 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), 1364 &env->mmx_status) ? -1 : 0; 1365 } 1366 1367 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s) 1368 { 1369 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) { 1370 d->MMX_S(0) = s->MMX_S(0); 1371 } 1372 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) { 1373 d->MMX_S(1) = s->MMX_S(1); 1374 } 1375 } 1376 1377 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s) 1378 { 1379 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) { 1380 d->MMX_S(0) = s->MMX_S(0); 1381 } 1382 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) { 1383 d->MMX_S(1) = s->MMX_S(1); 1384 } 1385 } 1386 1387 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s) 1388 { 1389 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1390 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1391 } 1392 1393 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1394 { 1395 MMXReg r; 1396 1397 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1398 r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1399 MOVE(*d, r); 1400 } 1401 1402 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1403 { 1404 MMXReg r; 1405 1406 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1407 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1408 MOVE(*d, r); 1409 } 1410 1411 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s) 1412 { 1413 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status); 1414 d->MMX_S(1) = d->MMX_S(0); 1415 } 1416 1417 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s) 1418 { 1419 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff; 1420 d->MMX_S(1) = float32_div(float32_one, 1421 float32_sqrt(d->MMX_S(1), &env->mmx_status), 1422 &env->mmx_status); 1423 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000; 1424 d->MMX_L(0) = d->MMX_L(1); 1425 } 1426 1427 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s) 1428 { 1429 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1430 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1431 } 1432 1433 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s) 1434 { 1435 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status); 1436 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status); 1437 } 1438 1439 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s) 1440 { 1441 MMXReg r; 1442 1443 r.MMX_L(0) = s->MMX_L(1); 1444 r.MMX_L(1) = s->MMX_L(0); 1445 MOVE(*d, r); 1446 } 1447 #endif 1448 1449 /* SSSE3 op helpers */ 1450 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1451 { 1452 int i; 1453 Reg r; 1454 1455 for (i = 0; i < (8 << SHIFT); i++) { 1456 r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); 1457 } 1458 1459 MOVE(*d, r); 1460 } 1461 1462 void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1463 { 1464 1465 Reg r; 1466 1467 r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1); 1468 r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3); 1469 XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5)); 1470 XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7)); 1471 r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1); 1472 r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3); 1473 XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5)); 1474 XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7)); 1475 1476 MOVE(*d, r); 1477 } 1478 1479 void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1480 { 1481 Reg r; 1482 1483 r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1); 1484 XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3)); 1485 r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1); 1486 XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3)); 1487 1488 MOVE(*d, r); 1489 } 1490 1491 void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1492 { 1493 Reg r; 1494 1495 r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1)); 1496 r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3)); 1497 XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5))); 1498 XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7))); 1499 r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1)); 1500 r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3)); 1501 XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5))); 1502 XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7))); 1503 1504 MOVE(*d, r); 1505 } 1506 1507 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1508 { 1509 d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) + 1510 (int8_t)s->B(1) * (uint8_t)d->B(1)); 1511 d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) + 1512 (int8_t)s->B(3) * (uint8_t)d->B(3)); 1513 d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) + 1514 (int8_t)s->B(5) * (uint8_t)d->B(5)); 1515 d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) + 1516 (int8_t)s->B(7) * (uint8_t)d->B(7)); 1517 #if SHIFT == 1 1518 d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) + 1519 (int8_t)s->B(9) * (uint8_t)d->B(9)); 1520 d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) + 1521 (int8_t)s->B(11) * (uint8_t)d->B(11)); 1522 d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) + 1523 (int8_t)s->B(13) * (uint8_t)d->B(13)); 1524 d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) + 1525 (int8_t)s->B(15) * (uint8_t)d->B(15)); 1526 #endif 1527 } 1528 1529 void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1530 { 1531 d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1); 1532 d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3); 1533 XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5)); 1534 XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7)); 1535 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1); 1536 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3); 1537 XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5)); 1538 XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7)); 1539 } 1540 1541 void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1542 { 1543 d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1); 1544 XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3)); 1545 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1); 1546 XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3)); 1547 } 1548 1549 void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1550 { 1551 d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1)); 1552 d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3)); 1553 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5))); 1554 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7))); 1555 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1)); 1556 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3)); 1557 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5))); 1558 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7))); 1559 } 1560 1561 #define FABSB(_, x) (x > INT8_MAX ? -(int8_t)x : x) 1562 #define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x) 1563 #define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x) 1564 SSE_HELPER_B(helper_pabsb, FABSB) 1565 SSE_HELPER_W(helper_pabsw, FABSW) 1566 SSE_HELPER_L(helper_pabsd, FABSL) 1567 1568 #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) 1569 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) 1570 1571 #define FSIGNB(d, s) (s <= INT8_MAX ? s ? d : 0 : -(int8_t)d) 1572 #define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d) 1573 #define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d) 1574 SSE_HELPER_B(helper_psignb, FSIGNB) 1575 SSE_HELPER_W(helper_psignw, FSIGNW) 1576 SSE_HELPER_L(helper_psignd, FSIGNL) 1577 1578 void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1579 int32_t shift) 1580 { 1581 Reg r; 1582 1583 /* XXX could be checked during translation */ 1584 if (shift >= (16 << SHIFT)) { 1585 r.Q(0) = 0; 1586 XMM_ONLY(r.Q(1) = 0); 1587 } else { 1588 shift <<= 3; 1589 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) 1590 #if SHIFT == 0 1591 r.Q(0) = SHR(s->Q(0), shift - 0) | 1592 SHR(d->Q(0), shift - 64); 1593 #else 1594 r.Q(0) = SHR(s->Q(0), shift - 0) | 1595 SHR(s->Q(1), shift - 64) | 1596 SHR(d->Q(0), shift - 128) | 1597 SHR(d->Q(1), shift - 192); 1598 r.Q(1) = SHR(s->Q(0), shift + 64) | 1599 SHR(s->Q(1), shift - 0) | 1600 SHR(d->Q(0), shift - 64) | 1601 SHR(d->Q(1), shift - 128); 1602 #endif 1603 #undef SHR 1604 } 1605 1606 MOVE(*d, r); 1607 } 1608 1609 #define XMM0 (env->xmm_regs[0]) 1610 1611 #if SHIFT == 1 1612 #define SSE_HELPER_V(name, elem, num, F) \ 1613 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1614 { \ 1615 d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0)); \ 1616 d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1)); \ 1617 if (num > 2) { \ 1618 d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2)); \ 1619 d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3)); \ 1620 if (num > 4) { \ 1621 d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4)); \ 1622 d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5)); \ 1623 d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6)); \ 1624 d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7)); \ 1625 if (num > 8) { \ 1626 d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \ 1627 d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \ 1628 d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \ 1629 d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \ 1630 d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \ 1631 d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \ 1632 d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \ 1633 d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \ 1634 } \ 1635 } \ 1636 } \ 1637 } 1638 1639 #define SSE_HELPER_I(name, elem, num, F) \ 1640 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \ 1641 { \ 1642 d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1)); \ 1643 d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1)); \ 1644 if (num > 2) { \ 1645 d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1)); \ 1646 d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1)); \ 1647 if (num > 4) { \ 1648 d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1)); \ 1649 d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1)); \ 1650 d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1)); \ 1651 d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1)); \ 1652 if (num > 8) { \ 1653 d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1)); \ 1654 d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1)); \ 1655 d->elem(10) = F(d->elem(10), s->elem(10), \ 1656 ((imm >> 10) & 1)); \ 1657 d->elem(11) = F(d->elem(11), s->elem(11), \ 1658 ((imm >> 11) & 1)); \ 1659 d->elem(12) = F(d->elem(12), s->elem(12), \ 1660 ((imm >> 12) & 1)); \ 1661 d->elem(13) = F(d->elem(13), s->elem(13), \ 1662 ((imm >> 13) & 1)); \ 1663 d->elem(14) = F(d->elem(14), s->elem(14), \ 1664 ((imm >> 14) & 1)); \ 1665 d->elem(15) = F(d->elem(15), s->elem(15), \ 1666 ((imm >> 15) & 1)); \ 1667 } \ 1668 } \ 1669 } \ 1670 } 1671 1672 /* SSE4.1 op helpers */ 1673 #define FBLENDVB(d, s, m) ((m & 0x80) ? s : d) 1674 #define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d) 1675 #define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d) 1676 SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) 1677 SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) 1678 SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) 1679 1680 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1681 { 1682 uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); 1683 uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); 1684 1685 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); 1686 } 1687 1688 #define SSE_HELPER_F(name, elem, num, F) \ 1689 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1690 { \ 1691 if (num > 2) { \ 1692 if (num > 4) { \ 1693 d->elem(7) = F(7); \ 1694 d->elem(6) = F(6); \ 1695 d->elem(5) = F(5); \ 1696 d->elem(4) = F(4); \ 1697 } \ 1698 d->elem(3) = F(3); \ 1699 d->elem(2) = F(2); \ 1700 } \ 1701 d->elem(1) = F(1); \ 1702 d->elem(0) = F(0); \ 1703 } 1704 1705 SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) 1706 SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) 1707 SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) 1708 SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) 1709 SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) 1710 SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) 1711 SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) 1712 SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) 1713 SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) 1714 SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) 1715 SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) 1716 SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) 1717 1718 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1719 { 1720 d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0); 1721 d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2); 1722 } 1723 1724 #define FCMPEQQ(d, s) (d == s ? -1 : 0) 1725 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) 1726 1727 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1728 { 1729 Reg r; 1730 1731 r.W(0) = satuw((int32_t) d->L(0)); 1732 r.W(1) = satuw((int32_t) d->L(1)); 1733 r.W(2) = satuw((int32_t) d->L(2)); 1734 r.W(3) = satuw((int32_t) d->L(3)); 1735 r.W(4) = satuw((int32_t) s->L(0)); 1736 r.W(5) = satuw((int32_t) s->L(1)); 1737 r.W(6) = satuw((int32_t) s->L(2)); 1738 r.W(7) = satuw((int32_t) s->L(3)); 1739 MOVE(*d, r); 1740 } 1741 1742 #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) 1743 #define FMINSD(d, s) MIN((int32_t)d, (int32_t)s) 1744 #define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s) 1745 #define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s) 1746 SSE_HELPER_B(helper_pminsb, FMINSB) 1747 SSE_HELPER_L(helper_pminsd, FMINSD) 1748 SSE_HELPER_W(helper_pminuw, MIN) 1749 SSE_HELPER_L(helper_pminud, MIN) 1750 SSE_HELPER_B(helper_pmaxsb, FMAXSB) 1751 SSE_HELPER_L(helper_pmaxsd, FMAXSD) 1752 SSE_HELPER_W(helper_pmaxuw, MAX) 1753 SSE_HELPER_L(helper_pmaxud, MAX) 1754 1755 #define FMULLD(d, s) ((int32_t)d * (int32_t)s) 1756 SSE_HELPER_L(helper_pmulld, FMULLD) 1757 1758 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1759 { 1760 int idx = 0; 1761 1762 if (s->W(1) < s->W(idx)) { 1763 idx = 1; 1764 } 1765 if (s->W(2) < s->W(idx)) { 1766 idx = 2; 1767 } 1768 if (s->W(3) < s->W(idx)) { 1769 idx = 3; 1770 } 1771 if (s->W(4) < s->W(idx)) { 1772 idx = 4; 1773 } 1774 if (s->W(5) < s->W(idx)) { 1775 idx = 5; 1776 } 1777 if (s->W(6) < s->W(idx)) { 1778 idx = 6; 1779 } 1780 if (s->W(7) < s->W(idx)) { 1781 idx = 7; 1782 } 1783 1784 d->W(0) = s->W(idx); 1785 d->W(1) = idx; 1786 d->L(1) = 0; 1787 d->Q(1) = 0; 1788 } 1789 1790 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1791 uint32_t mode) 1792 { 1793 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1794 signed char prev_rounding_mode; 1795 1796 prev_rounding_mode = env->sse_status.float_rounding_mode; 1797 if (!(mode & (1 << 2))) { 1798 switch (mode & 3) { 1799 case 0: 1800 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1801 break; 1802 case 1: 1803 set_float_rounding_mode(float_round_down, &env->sse_status); 1804 break; 1805 case 2: 1806 set_float_rounding_mode(float_round_up, &env->sse_status); 1807 break; 1808 case 3: 1809 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1810 break; 1811 } 1812 } 1813 1814 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); 1815 d->ZMM_S(1) = float32_round_to_int(s->ZMM_S(1), &env->sse_status); 1816 d->ZMM_S(2) = float32_round_to_int(s->ZMM_S(2), &env->sse_status); 1817 d->ZMM_S(3) = float32_round_to_int(s->ZMM_S(3), &env->sse_status); 1818 1819 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1820 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1821 ~float_flag_inexact, 1822 &env->sse_status); 1823 } 1824 env->sse_status.float_rounding_mode = prev_rounding_mode; 1825 } 1826 1827 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1828 uint32_t mode) 1829 { 1830 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1831 signed char prev_rounding_mode; 1832 1833 prev_rounding_mode = env->sse_status.float_rounding_mode; 1834 if (!(mode & (1 << 2))) { 1835 switch (mode & 3) { 1836 case 0: 1837 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1838 break; 1839 case 1: 1840 set_float_rounding_mode(float_round_down, &env->sse_status); 1841 break; 1842 case 2: 1843 set_float_rounding_mode(float_round_up, &env->sse_status); 1844 break; 1845 case 3: 1846 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1847 break; 1848 } 1849 } 1850 1851 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); 1852 d->ZMM_D(1) = float64_round_to_int(s->ZMM_D(1), &env->sse_status); 1853 1854 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1855 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1856 ~float_flag_inexact, 1857 &env->sse_status); 1858 } 1859 env->sse_status.float_rounding_mode = prev_rounding_mode; 1860 } 1861 1862 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1863 uint32_t mode) 1864 { 1865 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1866 signed char prev_rounding_mode; 1867 1868 prev_rounding_mode = env->sse_status.float_rounding_mode; 1869 if (!(mode & (1 << 2))) { 1870 switch (mode & 3) { 1871 case 0: 1872 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1873 break; 1874 case 1: 1875 set_float_rounding_mode(float_round_down, &env->sse_status); 1876 break; 1877 case 2: 1878 set_float_rounding_mode(float_round_up, &env->sse_status); 1879 break; 1880 case 3: 1881 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1882 break; 1883 } 1884 } 1885 1886 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); 1887 1888 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1889 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1890 ~float_flag_inexact, 1891 &env->sse_status); 1892 } 1893 env->sse_status.float_rounding_mode = prev_rounding_mode; 1894 } 1895 1896 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1897 uint32_t mode) 1898 { 1899 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1900 signed char prev_rounding_mode; 1901 1902 prev_rounding_mode = env->sse_status.float_rounding_mode; 1903 if (!(mode & (1 << 2))) { 1904 switch (mode & 3) { 1905 case 0: 1906 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1907 break; 1908 case 1: 1909 set_float_rounding_mode(float_round_down, &env->sse_status); 1910 break; 1911 case 2: 1912 set_float_rounding_mode(float_round_up, &env->sse_status); 1913 break; 1914 case 3: 1915 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1916 break; 1917 } 1918 } 1919 1920 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); 1921 1922 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1923 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1924 ~float_flag_inexact, 1925 &env->sse_status); 1926 } 1927 env->sse_status.float_rounding_mode = prev_rounding_mode; 1928 } 1929 1930 #define FBLENDP(d, s, m) (m ? s : d) 1931 SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) 1932 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) 1933 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) 1934 1935 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) 1936 { 1937 float32 iresult = float32_zero; 1938 1939 if (mask & (1 << 4)) { 1940 iresult = float32_add(iresult, 1941 float32_mul(d->ZMM_S(0), s->ZMM_S(0), 1942 &env->sse_status), 1943 &env->sse_status); 1944 } 1945 if (mask & (1 << 5)) { 1946 iresult = float32_add(iresult, 1947 float32_mul(d->ZMM_S(1), s->ZMM_S(1), 1948 &env->sse_status), 1949 &env->sse_status); 1950 } 1951 if (mask & (1 << 6)) { 1952 iresult = float32_add(iresult, 1953 float32_mul(d->ZMM_S(2), s->ZMM_S(2), 1954 &env->sse_status), 1955 &env->sse_status); 1956 } 1957 if (mask & (1 << 7)) { 1958 iresult = float32_add(iresult, 1959 float32_mul(d->ZMM_S(3), s->ZMM_S(3), 1960 &env->sse_status), 1961 &env->sse_status); 1962 } 1963 d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero; 1964 d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero; 1965 d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero; 1966 d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero; 1967 } 1968 1969 void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) 1970 { 1971 float64 iresult = float64_zero; 1972 1973 if (mask & (1 << 4)) { 1974 iresult = float64_add(iresult, 1975 float64_mul(d->ZMM_D(0), s->ZMM_D(0), 1976 &env->sse_status), 1977 &env->sse_status); 1978 } 1979 if (mask & (1 << 5)) { 1980 iresult = float64_add(iresult, 1981 float64_mul(d->ZMM_D(1), s->ZMM_D(1), 1982 &env->sse_status), 1983 &env->sse_status); 1984 } 1985 d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero; 1986 d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero; 1987 } 1988 1989 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1990 uint32_t offset) 1991 { 1992 int s0 = (offset & 3) << 2; 1993 int d0 = (offset & 4) << 0; 1994 int i; 1995 Reg r; 1996 1997 for (i = 0; i < 8; i++, d0++) { 1998 r.W(i) = 0; 1999 r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); 2000 r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); 2001 r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); 2002 r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); 2003 } 2004 2005 MOVE(*d, r); 2006 } 2007 2008 /* SSE4.2 op helpers */ 2009 #define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0) 2010 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) 2011 2012 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) 2013 { 2014 int val; 2015 2016 /* Presence of REX.W is indicated by a bit higher than 7 set */ 2017 if (ctrl >> 8) { 2018 val = abs1((int64_t)env->regs[reg]); 2019 } else { 2020 val = abs1((int32_t)env->regs[reg]); 2021 } 2022 2023 if (ctrl & 1) { 2024 if (val > 8) { 2025 return 8; 2026 } 2027 } else { 2028 if (val > 16) { 2029 return 16; 2030 } 2031 } 2032 return val; 2033 } 2034 2035 static inline int pcmp_ilen(Reg *r, uint8_t ctrl) 2036 { 2037 int val = 0; 2038 2039 if (ctrl & 1) { 2040 while (val < 8 && r->W(val)) { 2041 val++; 2042 } 2043 } else { 2044 while (val < 16 && r->B(val)) { 2045 val++; 2046 } 2047 } 2048 2049 return val; 2050 } 2051 2052 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) 2053 { 2054 switch ((ctrl >> 0) & 3) { 2055 case 0: 2056 return r->B(i); 2057 case 1: 2058 return r->W(i); 2059 case 2: 2060 return (int8_t)r->B(i); 2061 case 3: 2062 default: 2063 return (int16_t)r->W(i); 2064 } 2065 } 2066 2067 static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s, 2068 int8_t ctrl, int valids, int validd) 2069 { 2070 unsigned int res = 0; 2071 int v; 2072 int j, i; 2073 int upper = (ctrl & 1) ? 7 : 15; 2074 2075 valids--; 2076 validd--; 2077 2078 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); 2079 2080 switch ((ctrl >> 2) & 3) { 2081 case 0: 2082 for (j = valids; j >= 0; j--) { 2083 res <<= 1; 2084 v = pcmp_val(s, ctrl, j); 2085 for (i = validd; i >= 0; i--) { 2086 res |= (v == pcmp_val(d, ctrl, i)); 2087 } 2088 } 2089 break; 2090 case 1: 2091 for (j = valids; j >= 0; j--) { 2092 res <<= 1; 2093 v = pcmp_val(s, ctrl, j); 2094 for (i = ((validd - 1) | 1); i >= 0; i -= 2) { 2095 res |= (pcmp_val(d, ctrl, i - 0) >= v && 2096 pcmp_val(d, ctrl, i - 1) <= v); 2097 } 2098 } 2099 break; 2100 case 2: 2101 res = (1 << (upper - MAX(valids, validd))) - 1; 2102 res <<= MAX(valids, validd) - MIN(valids, validd); 2103 for (i = MIN(valids, validd); i >= 0; i--) { 2104 res <<= 1; 2105 v = pcmp_val(s, ctrl, i); 2106 res |= (v == pcmp_val(d, ctrl, i)); 2107 } 2108 break; 2109 case 3: 2110 if (validd == -1) { 2111 res = (2 << upper) - 1; 2112 break; 2113 } 2114 for (j = valids == upper ? valids : valids - validd; j >= 0; j--) { 2115 res <<= 1; 2116 v = 1; 2117 for (i = MIN(valids - j, validd); i >= 0; i--) { 2118 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); 2119 } 2120 res |= v; 2121 } 2122 break; 2123 } 2124 2125 switch ((ctrl >> 4) & 3) { 2126 case 1: 2127 res ^= (2 << upper) - 1; 2128 break; 2129 case 3: 2130 res ^= (1 << (valids + 1)) - 1; 2131 break; 2132 } 2133 2134 if (res) { 2135 CC_SRC |= CC_C; 2136 } 2137 if (res & 1) { 2138 CC_SRC |= CC_O; 2139 } 2140 2141 return res; 2142 } 2143 2144 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2145 uint32_t ctrl) 2146 { 2147 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2148 pcmp_elen(env, R_EDX, ctrl), 2149 pcmp_elen(env, R_EAX, ctrl)); 2150 2151 if (res) { 2152 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res); 2153 } else { 2154 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2155 } 2156 } 2157 2158 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2159 uint32_t ctrl) 2160 { 2161 int i; 2162 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2163 pcmp_elen(env, R_EDX, ctrl), 2164 pcmp_elen(env, R_EAX, ctrl)); 2165 2166 if ((ctrl >> 6) & 1) { 2167 if (ctrl & 1) { 2168 for (i = 0; i < 8; i++, res >>= 1) { 2169 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2170 } 2171 } else { 2172 for (i = 0; i < 16; i++, res >>= 1) { 2173 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2174 } 2175 } 2176 } else { 2177 env->xmm_regs[0].Q(1) = 0; 2178 env->xmm_regs[0].Q(0) = res; 2179 } 2180 } 2181 2182 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2183 uint32_t ctrl) 2184 { 2185 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2186 pcmp_ilen(s, ctrl), 2187 pcmp_ilen(d, ctrl)); 2188 2189 if (res) { 2190 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res); 2191 } else { 2192 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2193 } 2194 } 2195 2196 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2197 uint32_t ctrl) 2198 { 2199 int i; 2200 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2201 pcmp_ilen(s, ctrl), 2202 pcmp_ilen(d, ctrl)); 2203 2204 if ((ctrl >> 6) & 1) { 2205 if (ctrl & 1) { 2206 for (i = 0; i < 8; i++, res >>= 1) { 2207 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2208 } 2209 } else { 2210 for (i = 0; i < 16; i++, res >>= 1) { 2211 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2212 } 2213 } 2214 } else { 2215 env->xmm_regs[0].Q(1) = 0; 2216 env->xmm_regs[0].Q(0) = res; 2217 } 2218 } 2219 2220 #define CRCPOLY 0x1edc6f41 2221 #define CRCPOLY_BITREV 0x82f63b78 2222 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) 2223 { 2224 target_ulong crc = (msg & ((target_ulong) -1 >> 2225 (TARGET_LONG_BITS - len))) ^ crc1; 2226 2227 while (len--) { 2228 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); 2229 } 2230 2231 return crc; 2232 } 2233 2234 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2235 uint32_t ctrl) 2236 { 2237 uint64_t ah, al, b, resh, resl; 2238 2239 ah = 0; 2240 al = d->Q((ctrl & 1) != 0); 2241 b = s->Q((ctrl & 16) != 0); 2242 resh = resl = 0; 2243 2244 while (b) { 2245 if (b & 1) { 2246 resl ^= al; 2247 resh ^= ah; 2248 } 2249 ah = (ah << 1) | (al >> 63); 2250 al <<= 1; 2251 b >>= 1; 2252 } 2253 2254 d->Q(0) = resl; 2255 d->Q(1) = resh; 2256 } 2257 2258 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2259 { 2260 int i; 2261 Reg st = *d; 2262 Reg rk = *s; 2263 2264 for (i = 0 ; i < 4 ; i++) { 2265 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^ 2266 AES_Td1[st.B(AES_ishifts[4*i+1])] ^ 2267 AES_Td2[st.B(AES_ishifts[4*i+2])] ^ 2268 AES_Td3[st.B(AES_ishifts[4*i+3])]); 2269 } 2270 } 2271 2272 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2273 { 2274 int i; 2275 Reg st = *d; 2276 Reg rk = *s; 2277 2278 for (i = 0; i < 16; i++) { 2279 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]); 2280 } 2281 } 2282 2283 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2284 { 2285 int i; 2286 Reg st = *d; 2287 Reg rk = *s; 2288 2289 for (i = 0 ; i < 4 ; i++) { 2290 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^ 2291 AES_Te1[st.B(AES_shifts[4*i+1])] ^ 2292 AES_Te2[st.B(AES_shifts[4*i+2])] ^ 2293 AES_Te3[st.B(AES_shifts[4*i+3])]); 2294 } 2295 } 2296 2297 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2298 { 2299 int i; 2300 Reg st = *d; 2301 Reg rk = *s; 2302 2303 for (i = 0; i < 16; i++) { 2304 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]); 2305 } 2306 2307 } 2308 2309 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2310 { 2311 int i; 2312 Reg tmp = *s; 2313 2314 for (i = 0 ; i < 4 ; i++) { 2315 d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^ 2316 AES_imc[tmp.B(4*i+1)][1] ^ 2317 AES_imc[tmp.B(4*i+2)][2] ^ 2318 AES_imc[tmp.B(4*i+3)][3]); 2319 } 2320 } 2321 2322 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2323 uint32_t ctrl) 2324 { 2325 int i; 2326 Reg tmp = *s; 2327 2328 for (i = 0 ; i < 4 ; i++) { 2329 d->B(i) = AES_sbox[tmp.B(i + 4)]; 2330 d->B(i + 8) = AES_sbox[tmp.B(i + 12)]; 2331 } 2332 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl; 2333 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; 2334 } 2335 #endif 2336 2337 #undef SHIFT 2338 #undef XMM_ONLY 2339 #undef Reg 2340 #undef B 2341 #undef W 2342 #undef L 2343 #undef Q 2344 #undef SUFFIX 2345 #undef SIZE 2346