1 /* 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support 3 * 4 * Copyright (c) 2005 Fabrice Bellard 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #include "crypto/aes.h" 22 23 #if SHIFT == 0 24 #define Reg MMXReg 25 #define XMM_ONLY(...) 26 #define B(n) MMX_B(n) 27 #define W(n) MMX_W(n) 28 #define L(n) MMX_L(n) 29 #define Q(n) MMX_Q(n) 30 #define SUFFIX _mmx 31 #else 32 #define Reg ZMMReg 33 #define XMM_ONLY(...) __VA_ARGS__ 34 #define B(n) ZMM_B(n) 35 #define W(n) ZMM_W(n) 36 #define L(n) ZMM_L(n) 37 #define Q(n) ZMM_Q(n) 38 #define SUFFIX _xmm 39 #endif 40 41 #define LANE_WIDTH (SHIFT ? 16 : 8) 42 #define PACK_WIDTH (LANE_WIDTH / 2) 43 44 #if SHIFT == 0 45 #define FPSRL(x, c) ((x) >> shift) 46 #define FPSRAW(x, c) ((int16_t)(x) >> shift) 47 #define FPSRAL(x, c) ((int32_t)(x) >> shift) 48 #define FPSLL(x, c) ((x) << shift) 49 #endif 50 51 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 52 { 53 Reg *s = d; 54 int shift; 55 if (c->Q(0) > 15) { 56 for (int i = 0; i < 1 << SHIFT; i++) { 57 d->Q(i) = 0; 58 } 59 } else { 60 shift = c->B(0); 61 for (int i = 0; i < 4 << SHIFT; i++) { 62 d->W(i) = FPSRL(s->W(i), shift); 63 } 64 } 65 } 66 67 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 68 { 69 Reg *s = d; 70 int shift; 71 if (c->Q(0) > 15) { 72 for (int i = 0; i < 1 << SHIFT; i++) { 73 d->Q(i) = 0; 74 } 75 } else { 76 shift = c->B(0); 77 for (int i = 0; i < 4 << SHIFT; i++) { 78 d->W(i) = FPSLL(s->W(i), shift); 79 } 80 } 81 } 82 83 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 84 { 85 Reg *s = d; 86 int shift; 87 if (c->Q(0) > 15) { 88 shift = 15; 89 } else { 90 shift = c->B(0); 91 } 92 for (int i = 0; i < 4 << SHIFT; i++) { 93 d->W(i) = FPSRAW(s->W(i), shift); 94 } 95 } 96 97 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 98 { 99 Reg *s = d; 100 int shift; 101 if (c->Q(0) > 31) { 102 for (int i = 0; i < 1 << SHIFT; i++) { 103 d->Q(i) = 0; 104 } 105 } else { 106 shift = c->B(0); 107 for (int i = 0; i < 2 << SHIFT; i++) { 108 d->L(i) = FPSRL(s->L(i), shift); 109 } 110 } 111 } 112 113 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 114 { 115 Reg *s = d; 116 int shift; 117 if (c->Q(0) > 31) { 118 for (int i = 0; i < 1 << SHIFT; i++) { 119 d->Q(i) = 0; 120 } 121 } else { 122 shift = c->B(0); 123 for (int i = 0; i < 2 << SHIFT; i++) { 124 d->L(i) = FPSLL(s->L(i), shift); 125 } 126 } 127 } 128 129 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 130 { 131 Reg *s = d; 132 int shift; 133 if (c->Q(0) > 31) { 134 shift = 31; 135 } else { 136 shift = c->B(0); 137 } 138 for (int i = 0; i < 2 << SHIFT; i++) { 139 d->L(i) = FPSRAL(s->L(i), shift); 140 } 141 } 142 143 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 144 { 145 Reg *s = d; 146 int shift; 147 if (c->Q(0) > 63) { 148 for (int i = 0; i < 1 << SHIFT; i++) { 149 d->Q(i) = 0; 150 } 151 } else { 152 shift = c->B(0); 153 for (int i = 0; i < 1 << SHIFT; i++) { 154 d->Q(i) = FPSRL(s->Q(i), shift); 155 } 156 } 157 } 158 159 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 160 { 161 Reg *s = d; 162 int shift; 163 if (c->Q(0) > 63) { 164 for (int i = 0; i < 1 << SHIFT; i++) { 165 d->Q(i) = 0; 166 } 167 } else { 168 shift = c->B(0); 169 for (int i = 0; i < 1 << SHIFT; i++) { 170 d->Q(i) = FPSLL(s->Q(i), shift); 171 } 172 } 173 } 174 175 #if SHIFT >= 1 176 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 177 { 178 Reg *s = d; 179 int shift, i, j; 180 181 shift = c->L(0); 182 if (shift > 16) { 183 shift = 16; 184 } 185 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { 186 for (i = 0; i < 16 - shift; i++) { 187 d->B(j + i) = s->B(j + i + shift); 188 } 189 for (i = 16 - shift; i < 16; i++) { 190 d->B(j + i) = 0; 191 } 192 } 193 } 194 195 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) 196 { 197 Reg *s = d; 198 int shift, i, j; 199 200 shift = c->L(0); 201 if (shift > 16) { 202 shift = 16; 203 } 204 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { 205 for (i = 15; i >= shift; i--) { 206 d->B(j + i) = s->B(j + i - shift); 207 } 208 for (i = 0; i < shift; i++) { 209 d->B(j + i) = 0; 210 } 211 } 212 } 213 #endif 214 215 #define SSE_HELPER_1(name, elem, num, F) \ 216 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 217 { \ 218 int n = num; \ 219 for (int i = 0; i < n; i++) { \ 220 d->elem(i) = F(s->elem(i)); \ 221 } \ 222 } 223 224 #define SSE_HELPER_2(name, elem, num, F) \ 225 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 226 { \ 227 Reg *v = d; \ 228 int n = num; \ 229 for (int i = 0; i < n; i++) { \ 230 d->elem(i) = F(v->elem(i), s->elem(i)); \ 231 } \ 232 } 233 234 #define SSE_HELPER_B(name, F) \ 235 SSE_HELPER_2(name, B, 8 << SHIFT, F) 236 237 #define SSE_HELPER_W(name, F) \ 238 SSE_HELPER_2(name, W, 4 << SHIFT, F) 239 240 #define SSE_HELPER_L(name, F) \ 241 SSE_HELPER_2(name, L, 2 << SHIFT, F) 242 243 #define SSE_HELPER_Q(name, F) \ 244 SSE_HELPER_2(name, Q, 1 << SHIFT, F) 245 246 #if SHIFT == 0 247 static inline int satub(int x) 248 { 249 if (x < 0) { 250 return 0; 251 } else if (x > 255) { 252 return 255; 253 } else { 254 return x; 255 } 256 } 257 258 static inline int satuw(int x) 259 { 260 if (x < 0) { 261 return 0; 262 } else if (x > 65535) { 263 return 65535; 264 } else { 265 return x; 266 } 267 } 268 269 static inline int satsb(int x) 270 { 271 if (x < -128) { 272 return -128; 273 } else if (x > 127) { 274 return 127; 275 } else { 276 return x; 277 } 278 } 279 280 static inline int satsw(int x) 281 { 282 if (x < -32768) { 283 return -32768; 284 } else if (x > 32767) { 285 return 32767; 286 } else { 287 return x; 288 } 289 } 290 291 #define FADD(a, b) ((a) + (b)) 292 #define FADDUB(a, b) satub((a) + (b)) 293 #define FADDUW(a, b) satuw((a) + (b)) 294 #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b)) 295 #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b)) 296 297 #define FSUB(a, b) ((a) - (b)) 298 #define FSUBUB(a, b) satub((a) - (b)) 299 #define FSUBUW(a, b) satuw((a) - (b)) 300 #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b)) 301 #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b)) 302 #define FMINUB(a, b) ((a) < (b)) ? (a) : (b) 303 #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b) 304 #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) 305 #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b) 306 307 #define FAND(a, b) ((a) & (b)) 308 #define FANDN(a, b) ((~(a)) & (b)) 309 #define FOR(a, b) ((a) | (b)) 310 #define FXOR(a, b) ((a) ^ (b)) 311 312 #define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0) 313 #define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0) 314 #define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0) 315 #define FCMPEQ(a, b) ((a) == (b) ? -1 : 0) 316 317 #define FMULLW(a, b) ((a) * (b)) 318 #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16) 319 #define FMULHUW(a, b) ((a) * (b) >> 16) 320 #define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16) 321 322 #define FAVG(a, b) (((a) + (b) + 1) >> 1) 323 #endif 324 325 SSE_HELPER_B(helper_paddb, FADD) 326 SSE_HELPER_W(helper_paddw, FADD) 327 SSE_HELPER_L(helper_paddl, FADD) 328 SSE_HELPER_Q(helper_paddq, FADD) 329 330 SSE_HELPER_B(helper_psubb, FSUB) 331 SSE_HELPER_W(helper_psubw, FSUB) 332 SSE_HELPER_L(helper_psubl, FSUB) 333 SSE_HELPER_Q(helper_psubq, FSUB) 334 335 SSE_HELPER_B(helper_paddusb, FADDUB) 336 SSE_HELPER_B(helper_paddsb, FADDSB) 337 SSE_HELPER_B(helper_psubusb, FSUBUB) 338 SSE_HELPER_B(helper_psubsb, FSUBSB) 339 340 SSE_HELPER_W(helper_paddusw, FADDUW) 341 SSE_HELPER_W(helper_paddsw, FADDSW) 342 SSE_HELPER_W(helper_psubusw, FSUBUW) 343 SSE_HELPER_W(helper_psubsw, FSUBSW) 344 345 SSE_HELPER_B(helper_pminub, FMINUB) 346 SSE_HELPER_B(helper_pmaxub, FMAXUB) 347 348 SSE_HELPER_W(helper_pminsw, FMINSW) 349 SSE_HELPER_W(helper_pmaxsw, FMAXSW) 350 351 SSE_HELPER_Q(helper_pand, FAND) 352 SSE_HELPER_Q(helper_pandn, FANDN) 353 SSE_HELPER_Q(helper_por, FOR) 354 SSE_HELPER_Q(helper_pxor, FXOR) 355 356 SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) 357 SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) 358 SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) 359 360 SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) 361 SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) 362 SSE_HELPER_L(helper_pcmpeql, FCMPEQ) 363 364 SSE_HELPER_W(helper_pmullw, FMULLW) 365 #if SHIFT == 0 366 SSE_HELPER_W(helper_pmulhrw, FMULHRW) 367 #endif 368 SSE_HELPER_W(helper_pmulhuw, FMULHUW) 369 SSE_HELPER_W(helper_pmulhw, FMULHW) 370 371 SSE_HELPER_B(helper_pavgb, FAVG) 372 SSE_HELPER_W(helper_pavgw, FAVG) 373 374 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 375 { 376 Reg *v = d; 377 int i; 378 379 for (i = 0; i < (1 << SHIFT); i++) { 380 d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2); 381 } 382 } 383 384 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 385 { 386 Reg *v = d; 387 int i; 388 389 for (i = 0; i < (2 << SHIFT); i++) { 390 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) + 391 (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1); 392 } 393 } 394 395 #if SHIFT == 0 396 static inline int abs1(int a) 397 { 398 if (a < 0) { 399 return -a; 400 } else { 401 return a; 402 } 403 } 404 #endif 405 406 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 407 { 408 Reg *v = d; 409 int i; 410 411 for (i = 0; i < (1 << SHIFT); i++) { 412 unsigned int val = 0; 413 val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0)); 414 val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1)); 415 val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2)); 416 val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3)); 417 val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4)); 418 val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5)); 419 val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6)); 420 val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7)); 421 d->Q(i) = val; 422 } 423 } 424 425 #if SHIFT < 2 426 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 427 target_ulong a0) 428 { 429 int i; 430 431 for (i = 0; i < (8 << SHIFT); i++) { 432 if (s->B(i) & 0x80) { 433 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC()); 434 } 435 } 436 } 437 #endif 438 439 void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) 440 { 441 int i; 442 443 d->L(0) = val; 444 d->L(1) = 0; 445 for (i = 1; i < (1 << SHIFT); i++) { 446 d->Q(i) = 0; 447 } 448 } 449 450 #ifdef TARGET_X86_64 451 void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) 452 { 453 int i; 454 455 d->Q(0) = val; 456 for (i = 1; i < (1 << SHIFT); i++) { 457 d->Q(i) = 0; 458 } 459 } 460 #endif 461 462 #define SHUFFLE4(F, a, b, offset) do { \ 463 r0 = a->F((order & 3) + offset); \ 464 r1 = a->F(((order >> 2) & 3) + offset); \ 465 r2 = b->F(((order >> 4) & 3) + offset); \ 466 r3 = b->F(((order >> 6) & 3) + offset); \ 467 d->F(offset) = r0; \ 468 d->F(offset + 1) = r1; \ 469 d->F(offset + 2) = r2; \ 470 d->F(offset + 3) = r3; \ 471 } while (0) 472 473 #if SHIFT == 0 474 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) 475 { 476 uint16_t r0, r1, r2, r3; 477 478 SHUFFLE4(W, s, s, 0); 479 } 480 #else 481 void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) 482 { 483 Reg *v = d; 484 uint32_t r0, r1, r2, r3; 485 int i; 486 487 for (i = 0; i < 2 << SHIFT; i += 4) { 488 SHUFFLE4(L, v, s, i); 489 } 490 } 491 492 void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order) 493 { 494 Reg *v = d; 495 uint64_t r0, r1; 496 int i; 497 498 for (i = 0; i < 1 << SHIFT; i += 2) { 499 r0 = v->Q(((order & 1) & 1) + i); 500 r1 = s->Q(((order >> 1) & 1) + i); 501 d->Q(i) = r0; 502 d->Q(i + 1) = r1; 503 order >>= 2; 504 } 505 } 506 507 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) 508 { 509 uint32_t r0, r1, r2, r3; 510 int i; 511 512 for (i = 0; i < 2 << SHIFT; i += 4) { 513 SHUFFLE4(L, s, s, i); 514 } 515 } 516 517 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) 518 { 519 uint16_t r0, r1, r2, r3; 520 int i, j; 521 522 for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) { 523 SHUFFLE4(W, s, s, i); 524 d->Q(j) = s->Q(j); 525 } 526 } 527 528 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) 529 { 530 uint16_t r0, r1, r2, r3; 531 int i, j; 532 533 for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) { 534 d->Q(j) = s->Q(j); 535 SHUFFLE4(W, s, s, i); 536 } 537 } 538 #endif 539 540 #if SHIFT >= 1 541 /* FPU ops */ 542 /* XXX: not accurate */ 543 544 #define SSE_HELPER_P(name, F) \ 545 void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ 546 Reg *d, Reg *s) \ 547 { \ 548 Reg *v = d; \ 549 int i; \ 550 for (i = 0; i < 2 << SHIFT; i++) { \ 551 d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i)); \ 552 } \ 553 } \ 554 \ 555 void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ 556 Reg *d, Reg *s) \ 557 { \ 558 Reg *v = d; \ 559 int i; \ 560 for (i = 0; i < 1 << SHIFT; i++) { \ 561 d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \ 562 } \ 563 } 564 565 #if SHIFT == 1 566 567 #define SSE_HELPER_S(name, F) \ 568 SSE_HELPER_P(name, F) \ 569 \ 570 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ 571 { \ 572 Reg *v = d; \ 573 d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ 574 } \ 575 \ 576 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ 577 { \ 578 Reg *v = d; \ 579 d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ 580 } 581 582 #else 583 584 #define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F) 585 586 #endif 587 588 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) 589 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) 590 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) 591 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) 592 593 /* Note that the choice of comparison op here is important to get the 594 * special cases right: for min and max Intel specifies that (-0,0), 595 * (NaN, anything) and (anything, NaN) return the second argument. 596 */ 597 #define FPU_MIN(size, a, b) \ 598 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b)) 599 #define FPU_MAX(size, a, b) \ 600 (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b)) 601 602 SSE_HELPER_S(add, FPU_ADD) 603 SSE_HELPER_S(sub, FPU_SUB) 604 SSE_HELPER_S(mul, FPU_MUL) 605 SSE_HELPER_S(div, FPU_DIV) 606 SSE_HELPER_S(min, FPU_MIN) 607 SSE_HELPER_S(max, FPU_MAX) 608 609 void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 610 { 611 int i; 612 for (i = 0; i < 2 << SHIFT; i++) { 613 d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status); 614 } 615 } 616 617 void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 618 { 619 int i; 620 for (i = 0; i < 1 << SHIFT; i++) { 621 d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status); 622 } 623 } 624 625 #if SHIFT == 1 626 void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s) 627 { 628 d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status); 629 } 630 631 void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s) 632 { 633 d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status); 634 } 635 #endif 636 637 /* float to float conversions */ 638 void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 639 { 640 int i; 641 for (i = 1 << SHIFT; --i >= 0; ) { 642 d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status); 643 } 644 } 645 646 void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 647 { 648 int i; 649 for (i = 0; i < 1 << SHIFT; i++) { 650 d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status); 651 } 652 for (i >>= 1; i < 1 << SHIFT; i++) { 653 d->Q(i) = 0; 654 } 655 } 656 657 #if SHIFT == 1 658 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) 659 { 660 d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status); 661 } 662 663 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) 664 { 665 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); 666 } 667 #endif 668 669 /* integer to float */ 670 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 671 { 672 int i; 673 for (i = 0; i < 2 << SHIFT; i++) { 674 d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status); 675 } 676 } 677 678 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 679 { 680 int i; 681 for (i = 1 << SHIFT; --i >= 0; ) { 682 int32_t l = s->ZMM_L(i); 683 d->ZMM_D(i) = int32_to_float64(l, &env->sse_status); 684 } 685 } 686 687 #if SHIFT == 1 688 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) 689 { 690 d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); 691 d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status); 692 } 693 694 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s) 695 { 696 d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status); 697 d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status); 698 } 699 700 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val) 701 { 702 d->ZMM_S(0) = int32_to_float32(val, &env->sse_status); 703 } 704 705 void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val) 706 { 707 d->ZMM_D(0) = int32_to_float64(val, &env->sse_status); 708 } 709 710 #ifdef TARGET_X86_64 711 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val) 712 { 713 d->ZMM_S(0) = int64_to_float32(val, &env->sse_status); 714 } 715 716 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val) 717 { 718 d->ZMM_D(0) = int64_to_float64(val, &env->sse_status); 719 } 720 #endif 721 722 #endif 723 724 /* float to integer */ 725 726 #if SHIFT == 1 727 /* 728 * x86 mandates that we return the indefinite integer value for the result 729 * of any float-to-integer conversion that raises the 'invalid' exception. 730 * Wrap the softfloat functions to get this behaviour. 731 */ 732 #define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE) \ 733 static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s) \ 734 { \ 735 int oldflags, newflags; \ 736 RETTYPE r; \ 737 \ 738 oldflags = get_float_exception_flags(s); \ 739 set_float_exception_flags(0, s); \ 740 r = FN(a, s); \ 741 newflags = get_float_exception_flags(s); \ 742 if (newflags & float_flag_invalid) { \ 743 r = INDEFVALUE; \ 744 } \ 745 set_float_exception_flags(newflags | oldflags, s); \ 746 return r; \ 747 } 748 749 WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN) 750 WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN) 751 WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN) 752 WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN) 753 WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN) 754 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) 755 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) 756 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) 757 #endif 758 759 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 760 { 761 int i; 762 for (i = 0; i < 2 << SHIFT; i++) { 763 d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status); 764 } 765 } 766 767 void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 768 { 769 int i; 770 for (i = 0; i < 1 << SHIFT; i++) { 771 d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status); 772 } 773 for (i >>= 1; i < 1 << SHIFT; i++) { 774 d->Q(i) = 0; 775 } 776 } 777 778 #if SHIFT == 1 779 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 780 { 781 d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 782 d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); 783 } 784 785 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 786 { 787 d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 788 d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); 789 } 790 791 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s) 792 { 793 return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 794 } 795 796 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s) 797 { 798 return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 799 } 800 801 #ifdef TARGET_X86_64 802 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s) 803 { 804 return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status); 805 } 806 807 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) 808 { 809 return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status); 810 } 811 #endif 812 #endif 813 814 /* float to integer truncated */ 815 void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 816 { 817 int i; 818 for (i = 0; i < 2 << SHIFT; i++) { 819 d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i), 820 &env->sse_status); 821 } 822 } 823 824 void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 825 { 826 int i; 827 for (i = 0; i < 1 << SHIFT; i++) { 828 d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i), 829 &env->sse_status); 830 } 831 for (i >>= 1; i < 1 << SHIFT; i++) { 832 d->Q(i) = 0; 833 } 834 } 835 836 #if SHIFT == 1 837 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 838 { 839 d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 840 d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status); 841 } 842 843 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 844 { 845 d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 846 d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status); 847 } 848 849 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s) 850 { 851 return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 852 } 853 854 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s) 855 { 856 return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 857 } 858 859 #ifdef TARGET_X86_64 860 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s) 861 { 862 return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status); 863 } 864 865 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) 866 { 867 return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status); 868 } 869 #endif 870 #endif 871 872 void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 873 { 874 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 875 int i; 876 for (i = 0; i < 2 << SHIFT; i++) { 877 d->ZMM_S(i) = float32_div(float32_one, 878 float32_sqrt(s->ZMM_S(i), &env->sse_status), 879 &env->sse_status); 880 } 881 set_float_exception_flags(old_flags, &env->sse_status); 882 } 883 884 #if SHIFT == 1 885 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) 886 { 887 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 888 d->ZMM_S(0) = float32_div(float32_one, 889 float32_sqrt(s->ZMM_S(0), &env->sse_status), 890 &env->sse_status); 891 set_float_exception_flags(old_flags, &env->sse_status); 892 } 893 #endif 894 895 void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 896 { 897 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 898 int i; 899 for (i = 0; i < 2 << SHIFT; i++) { 900 d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status); 901 } 902 set_float_exception_flags(old_flags, &env->sse_status); 903 } 904 905 #if SHIFT == 1 906 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s) 907 { 908 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 909 d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); 910 set_float_exception_flags(old_flags, &env->sse_status); 911 } 912 #endif 913 914 #if SHIFT == 1 915 static inline uint64_t helper_extrq(uint64_t src, int shift, int len) 916 { 917 uint64_t mask; 918 919 if (len == 0) { 920 mask = ~0LL; 921 } else { 922 mask = (1ULL << len) - 1; 923 } 924 return (src >> shift) & mask; 925 } 926 927 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 928 { 929 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63); 930 } 931 932 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length) 933 { 934 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length); 935 } 936 937 static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len) 938 { 939 uint64_t mask; 940 941 if (len == 0) { 942 mask = ~0ULL; 943 } else { 944 mask = (1ULL << len) - 1; 945 } 946 return (dest & ~(mask << shift)) | ((src & mask) << shift); 947 } 948 949 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 950 { 951 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63); 952 } 953 954 void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length) 955 { 956 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length); 957 } 958 #endif 959 960 #define SSE_HELPER_HPS(name, F) \ 961 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 962 { \ 963 Reg *v = d; \ 964 float32 r[2 << SHIFT]; \ 965 int i, j, k; \ 966 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ 967 for (i = j = 0; j < 4; i++, j += 2) { \ 968 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ 969 } \ 970 for (j = 0; j < 4; i++, j += 2) { \ 971 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ 972 } \ 973 } \ 974 for (i = 0; i < 2 << SHIFT; i++) { \ 975 d->ZMM_S(i) = r[i]; \ 976 } \ 977 } 978 979 SSE_HELPER_HPS(haddps, float32_add) 980 SSE_HELPER_HPS(hsubps, float32_sub) 981 982 #define SSE_HELPER_HPD(name, F) \ 983 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 984 { \ 985 Reg *v = d; \ 986 float64 r[1 << SHIFT]; \ 987 int i, j, k; \ 988 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ 989 for (i = j = 0; j < 2; i++, j += 2) { \ 990 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ 991 } \ 992 for (j = 0; j < 2; i++, j += 2) { \ 993 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ 994 } \ 995 } \ 996 for (i = 0; i < 1 << SHIFT; i++) { \ 997 d->ZMM_D(i) = r[i]; \ 998 } \ 999 } 1000 1001 SSE_HELPER_HPD(haddpd, float64_add) 1002 SSE_HELPER_HPD(hsubpd, float64_sub) 1003 1004 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1005 { 1006 Reg *v = d; 1007 int i; 1008 for (i = 0; i < 2 << SHIFT; i += 2) { 1009 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 1010 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 1011 } 1012 } 1013 1014 void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1015 { 1016 Reg *v = d; 1017 int i; 1018 for (i = 0; i < 1 << SHIFT; i += 2) { 1019 d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status); 1020 d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status); 1021 } 1022 } 1023 1024 #define SSE_HELPER_CMP_P(name, F, C) \ 1025 void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ 1026 Reg *d, Reg *s) \ 1027 { \ 1028 Reg *v = d; \ 1029 int i; \ 1030 for (i = 0; i < 2 << SHIFT; i++) { \ 1031 d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \ 1032 } \ 1033 } \ 1034 \ 1035 void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ 1036 Reg *d, Reg *s) \ 1037 { \ 1038 Reg *v = d; \ 1039 int i; \ 1040 for (i = 0; i < 1 << SHIFT; i++) { \ 1041 d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \ 1042 } \ 1043 } 1044 1045 #if SHIFT == 1 1046 #define SSE_HELPER_CMP(name, F, C) \ 1047 SSE_HELPER_CMP_P(name, F, C) \ 1048 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ 1049 { \ 1050 Reg *v = d; \ 1051 d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \ 1052 } \ 1053 \ 1054 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ 1055 { \ 1056 Reg *v = d; \ 1057 d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \ 1058 } 1059 1060 #define FPU_EQ(x) (x == float_relation_equal) 1061 #define FPU_LT(x) (x == float_relation_less) 1062 #define FPU_LE(x) (x <= float_relation_equal) 1063 #define FPU_UNORD(x) (x == float_relation_unordered) 1064 1065 #define FPU_CMPQ(size, a, b) \ 1066 float ## size ## _compare_quiet(a, b, &env->sse_status) 1067 #define FPU_CMPS(size, a, b) \ 1068 float ## size ## _compare(a, b, &env->sse_status) 1069 1070 #else 1071 #define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C) 1072 #endif 1073 1074 SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) 1075 SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) 1076 SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) 1077 SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) 1078 SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) 1079 SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) 1080 SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) 1081 SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) 1082 1083 #undef SSE_HELPER_CMP 1084 1085 #if SHIFT == 1 1086 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; 1087 1088 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s) 1089 { 1090 FloatRelation ret; 1091 float32 s0, s1; 1092 1093 s0 = d->ZMM_S(0); 1094 s1 = s->ZMM_S(0); 1095 ret = float32_compare_quiet(s0, s1, &env->sse_status); 1096 CC_SRC = comis_eflags[ret + 1]; 1097 } 1098 1099 void helper_comiss(CPUX86State *env, Reg *d, Reg *s) 1100 { 1101 FloatRelation ret; 1102 float32 s0, s1; 1103 1104 s0 = d->ZMM_S(0); 1105 s1 = s->ZMM_S(0); 1106 ret = float32_compare(s0, s1, &env->sse_status); 1107 CC_SRC = comis_eflags[ret + 1]; 1108 } 1109 1110 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s) 1111 { 1112 FloatRelation ret; 1113 float64 d0, d1; 1114 1115 d0 = d->ZMM_D(0); 1116 d1 = s->ZMM_D(0); 1117 ret = float64_compare_quiet(d0, d1, &env->sse_status); 1118 CC_SRC = comis_eflags[ret + 1]; 1119 } 1120 1121 void helper_comisd(CPUX86State *env, Reg *d, Reg *s) 1122 { 1123 FloatRelation ret; 1124 float64 d0, d1; 1125 1126 d0 = d->ZMM_D(0); 1127 d1 = s->ZMM_D(0); 1128 ret = float64_compare(d0, d1, &env->sse_status); 1129 CC_SRC = comis_eflags[ret + 1]; 1130 } 1131 #endif 1132 1133 uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s) 1134 { 1135 uint32_t mask; 1136 int i; 1137 1138 mask = 0; 1139 for (i = 0; i < 2 << SHIFT; i++) { 1140 mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i); 1141 } 1142 return mask; 1143 } 1144 1145 uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s) 1146 { 1147 uint32_t mask; 1148 int i; 1149 1150 mask = 0; 1151 for (i = 0; i < 1 << SHIFT; i++) { 1152 mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i); 1153 } 1154 return mask; 1155 } 1156 1157 #endif 1158 1159 uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) 1160 { 1161 uint32_t val; 1162 int i; 1163 1164 val = 0; 1165 for (i = 0; i < (1 << SHIFT); i++) { 1166 uint8_t byte = 0; 1167 byte |= (s->B(8 * i + 0) >> 7); 1168 byte |= (s->B(8 * i + 1) >> 6) & 0x02; 1169 byte |= (s->B(8 * i + 2) >> 5) & 0x04; 1170 byte |= (s->B(8 * i + 3) >> 4) & 0x08; 1171 byte |= (s->B(8 * i + 4) >> 3) & 0x10; 1172 byte |= (s->B(8 * i + 5) >> 2) & 0x20; 1173 byte |= (s->B(8 * i + 6) >> 1) & 0x40; 1174 byte |= (s->B(8 * i + 7)) & 0x80; 1175 val |= byte << (8 * i); 1176 } 1177 return val; 1178 } 1179 1180 #define PACK_HELPER_B(name, F) \ 1181 void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ 1182 Reg *d, Reg *s) \ 1183 { \ 1184 Reg *v = d; \ 1185 uint8_t r[PACK_WIDTH * 2]; \ 1186 int j, k; \ 1187 for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \ 1188 for (k = 0; k < PACK_WIDTH; k++) { \ 1189 r[k] = F((int16_t)v->W(j + k)); \ 1190 } \ 1191 for (k = 0; k < PACK_WIDTH; k++) { \ 1192 r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \ 1193 } \ 1194 for (k = 0; k < PACK_WIDTH * 2; k++) { \ 1195 d->B(2 * j + k) = r[k]; \ 1196 } \ 1197 } \ 1198 } 1199 1200 PACK_HELPER_B(sswb, satsb) 1201 PACK_HELPER_B(uswb, satub) 1202 1203 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1204 { 1205 Reg *v = d; 1206 uint16_t r[PACK_WIDTH]; 1207 int j, k; 1208 1209 for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) { 1210 for (k = 0; k < PACK_WIDTH / 2; k++) { 1211 r[k] = satsw(v->L(j + k)); 1212 } 1213 for (k = 0; k < PACK_WIDTH / 2; k++) { 1214 r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k)); 1215 } 1216 for (k = 0; k < PACK_WIDTH; k++) { 1217 d->W(2 * j + k) = r[k]; 1218 } 1219 } 1220 } 1221 1222 #define UNPCK_OP(base_name, base) \ 1223 \ 1224 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ 1225 Reg *d, Reg *s) \ 1226 { \ 1227 Reg *v = d; \ 1228 uint8_t r[PACK_WIDTH * 2]; \ 1229 int j, i; \ 1230 \ 1231 for (j = 0; j < 8 << SHIFT; ) { \ 1232 int k = j + base * PACK_WIDTH; \ 1233 for (i = 0; i < PACK_WIDTH; i++) { \ 1234 r[2 * i] = v->B(k + i); \ 1235 r[2 * i + 1] = s->B(k + i); \ 1236 } \ 1237 for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \ 1238 d->B(j) = r[i]; \ 1239 } \ 1240 } \ 1241 } \ 1242 \ 1243 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ 1244 Reg *d, Reg *s) \ 1245 { \ 1246 Reg *v = d; \ 1247 uint16_t r[PACK_WIDTH]; \ 1248 int j, i; \ 1249 \ 1250 for (j = 0; j < 4 << SHIFT; ) { \ 1251 int k = j + base * PACK_WIDTH / 2; \ 1252 for (i = 0; i < PACK_WIDTH / 2; i++) { \ 1253 r[2 * i] = v->W(k + i); \ 1254 r[2 * i + 1] = s->W(k + i); \ 1255 } \ 1256 for (i = 0; i < PACK_WIDTH; i++, j++) { \ 1257 d->W(j) = r[i]; \ 1258 } \ 1259 } \ 1260 } \ 1261 \ 1262 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ 1263 Reg *d, Reg *s) \ 1264 { \ 1265 Reg *v = d; \ 1266 uint32_t r[PACK_WIDTH / 2]; \ 1267 int j, i; \ 1268 \ 1269 for (j = 0; j < 2 << SHIFT; ) { \ 1270 int k = j + base * PACK_WIDTH / 4; \ 1271 for (i = 0; i < PACK_WIDTH / 4; i++) { \ 1272 r[2 * i] = v->L(k + i); \ 1273 r[2 * i + 1] = s->L(k + i); \ 1274 } \ 1275 for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \ 1276 d->L(j) = r[i]; \ 1277 } \ 1278 } \ 1279 } \ 1280 \ 1281 XMM_ONLY( \ 1282 void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \ 1283 CPUX86State *env, Reg *d, Reg *s) \ 1284 { \ 1285 Reg *v = d; \ 1286 uint64_t r[2]; \ 1287 int i; \ 1288 \ 1289 for (i = 0; i < 1 << SHIFT; i += 2) { \ 1290 r[0] = v->Q(base + i); \ 1291 r[1] = s->Q(base + i); \ 1292 d->Q(i) = r[0]; \ 1293 d->Q(i + 1) = r[1]; \ 1294 } \ 1295 } \ 1296 ) 1297 1298 UNPCK_OP(l, 0) 1299 UNPCK_OP(h, 1) 1300 1301 #undef PACK_WIDTH 1302 #undef PACK_HELPER_B 1303 #undef UNPCK_OP 1304 1305 1306 /* 3DNow! float ops */ 1307 #if SHIFT == 0 1308 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) 1309 { 1310 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status); 1311 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status); 1312 } 1313 1314 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s) 1315 { 1316 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status); 1317 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status); 1318 } 1319 1320 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s) 1321 { 1322 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status); 1323 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status); 1324 } 1325 1326 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s) 1327 { 1328 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), 1329 &env->mmx_status)); 1330 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), 1331 &env->mmx_status)); 1332 } 1333 1334 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1335 { 1336 float32 r; 1337 1338 r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1339 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1340 d->MMX_S(0) = r; 1341 } 1342 1343 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s) 1344 { 1345 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1346 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1347 } 1348 1349 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s) 1350 { 1351 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0), 1352 &env->mmx_status) ? -1 : 0; 1353 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1), 1354 &env->mmx_status) ? -1 : 0; 1355 } 1356 1357 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s) 1358 { 1359 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), 1360 &env->mmx_status) ? -1 : 0; 1361 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), 1362 &env->mmx_status) ? -1 : 0; 1363 } 1364 1365 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s) 1366 { 1367 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), 1368 &env->mmx_status) ? -1 : 0; 1369 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), 1370 &env->mmx_status) ? -1 : 0; 1371 } 1372 1373 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s) 1374 { 1375 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) { 1376 d->MMX_S(0) = s->MMX_S(0); 1377 } 1378 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) { 1379 d->MMX_S(1) = s->MMX_S(1); 1380 } 1381 } 1382 1383 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s) 1384 { 1385 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) { 1386 d->MMX_S(0) = s->MMX_S(0); 1387 } 1388 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) { 1389 d->MMX_S(1) = s->MMX_S(1); 1390 } 1391 } 1392 1393 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s) 1394 { 1395 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1396 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1397 } 1398 1399 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1400 { 1401 float32 r; 1402 1403 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1404 d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1405 d->MMX_S(0) = r; 1406 } 1407 1408 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1409 { 1410 float32 r; 1411 1412 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1413 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1414 d->MMX_S(0) = r; 1415 } 1416 1417 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s) 1418 { 1419 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status); 1420 d->MMX_S(1) = d->MMX_S(0); 1421 } 1422 1423 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s) 1424 { 1425 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff; 1426 d->MMX_S(1) = float32_div(float32_one, 1427 float32_sqrt(d->MMX_S(1), &env->mmx_status), 1428 &env->mmx_status); 1429 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000; 1430 d->MMX_L(0) = d->MMX_L(1); 1431 } 1432 1433 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s) 1434 { 1435 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1436 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1437 } 1438 1439 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s) 1440 { 1441 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status); 1442 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status); 1443 } 1444 1445 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s) 1446 { 1447 uint32_t r; 1448 1449 r = s->MMX_L(0); 1450 d->MMX_L(0) = s->MMX_L(1); 1451 d->MMX_L(1) = r; 1452 } 1453 #endif 1454 1455 /* SSSE3 op helpers */ 1456 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1457 { 1458 Reg *v = d; 1459 int i; 1460 #if SHIFT == 0 1461 uint8_t r[8]; 1462 1463 for (i = 0; i < 8; i++) { 1464 r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7)); 1465 } 1466 for (i = 0; i < 8; i++) { 1467 d->B(i) = r[i]; 1468 } 1469 #else 1470 uint8_t r[8 << SHIFT]; 1471 1472 for (i = 0; i < 8 << SHIFT; i++) { 1473 int j = i & ~0xf; 1474 r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf)); 1475 } 1476 for (i = 0; i < 8 << SHIFT; i++) { 1477 d->B(i) = r[i]; 1478 } 1479 #endif 1480 } 1481 1482 #define SSE_HELPER_HW(name, F) \ 1483 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1484 { \ 1485 Reg *v = d; \ 1486 uint16_t r[4 << SHIFT]; \ 1487 int i, j, k; \ 1488 for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \ 1489 for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ 1490 r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \ 1491 } \ 1492 for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ 1493 r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \ 1494 } \ 1495 } \ 1496 for (i = 0; i < 4 << SHIFT; i++) { \ 1497 d->W(i) = r[i]; \ 1498 } \ 1499 } 1500 1501 #define SSE_HELPER_HL(name, F) \ 1502 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1503 { \ 1504 Reg *v = d; \ 1505 uint32_t r[2 << SHIFT]; \ 1506 int i, j, k; \ 1507 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ 1508 for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ 1509 r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \ 1510 } \ 1511 for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ 1512 r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \ 1513 } \ 1514 } \ 1515 for (i = 0; i < 2 << SHIFT; i++) { \ 1516 d->L(i) = r[i]; \ 1517 } \ 1518 } 1519 1520 SSE_HELPER_HW(phaddw, FADD) 1521 SSE_HELPER_HW(phsubw, FSUB) 1522 SSE_HELPER_HW(phaddsw, FADDSW) 1523 SSE_HELPER_HW(phsubsw, FSUBSW) 1524 SSE_HELPER_HL(phaddd, FADD) 1525 SSE_HELPER_HL(phsubd, FSUB) 1526 1527 #undef SSE_HELPER_HW 1528 #undef SSE_HELPER_HL 1529 1530 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1531 { 1532 Reg *v = d; 1533 int i; 1534 for (i = 0; i < 4 << SHIFT; i++) { 1535 d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) + 1536 (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1)); 1537 } 1538 } 1539 1540 #define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x) 1541 #define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x) 1542 #define FABSL(x) (x > INT32_MAX ? -(int32_t)x : x) 1543 SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB) 1544 SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW) 1545 SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL) 1546 1547 #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) 1548 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) 1549 1550 #define FSIGNB(d, s) (s <= INT8_MAX ? s ? d : 0 : -(int8_t)d) 1551 #define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d) 1552 #define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d) 1553 SSE_HELPER_B(helper_psignb, FSIGNB) 1554 SSE_HELPER_W(helper_psignw, FSIGNW) 1555 SSE_HELPER_L(helper_psignd, FSIGNL) 1556 1557 void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1558 int32_t shift) 1559 { 1560 Reg *v = d; 1561 int i; 1562 1563 /* XXX could be checked during translation */ 1564 if (shift >= (SHIFT ? 32 : 16)) { 1565 for (i = 0; i < (1 << SHIFT); i++) { 1566 d->Q(i) = 0; 1567 } 1568 } else { 1569 shift <<= 3; 1570 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) 1571 #if SHIFT == 0 1572 d->Q(0) = SHR(s->Q(0), shift - 0) | 1573 SHR(v->Q(0), shift - 64); 1574 #else 1575 for (i = 0; i < (1 << SHIFT); i += 2) { 1576 uint64_t r0, r1; 1577 1578 r0 = SHR(s->Q(i), shift - 0) | 1579 SHR(s->Q(i + 1), shift - 64) | 1580 SHR(v->Q(i), shift - 128) | 1581 SHR(v->Q(i + 1), shift - 192); 1582 r1 = SHR(s->Q(i), shift + 64) | 1583 SHR(s->Q(i + 1), shift - 0) | 1584 SHR(v->Q(i), shift - 64) | 1585 SHR(v->Q(i + 1), shift - 128); 1586 d->Q(i) = r0; 1587 d->Q(i + 1) = r1; 1588 } 1589 #endif 1590 #undef SHR 1591 } 1592 } 1593 1594 #if SHIFT >= 1 1595 1596 #define SSE_HELPER_V(name, elem, num, F) \ 1597 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1598 { \ 1599 Reg *v = d; \ 1600 Reg *m = &env->xmm_regs[0]; \ 1601 int i; \ 1602 for (i = 0; i < num; i++) { \ 1603 d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \ 1604 } \ 1605 } 1606 1607 #define SSE_HELPER_I(name, elem, num, F) \ 1608 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, \ 1609 uint32_t imm) \ 1610 { \ 1611 Reg *v = d; \ 1612 int i; \ 1613 for (i = 0; i < num; i++) { \ 1614 int j = i & 7; \ 1615 d->elem(i) = F(v->elem(i), s->elem(i), (imm >> j) & 1); \ 1616 } \ 1617 } 1618 1619 /* SSE4.1 op helpers */ 1620 #define FBLENDVB(v, s, m) ((m & 0x80) ? s : v) 1621 #define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v) 1622 #define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v) 1623 SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB) 1624 SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS) 1625 SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD) 1626 1627 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1628 { 1629 uint64_t zf = 0, cf = 0; 1630 int i; 1631 1632 for (i = 0; i < 1 << SHIFT; i++) { 1633 zf |= (s->Q(i) & d->Q(i)); 1634 cf |= (s->Q(i) & ~d->Q(i)); 1635 } 1636 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); 1637 } 1638 1639 #define SSE_HELPER_F(name, elem, num, F) \ 1640 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1641 { \ 1642 int n = num; \ 1643 for (int i = n; --i >= 0; ) { \ 1644 d->elem(i) = F(i); \ 1645 } \ 1646 } 1647 1648 #if SHIFT > 0 1649 SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B) 1650 SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B) 1651 SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B) 1652 SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W) 1653 SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W) 1654 SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L) 1655 SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B) 1656 SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B) 1657 SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B) 1658 SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W) 1659 SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W) 1660 SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L) 1661 #endif 1662 1663 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1664 { 1665 Reg *v = d; 1666 int i; 1667 1668 for (i = 0; i < 1 << SHIFT; i++) { 1669 d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i); 1670 } 1671 } 1672 1673 #define FCMPEQQ(d, s) (d == s ? -1 : 0) 1674 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) 1675 1676 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1677 { 1678 Reg *v = d; 1679 uint16_t r[8]; 1680 int i, j, k; 1681 1682 for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) { 1683 r[0] = satuw(v->L(j)); 1684 r[1] = satuw(v->L(j + 1)); 1685 r[2] = satuw(v->L(j + 2)); 1686 r[3] = satuw(v->L(j + 3)); 1687 r[4] = satuw(s->L(j)); 1688 r[5] = satuw(s->L(j + 1)); 1689 r[6] = satuw(s->L(j + 2)); 1690 r[7] = satuw(s->L(j + 3)); 1691 for (k = 0; k < 8; k++) { 1692 d->W(i + k) = r[k]; 1693 } 1694 } 1695 } 1696 1697 #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) 1698 #define FMINSD(d, s) MIN((int32_t)d, (int32_t)s) 1699 #define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s) 1700 #define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s) 1701 SSE_HELPER_B(helper_pminsb, FMINSB) 1702 SSE_HELPER_L(helper_pminsd, FMINSD) 1703 SSE_HELPER_W(helper_pminuw, MIN) 1704 SSE_HELPER_L(helper_pminud, MIN) 1705 SSE_HELPER_B(helper_pmaxsb, FMAXSB) 1706 SSE_HELPER_L(helper_pmaxsd, FMAXSD) 1707 SSE_HELPER_W(helper_pmaxuw, MAX) 1708 SSE_HELPER_L(helper_pmaxud, MAX) 1709 1710 #define FMULLD(d, s) ((int32_t)d * (int32_t)s) 1711 SSE_HELPER_L(helper_pmulld, FMULLD) 1712 1713 #if SHIFT == 1 1714 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1715 { 1716 int idx = 0; 1717 1718 if (s->W(1) < s->W(idx)) { 1719 idx = 1; 1720 } 1721 if (s->W(2) < s->W(idx)) { 1722 idx = 2; 1723 } 1724 if (s->W(3) < s->W(idx)) { 1725 idx = 3; 1726 } 1727 if (s->W(4) < s->W(idx)) { 1728 idx = 4; 1729 } 1730 if (s->W(5) < s->W(idx)) { 1731 idx = 5; 1732 } 1733 if (s->W(6) < s->W(idx)) { 1734 idx = 6; 1735 } 1736 if (s->W(7) < s->W(idx)) { 1737 idx = 7; 1738 } 1739 1740 d->W(0) = s->W(idx); 1741 d->W(1) = idx; 1742 d->L(1) = 0; 1743 d->Q(1) = 0; 1744 } 1745 #endif 1746 1747 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1748 uint32_t mode) 1749 { 1750 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1751 signed char prev_rounding_mode; 1752 int i; 1753 1754 prev_rounding_mode = env->sse_status.float_rounding_mode; 1755 if (!(mode & (1 << 2))) { 1756 switch (mode & 3) { 1757 case 0: 1758 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1759 break; 1760 case 1: 1761 set_float_rounding_mode(float_round_down, &env->sse_status); 1762 break; 1763 case 2: 1764 set_float_rounding_mode(float_round_up, &env->sse_status); 1765 break; 1766 case 3: 1767 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1768 break; 1769 } 1770 } 1771 1772 for (i = 0; i < 2 << SHIFT; i++) { 1773 d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status); 1774 } 1775 1776 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1777 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1778 ~float_flag_inexact, 1779 &env->sse_status); 1780 } 1781 env->sse_status.float_rounding_mode = prev_rounding_mode; 1782 } 1783 1784 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1785 uint32_t mode) 1786 { 1787 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1788 signed char prev_rounding_mode; 1789 int i; 1790 1791 prev_rounding_mode = env->sse_status.float_rounding_mode; 1792 if (!(mode & (1 << 2))) { 1793 switch (mode & 3) { 1794 case 0: 1795 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1796 break; 1797 case 1: 1798 set_float_rounding_mode(float_round_down, &env->sse_status); 1799 break; 1800 case 2: 1801 set_float_rounding_mode(float_round_up, &env->sse_status); 1802 break; 1803 case 3: 1804 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1805 break; 1806 } 1807 } 1808 1809 for (i = 0; i < 1 << SHIFT; i++) { 1810 d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status); 1811 } 1812 1813 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1814 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1815 ~float_flag_inexact, 1816 &env->sse_status); 1817 } 1818 env->sse_status.float_rounding_mode = prev_rounding_mode; 1819 } 1820 1821 #if SHIFT == 1 1822 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1823 uint32_t mode) 1824 { 1825 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1826 signed char prev_rounding_mode; 1827 1828 prev_rounding_mode = env->sse_status.float_rounding_mode; 1829 if (!(mode & (1 << 2))) { 1830 switch (mode & 3) { 1831 case 0: 1832 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1833 break; 1834 case 1: 1835 set_float_rounding_mode(float_round_down, &env->sse_status); 1836 break; 1837 case 2: 1838 set_float_rounding_mode(float_round_up, &env->sse_status); 1839 break; 1840 case 3: 1841 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1842 break; 1843 } 1844 } 1845 1846 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); 1847 1848 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1849 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1850 ~float_flag_inexact, 1851 &env->sse_status); 1852 } 1853 env->sse_status.float_rounding_mode = prev_rounding_mode; 1854 } 1855 1856 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1857 uint32_t mode) 1858 { 1859 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1860 signed char prev_rounding_mode; 1861 1862 prev_rounding_mode = env->sse_status.float_rounding_mode; 1863 if (!(mode & (1 << 2))) { 1864 switch (mode & 3) { 1865 case 0: 1866 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); 1867 break; 1868 case 1: 1869 set_float_rounding_mode(float_round_down, &env->sse_status); 1870 break; 1871 case 2: 1872 set_float_rounding_mode(float_round_up, &env->sse_status); 1873 break; 1874 case 3: 1875 set_float_rounding_mode(float_round_to_zero, &env->sse_status); 1876 break; 1877 } 1878 } 1879 1880 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); 1881 1882 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1883 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1884 ~float_flag_inexact, 1885 &env->sse_status); 1886 } 1887 env->sse_status.float_rounding_mode = prev_rounding_mode; 1888 } 1889 #endif 1890 1891 #define FBLENDP(v, s, m) (m ? s : v) 1892 SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP) 1893 SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP) 1894 SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP) 1895 1896 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1897 uint32_t mask) 1898 { 1899 Reg *v = d; 1900 float32 prod1, prod2, temp2, temp3, temp4; 1901 int i; 1902 1903 for (i = 0; i < 2 << SHIFT; i += 4) { 1904 /* 1905 * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D 1906 * to correctly round the intermediate results 1907 */ 1908 if (mask & (1 << 4)) { 1909 prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 1910 } else { 1911 prod1 = float32_zero; 1912 } 1913 if (mask & (1 << 5)) { 1914 prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 1915 } else { 1916 prod2 = float32_zero; 1917 } 1918 temp2 = float32_add(prod1, prod2, &env->sse_status); 1919 if (mask & (1 << 6)) { 1920 prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status); 1921 } else { 1922 prod1 = float32_zero; 1923 } 1924 if (mask & (1 << 7)) { 1925 prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status); 1926 } else { 1927 prod2 = float32_zero; 1928 } 1929 temp3 = float32_add(prod1, prod2, &env->sse_status); 1930 temp4 = float32_add(temp2, temp3, &env->sse_status); 1931 1932 d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero; 1933 d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero; 1934 d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero; 1935 d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero; 1936 } 1937 } 1938 1939 #if SHIFT == 1 1940 /* Oddly, there is no ymm version of dppd */ 1941 void glue(helper_dppd, SUFFIX)(CPUX86State *env, 1942 Reg *d, Reg *s, uint32_t mask) 1943 { 1944 Reg *v = d; 1945 float64 prod1, prod2, temp2; 1946 1947 if (mask & (1 << 4)) { 1948 prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status); 1949 } else { 1950 prod1 = float64_zero; 1951 } 1952 if (mask & (1 << 5)) { 1953 prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status); 1954 } else { 1955 prod2 = float64_zero; 1956 } 1957 temp2 = float64_add(prod1, prod2, &env->sse_status); 1958 d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero; 1959 d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero; 1960 } 1961 #endif 1962 1963 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1964 uint32_t offset) 1965 { 1966 Reg *v = d; 1967 int i, j; 1968 uint16_t r[8]; 1969 1970 for (j = 0; j < 4 << SHIFT; ) { 1971 int s0 = (j * 2) + ((offset & 3) << 2); 1972 int d0 = (j * 2) + ((offset & 4) << 0); 1973 for (i = 0; i < LANE_WIDTH / 2; i++, d0++) { 1974 r[i] = 0; 1975 r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0)); 1976 r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1)); 1977 r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2)); 1978 r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3)); 1979 } 1980 for (i = 0; i < LANE_WIDTH / 2; i++, j++) { 1981 d->W(j) = r[i]; 1982 } 1983 offset >>= 3; 1984 } 1985 } 1986 1987 /* SSE4.2 op helpers */ 1988 #define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0) 1989 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) 1990 1991 #if SHIFT == 1 1992 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) 1993 { 1994 target_long val, limit; 1995 1996 /* Presence of REX.W is indicated by a bit higher than 7 set */ 1997 if (ctrl >> 8) { 1998 val = (target_long)env->regs[reg]; 1999 } else { 2000 val = (int32_t)env->regs[reg]; 2001 } 2002 if (ctrl & 1) { 2003 limit = 8; 2004 } else { 2005 limit = 16; 2006 } 2007 if ((val > limit) || (val < -limit)) { 2008 return limit; 2009 } 2010 return abs1(val); 2011 } 2012 2013 static inline int pcmp_ilen(Reg *r, uint8_t ctrl) 2014 { 2015 int val = 0; 2016 2017 if (ctrl & 1) { 2018 while (val < 8 && r->W(val)) { 2019 val++; 2020 } 2021 } else { 2022 while (val < 16 && r->B(val)) { 2023 val++; 2024 } 2025 } 2026 2027 return val; 2028 } 2029 2030 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) 2031 { 2032 switch ((ctrl >> 0) & 3) { 2033 case 0: 2034 return r->B(i); 2035 case 1: 2036 return r->W(i); 2037 case 2: 2038 return (int8_t)r->B(i); 2039 case 3: 2040 default: 2041 return (int16_t)r->W(i); 2042 } 2043 } 2044 2045 static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s, 2046 int8_t ctrl, int valids, int validd) 2047 { 2048 unsigned int res = 0; 2049 int v; 2050 int j, i; 2051 int upper = (ctrl & 1) ? 7 : 15; 2052 2053 valids--; 2054 validd--; 2055 2056 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); 2057 2058 switch ((ctrl >> 2) & 3) { 2059 case 0: 2060 for (j = valids; j >= 0; j--) { 2061 res <<= 1; 2062 v = pcmp_val(s, ctrl, j); 2063 for (i = validd; i >= 0; i--) { 2064 res |= (v == pcmp_val(d, ctrl, i)); 2065 } 2066 } 2067 break; 2068 case 1: 2069 for (j = valids; j >= 0; j--) { 2070 res <<= 1; 2071 v = pcmp_val(s, ctrl, j); 2072 for (i = ((validd - 1) | 1); i >= 0; i -= 2) { 2073 res |= (pcmp_val(d, ctrl, i - 0) >= v && 2074 pcmp_val(d, ctrl, i - 1) <= v); 2075 } 2076 } 2077 break; 2078 case 2: 2079 res = (1 << (upper - MAX(valids, validd))) - 1; 2080 res <<= MAX(valids, validd) - MIN(valids, validd); 2081 for (i = MIN(valids, validd); i >= 0; i--) { 2082 res <<= 1; 2083 v = pcmp_val(s, ctrl, i); 2084 res |= (v == pcmp_val(d, ctrl, i)); 2085 } 2086 break; 2087 case 3: 2088 if (validd == -1) { 2089 res = (2 << upper) - 1; 2090 break; 2091 } 2092 for (j = valids == upper ? valids : valids - validd; j >= 0; j--) { 2093 res <<= 1; 2094 v = 1; 2095 for (i = MIN(valids - j, validd); i >= 0; i--) { 2096 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); 2097 } 2098 res |= v; 2099 } 2100 break; 2101 } 2102 2103 switch ((ctrl >> 4) & 3) { 2104 case 1: 2105 res ^= (2 << upper) - 1; 2106 break; 2107 case 3: 2108 res ^= (1 << (valids + 1)) - 1; 2109 break; 2110 } 2111 2112 if (res) { 2113 CC_SRC |= CC_C; 2114 } 2115 if (res & 1) { 2116 CC_SRC |= CC_O; 2117 } 2118 2119 return res; 2120 } 2121 2122 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2123 uint32_t ctrl) 2124 { 2125 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2126 pcmp_elen(env, R_EDX, ctrl), 2127 pcmp_elen(env, R_EAX, ctrl)); 2128 2129 if (res) { 2130 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res); 2131 } else { 2132 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2133 } 2134 } 2135 2136 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2137 uint32_t ctrl) 2138 { 2139 int i; 2140 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2141 pcmp_elen(env, R_EDX, ctrl), 2142 pcmp_elen(env, R_EAX, ctrl)); 2143 2144 if ((ctrl >> 6) & 1) { 2145 if (ctrl & 1) { 2146 for (i = 0; i < 8; i++, res >>= 1) { 2147 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2148 } 2149 } else { 2150 for (i = 0; i < 16; i++, res >>= 1) { 2151 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2152 } 2153 } 2154 } else { 2155 env->xmm_regs[0].Q(1) = 0; 2156 env->xmm_regs[0].Q(0) = res; 2157 } 2158 } 2159 2160 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2161 uint32_t ctrl) 2162 { 2163 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2164 pcmp_ilen(s, ctrl), 2165 pcmp_ilen(d, ctrl)); 2166 2167 if (res) { 2168 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res); 2169 } else { 2170 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2171 } 2172 } 2173 2174 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2175 uint32_t ctrl) 2176 { 2177 int i; 2178 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2179 pcmp_ilen(s, ctrl), 2180 pcmp_ilen(d, ctrl)); 2181 2182 if ((ctrl >> 6) & 1) { 2183 if (ctrl & 1) { 2184 for (i = 0; i < 8; i++, res >>= 1) { 2185 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2186 } 2187 } else { 2188 for (i = 0; i < 16; i++, res >>= 1) { 2189 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2190 } 2191 } 2192 } else { 2193 env->xmm_regs[0].Q(1) = 0; 2194 env->xmm_regs[0].Q(0) = res; 2195 } 2196 } 2197 2198 #define CRCPOLY 0x1edc6f41 2199 #define CRCPOLY_BITREV 0x82f63b78 2200 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) 2201 { 2202 target_ulong crc = (msg & ((target_ulong) -1 >> 2203 (TARGET_LONG_BITS - len))) ^ crc1; 2204 2205 while (len--) { 2206 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); 2207 } 2208 2209 return crc; 2210 } 2211 2212 #endif 2213 2214 #if SHIFT == 1 2215 static void clmulq(uint64_t *dest_l, uint64_t *dest_h, 2216 uint64_t a, uint64_t b) 2217 { 2218 uint64_t al, ah, resh, resl; 2219 2220 ah = 0; 2221 al = a; 2222 resh = resl = 0; 2223 2224 while (b) { 2225 if (b & 1) { 2226 resl ^= al; 2227 resh ^= ah; 2228 } 2229 ah = (ah << 1) | (al >> 63); 2230 al <<= 1; 2231 b >>= 1; 2232 } 2233 2234 *dest_l = resl; 2235 *dest_h = resh; 2236 } 2237 #endif 2238 2239 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2240 uint32_t ctrl) 2241 { 2242 Reg *v = d; 2243 uint64_t a, b; 2244 int i; 2245 2246 for (i = 0; i < 1 << SHIFT; i += 2) { 2247 a = v->Q(((ctrl & 1) != 0) + i); 2248 b = s->Q(((ctrl & 16) != 0) + i); 2249 clmulq(&d->Q(i), &d->Q(i + 1), a, b); 2250 } 2251 } 2252 2253 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2254 { 2255 int i; 2256 Reg st = *d; 2257 Reg rk = *s; 2258 2259 for (i = 0 ; i < 2 << SHIFT ; i++) { 2260 int j = i & 3; 2261 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^ 2262 AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^ 2263 AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^ 2264 AES_Td3[st.B(AES_ishifts[4 * j + 3])]); 2265 } 2266 } 2267 2268 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2269 { 2270 int i; 2271 Reg st = *d; 2272 Reg rk = *s; 2273 2274 for (i = 0; i < 8 << SHIFT; i++) { 2275 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]); 2276 } 2277 } 2278 2279 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2280 { 2281 int i; 2282 Reg st = *d; 2283 Reg rk = *s; 2284 2285 for (i = 0 ; i < 2 << SHIFT ; i++) { 2286 int j = i & 3; 2287 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^ 2288 AES_Te1[st.B(AES_shifts[4 * j + 1])] ^ 2289 AES_Te2[st.B(AES_shifts[4 * j + 2])] ^ 2290 AES_Te3[st.B(AES_shifts[4 * j + 3])]); 2291 } 2292 } 2293 2294 void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2295 { 2296 int i; 2297 Reg st = *d; 2298 Reg rk = *s; 2299 2300 for (i = 0; i < 8 << SHIFT; i++) { 2301 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]); 2302 } 2303 } 2304 2305 #if SHIFT == 1 2306 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2307 { 2308 int i; 2309 Reg tmp = *s; 2310 2311 for (i = 0 ; i < 4 ; i++) { 2312 d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ 2313 AES_imc[tmp.B(4 * i + 1)][1] ^ 2314 AES_imc[tmp.B(4 * i + 2)][2] ^ 2315 AES_imc[tmp.B(4 * i + 3)][3]); 2316 } 2317 } 2318 2319 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2320 uint32_t ctrl) 2321 { 2322 int i; 2323 Reg tmp = *s; 2324 2325 for (i = 0 ; i < 4 ; i++) { 2326 d->B(i) = AES_sbox[tmp.B(i + 4)]; 2327 d->B(i + 8) = AES_sbox[tmp.B(i + 12)]; 2328 } 2329 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl; 2330 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; 2331 } 2332 #endif 2333 #endif 2334 2335 #undef SSE_HELPER_S 2336 2337 #undef SHIFT 2338 #undef XMM_ONLY 2339 #undef Reg 2340 #undef B 2341 #undef W 2342 #undef L 2343 #undef Q 2344 #undef SUFFIX 2345