1 /* 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support 3 * 4 * Copyright (c) 2005 Fabrice Bellard 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21 #include "crypto/aes.h" 22 23 #if SHIFT == 0 24 #define Reg MMXReg 25 #define XMM_ONLY(...) 26 #define B(n) MMX_B(n) 27 #define W(n) MMX_W(n) 28 #define L(n) MMX_L(n) 29 #define Q(n) MMX_Q(n) 30 #define SUFFIX _mmx 31 #else 32 #define Reg ZMMReg 33 #define XMM_ONLY(...) __VA_ARGS__ 34 #define B(n) ZMM_B(n) 35 #define W(n) ZMM_W(n) 36 #define L(n) ZMM_L(n) 37 #define Q(n) ZMM_Q(n) 38 #if SHIFT == 1 39 #define SUFFIX _xmm 40 #else 41 #define SUFFIX _ymm 42 #endif 43 #endif 44 45 #define LANE_WIDTH (SHIFT ? 
16 : 8) 46 #define PACK_WIDTH (LANE_WIDTH / 2) 47 48 #if SHIFT == 0 49 #define FPSRL(x, c) ((x) >> shift) 50 #define FPSRAW(x, c) ((int16_t)(x) >> shift) 51 #define FPSRAL(x, c) ((int32_t)(x) >> shift) 52 #define FPSLL(x, c) ((x) << shift) 53 #endif 54 55 void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 56 { 57 int shift; 58 if (c->Q(0) > 15) { 59 for (int i = 0; i < 1 << SHIFT; i++) { 60 d->Q(i) = 0; 61 } 62 } else { 63 shift = c->B(0); 64 for (int i = 0; i < 4 << SHIFT; i++) { 65 d->W(i) = FPSRL(s->W(i), shift); 66 } 67 } 68 } 69 70 void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 71 { 72 int shift; 73 if (c->Q(0) > 15) { 74 for (int i = 0; i < 1 << SHIFT; i++) { 75 d->Q(i) = 0; 76 } 77 } else { 78 shift = c->B(0); 79 for (int i = 0; i < 4 << SHIFT; i++) { 80 d->W(i) = FPSLL(s->W(i), shift); 81 } 82 } 83 } 84 85 void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 86 { 87 int shift; 88 if (c->Q(0) > 15) { 89 shift = 15; 90 } else { 91 shift = c->B(0); 92 } 93 for (int i = 0; i < 4 << SHIFT; i++) { 94 d->W(i) = FPSRAW(s->W(i), shift); 95 } 96 } 97 98 void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 99 { 100 int shift; 101 if (c->Q(0) > 31) { 102 for (int i = 0; i < 1 << SHIFT; i++) { 103 d->Q(i) = 0; 104 } 105 } else { 106 shift = c->B(0); 107 for (int i = 0; i < 2 << SHIFT; i++) { 108 d->L(i) = FPSRL(s->L(i), shift); 109 } 110 } 111 } 112 113 void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 114 { 115 int shift; 116 if (c->Q(0) > 31) { 117 for (int i = 0; i < 1 << SHIFT; i++) { 118 d->Q(i) = 0; 119 } 120 } else { 121 shift = c->B(0); 122 for (int i = 0; i < 2 << SHIFT; i++) { 123 d->L(i) = FPSLL(s->L(i), shift); 124 } 125 } 126 } 127 128 void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 129 { 130 int shift; 131 if (c->Q(0) > 31) { 132 shift = 31; 133 } else { 134 shift = c->B(0); 135 } 136 for (int i = 0; i < 2 << SHIFT; 
i++) { 137 d->L(i) = FPSRAL(s->L(i), shift); 138 } 139 } 140 141 void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 142 { 143 int shift; 144 if (c->Q(0) > 63) { 145 for (int i = 0; i < 1 << SHIFT; i++) { 146 d->Q(i) = 0; 147 } 148 } else { 149 shift = c->B(0); 150 for (int i = 0; i < 1 << SHIFT; i++) { 151 d->Q(i) = FPSRL(s->Q(i), shift); 152 } 153 } 154 } 155 156 void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 157 { 158 int shift; 159 if (c->Q(0) > 63) { 160 for (int i = 0; i < 1 << SHIFT; i++) { 161 d->Q(i) = 0; 162 } 163 } else { 164 shift = c->B(0); 165 for (int i = 0; i < 1 << SHIFT; i++) { 166 d->Q(i) = FPSLL(s->Q(i), shift); 167 } 168 } 169 } 170 171 #if SHIFT >= 1 172 void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 173 { 174 int shift, i, j; 175 176 shift = c->L(0); 177 if (shift > 16) { 178 shift = 16; 179 } 180 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { 181 for (i = 0; i < 16 - shift; i++) { 182 d->B(j + i) = s->B(j + i + shift); 183 } 184 for (i = 16 - shift; i < 16; i++) { 185 d->B(j + i) = 0; 186 } 187 } 188 } 189 190 void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c) 191 { 192 int shift, i, j; 193 194 shift = c->L(0); 195 if (shift > 16) { 196 shift = 16; 197 } 198 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { 199 for (i = 15; i >= shift; i--) { 200 d->B(j + i) = s->B(j + i - shift); 201 } 202 for (i = 0; i < shift; i++) { 203 d->B(j + i) = 0; 204 } 205 } 206 } 207 #endif 208 209 #define SSE_HELPER_1(name, elem, num, F) \ 210 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 211 { \ 212 int n = num; \ 213 for (int i = 0; i < n; i++) { \ 214 d->elem(i) = F(s->elem(i)); \ 215 } \ 216 } 217 218 #define SSE_HELPER_2(name, elem, num, F) \ 219 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 220 { \ 221 int n = num; \ 222 for (int i = 0; i < n; i++) { \ 223 d->elem(i) = F(v->elem(i), s->elem(i)); \ 224 } \ 225 } 226 227 
#define SSE_HELPER_B(name, F) \ 228 SSE_HELPER_2(name, B, 8 << SHIFT, F) 229 230 #define SSE_HELPER_W(name, F) \ 231 SSE_HELPER_2(name, W, 4 << SHIFT, F) 232 233 #define SSE_HELPER_L(name, F) \ 234 SSE_HELPER_2(name, L, 2 << SHIFT, F) 235 236 #define SSE_HELPER_Q(name, F) \ 237 SSE_HELPER_2(name, Q, 1 << SHIFT, F) 238 239 #if SHIFT == 0 240 static inline int satub(int x) 241 { 242 if (x < 0) { 243 return 0; 244 } else if (x > 255) { 245 return 255; 246 } else { 247 return x; 248 } 249 } 250 251 static inline int satuw(int x) 252 { 253 if (x < 0) { 254 return 0; 255 } else if (x > 65535) { 256 return 65535; 257 } else { 258 return x; 259 } 260 } 261 262 static inline int satsb(int x) 263 { 264 if (x < -128) { 265 return -128; 266 } else if (x > 127) { 267 return 127; 268 } else { 269 return x; 270 } 271 } 272 273 static inline int satsw(int x) 274 { 275 if (x < -32768) { 276 return -32768; 277 } else if (x > 32767) { 278 return 32767; 279 } else { 280 return x; 281 } 282 } 283 284 #define FADD(a, b) ((a) + (b)) 285 #define FADDUB(a, b) satub((a) + (b)) 286 #define FADDUW(a, b) satuw((a) + (b)) 287 #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b)) 288 #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b)) 289 290 #define FSUB(a, b) ((a) - (b)) 291 #define FSUBUB(a, b) satub((a) - (b)) 292 #define FSUBUW(a, b) satuw((a) - (b)) 293 #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b)) 294 #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b)) 295 #define FMINUB(a, b) ((a) < (b)) ? (a) : (b) 296 #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b) 297 #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b) 298 #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? 
(a) : (b) 299 300 #define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16) 301 #define FMULHUW(a, b) ((a) * (b) >> 16) 302 #define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16) 303 304 #define FAVG(a, b) (((a) + (b) + 1) >> 1) 305 #endif 306 307 SSE_HELPER_W(helper_pmulhuw, FMULHUW) 308 SSE_HELPER_W(helper_pmulhw, FMULHW) 309 310 #if SHIFT == 0 311 void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 312 { 313 d->W(0) = FMULHRW(d->W(0), s->W(0)); 314 d->W(1) = FMULHRW(d->W(1), s->W(1)); 315 d->W(2) = FMULHRW(d->W(2), s->W(2)); 316 d->W(3) = FMULHRW(d->W(3), s->W(3)); 317 } 318 #endif 319 320 SSE_HELPER_B(helper_pavgb, FAVG) 321 SSE_HELPER_W(helper_pavgw, FAVG) 322 323 void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 324 { 325 int i; 326 327 for (i = 0; i < (1 << SHIFT); i++) { 328 d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2); 329 } 330 } 331 332 void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 333 { 334 int i; 335 336 for (i = 0; i < (2 << SHIFT); i++) { 337 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) + 338 (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1); 339 } 340 } 341 342 #if SHIFT == 0 343 static inline int abs1(int a) 344 { 345 if (a < 0) { 346 return -a; 347 } else { 348 return a; 349 } 350 } 351 #endif 352 void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 353 { 354 int i; 355 356 for (i = 0; i < (1 << SHIFT); i++) { 357 unsigned int val = 0; 358 val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0)); 359 val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1)); 360 val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2)); 361 val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3)); 362 val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4)); 363 val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5)); 364 val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6)); 365 val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7)); 366 d->Q(i) = val; 367 } 368 } 369 370 #if SHIFT < 2 371 void 
glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 372 target_ulong a0) 373 { 374 int i; 375 376 for (i = 0; i < (8 << SHIFT); i++) { 377 if (s->B(i) & 0x80) { 378 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC()); 379 } 380 } 381 } 382 #endif 383 384 #define SHUFFLE4(F, a, b, offset) do { \ 385 r0 = a->F((order & 3) + offset); \ 386 r1 = a->F(((order >> 2) & 3) + offset); \ 387 r2 = b->F(((order >> 4) & 3) + offset); \ 388 r3 = b->F(((order >> 6) & 3) + offset); \ 389 d->F(offset) = r0; \ 390 d->F(offset + 1) = r1; \ 391 d->F(offset + 2) = r2; \ 392 d->F(offset + 3) = r3; \ 393 } while (0) 394 395 #if SHIFT == 0 396 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) 397 { 398 uint16_t r0, r1, r2, r3; 399 400 SHUFFLE4(W, s, s, 0); 401 } 402 #else 403 void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) 404 { 405 uint32_t r0, r1, r2, r3; 406 int i; 407 408 for (i = 0; i < 2 << SHIFT; i += 4) { 409 SHUFFLE4(L, v, s, i); 410 } 411 } 412 413 void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order) 414 { 415 uint64_t r0, r1; 416 int i; 417 418 for (i = 0; i < 1 << SHIFT; i += 2) { 419 r0 = v->Q(((order & 1) & 1) + i); 420 r1 = s->Q(((order >> 1) & 1) + i); 421 d->Q(i) = r0; 422 d->Q(i + 1) = r1; 423 order >>= 2; 424 } 425 } 426 427 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) 428 { 429 uint32_t r0, r1, r2, r3; 430 int i; 431 432 for (i = 0; i < 2 << SHIFT; i += 4) { 433 SHUFFLE4(L, s, s, i); 434 } 435 } 436 437 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) 438 { 439 uint16_t r0, r1, r2, r3; 440 int i, j; 441 442 for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) { 443 SHUFFLE4(W, s, s, i); 444 d->Q(j) = s->Q(j); 445 } 446 } 447 448 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) 449 { 450 uint16_t r0, r1, r2, r3; 451 int i, j; 452 453 for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) { 454 d->Q(j) = s->Q(j); 455 SHUFFLE4(W, s, s, i); 456 } 457 } 458 #endif 459 460 #if SHIFT >= 1 461 
/* FPU ops */ 462 /* XXX: not accurate */ 463 464 #define SSE_HELPER_P(name, F) \ 465 void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ 466 Reg *d, Reg *v, Reg *s) \ 467 { \ 468 int i; \ 469 for (i = 0; i < 2 << SHIFT; i++) { \ 470 d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i)); \ 471 } \ 472 } \ 473 \ 474 void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ 475 Reg *d, Reg *v, Reg *s) \ 476 { \ 477 int i; \ 478 for (i = 0; i < 1 << SHIFT; i++) { \ 479 d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \ 480 } \ 481 } 482 483 #if SHIFT == 1 484 485 #define SSE_HELPER_S(name, F) \ 486 SSE_HELPER_P(name, F) \ 487 \ 488 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ 489 { \ 490 int i; \ 491 d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ 492 for (i = 1; i < 2 << SHIFT; i++) { \ 493 d->ZMM_L(i) = v->ZMM_L(i); \ 494 } \ 495 } \ 496 \ 497 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\ 498 { \ 499 int i; \ 500 d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ 501 for (i = 1; i < 1 << SHIFT; i++) { \ 502 d->ZMM_Q(i) = v->ZMM_Q(i); \ 503 } \ 504 } 505 506 #else 507 508 #define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F) 509 510 #endif 511 512 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) 513 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) 514 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) 515 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) 516 517 /* Note that the choice of comparison op here is important to get the 518 * special cases right: for min and max Intel specifies that (-0,0), 519 * (NaN, anything) and (anything, NaN) return the second argument. 520 */ 521 #define FPU_MIN(size, a, b) \ 522 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b)) 523 #define FPU_MAX(size, a, b) \ 524 (float ## size ## _lt(b, a, &env->sse_status) ? 
(a) : (b)) 525 526 SSE_HELPER_S(add, FPU_ADD) 527 SSE_HELPER_S(sub, FPU_SUB) 528 SSE_HELPER_S(mul, FPU_MUL) 529 SSE_HELPER_S(div, FPU_DIV) 530 SSE_HELPER_S(min, FPU_MIN) 531 SSE_HELPER_S(max, FPU_MAX) 532 533 void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 534 { 535 int i; 536 for (i = 0; i < 2 << SHIFT; i++) { 537 d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status); 538 } 539 } 540 541 void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 542 { 543 int i; 544 for (i = 0; i < 1 << SHIFT; i++) { 545 d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status); 546 } 547 } 548 549 #if SHIFT == 1 550 void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s) 551 { 552 int i; 553 d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status); 554 for (i = 1; i < 2 << SHIFT; i++) { 555 d->ZMM_L(i) = v->ZMM_L(i); 556 } 557 } 558 559 void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s) 560 { 561 int i; 562 d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status); 563 for (i = 1; i < 1 << SHIFT; i++) { 564 d->ZMM_Q(i) = v->ZMM_Q(i); 565 } 566 } 567 #endif 568 569 /* float to float conversions */ 570 void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 571 { 572 int i; 573 for (i = 1 << SHIFT; --i >= 0; ) { 574 d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status); 575 } 576 } 577 578 void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 579 { 580 int i; 581 for (i = 0; i < 1 << SHIFT; i++) { 582 d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status); 583 } 584 for (i >>= 1; i < 1 << SHIFT; i++) { 585 d->Q(i) = 0; 586 } 587 } 588 589 #if SHIFT >= 1 590 void glue(helper_cvtph2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 591 { 592 int i; 593 594 for (i = 2 << SHIFT; --i >= 0; ) { 595 d->ZMM_S(i) = float16_to_float32(s->ZMM_H(i), true, &env->sse_status); 596 } 597 } 598 599 void glue(helper_cvtps2ph, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, int mode) 600 { 601 int i; 602 FloatRoundMode 
prev_rounding_mode = env->sse_status.float_rounding_mode; 603 if (!(mode & (1 << 2))) { 604 set_x86_rounding_mode(mode & 3, &env->sse_status); 605 } 606 607 for (i = 0; i < 2 << SHIFT; i++) { 608 d->ZMM_H(i) = float32_to_float16(s->ZMM_S(i), true, &env->sse_status); 609 } 610 for (i >>= 2; i < 1 << SHIFT; i++) { 611 d->Q(i) = 0; 612 } 613 614 env->sse_status.float_rounding_mode = prev_rounding_mode; 615 } 616 #endif 617 618 #if SHIFT == 1 619 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) 620 { 621 int i; 622 d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status); 623 for (i = 1; i < 1 << SHIFT; i++) { 624 d->ZMM_Q(i) = v->ZMM_Q(i); 625 } 626 } 627 628 void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) 629 { 630 int i; 631 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); 632 for (i = 1; i < 2 << SHIFT; i++) { 633 d->ZMM_L(i) = v->ZMM_L(i); 634 } 635 } 636 #endif 637 638 /* integer to float */ 639 void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 640 { 641 int i; 642 for (i = 0; i < 2 << SHIFT; i++) { 643 d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status); 644 } 645 } 646 647 void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 648 { 649 int i; 650 for (i = 1 << SHIFT; --i >= 0; ) { 651 int32_t l = s->ZMM_L(i); 652 d->ZMM_D(i) = int32_to_float64(l, &env->sse_status); 653 } 654 } 655 656 #if SHIFT == 1 657 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) 658 { 659 d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); 660 d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status); 661 } 662 663 void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s) 664 { 665 d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status); 666 d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status); 667 } 668 669 void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val) 670 { 671 d->ZMM_S(0) = int32_to_float32(val, &env->sse_status); 672 } 673 674 
void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val) 675 { 676 d->ZMM_D(0) = int32_to_float64(val, &env->sse_status); 677 } 678 679 #ifdef TARGET_X86_64 680 void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val) 681 { 682 d->ZMM_S(0) = int64_to_float32(val, &env->sse_status); 683 } 684 685 void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val) 686 { 687 d->ZMM_D(0) = int64_to_float64(val, &env->sse_status); 688 } 689 #endif 690 691 #endif 692 693 /* float to integer */ 694 695 #if SHIFT == 1 696 /* 697 * x86 mandates that we return the indefinite integer value for the result 698 * of any float-to-integer conversion that raises the 'invalid' exception. 699 * Wrap the softfloat functions to get this behaviour. 700 */ 701 #define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE) \ 702 static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s) \ 703 { \ 704 int oldflags, newflags; \ 705 RETTYPE r; \ 706 \ 707 oldflags = get_float_exception_flags(s); \ 708 set_float_exception_flags(0, s); \ 709 r = FN(a, s); \ 710 newflags = get_float_exception_flags(s); \ 711 if (newflags & float_flag_invalid) { \ 712 r = INDEFVALUE; \ 713 } \ 714 set_float_exception_flags(newflags | oldflags, s); \ 715 return r; \ 716 } 717 718 WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN) 719 WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN) 720 WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN) 721 WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN) 722 WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN) 723 WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) 724 WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) 725 WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) 726 #endif 727 728 void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 729 { 730 int i; 731 for (i = 0; i < 2 << SHIFT; i++) { 732 
d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status); 733 } 734 } 735 736 void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 737 { 738 int i; 739 for (i = 0; i < 1 << SHIFT; i++) { 740 d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status); 741 } 742 for (i >>= 1; i < 1 << SHIFT; i++) { 743 d->Q(i) = 0; 744 } 745 } 746 747 #if SHIFT == 1 748 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 749 { 750 d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 751 d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); 752 } 753 754 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 755 { 756 d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 757 d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); 758 } 759 760 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s) 761 { 762 return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); 763 } 764 765 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s) 766 { 767 return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); 768 } 769 770 #ifdef TARGET_X86_64 771 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s) 772 { 773 return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status); 774 } 775 776 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) 777 { 778 return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status); 779 } 780 #endif 781 #endif 782 783 /* float to integer truncated */ 784 void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 785 { 786 int i; 787 for (i = 0; i < 2 << SHIFT; i++) { 788 d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i), 789 &env->sse_status); 790 } 791 } 792 793 void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 794 { 795 int i; 796 for (i = 0; i < 1 << SHIFT; i++) { 797 d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i), 798 &env->sse_status); 799 } 800 for (i >>= 1; i < 1 << SHIFT; i++) { 801 d->Q(i) = 0; 802 
} 803 } 804 805 #if SHIFT == 1 806 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 807 { 808 d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 809 d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status); 810 } 811 812 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) 813 { 814 d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 815 d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status); 816 } 817 818 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s) 819 { 820 return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); 821 } 822 823 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s) 824 { 825 return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); 826 } 827 828 #ifdef TARGET_X86_64 829 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s) 830 { 831 return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status); 832 } 833 834 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) 835 { 836 return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status); 837 } 838 #endif 839 #endif 840 841 void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 842 { 843 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 844 int i; 845 for (i = 0; i < 2 << SHIFT; i++) { 846 d->ZMM_S(i) = float32_div(float32_one, 847 float32_sqrt(s->ZMM_S(i), &env->sse_status), 848 &env->sse_status); 849 } 850 set_float_exception_flags(old_flags, &env->sse_status); 851 } 852 853 #if SHIFT == 1 854 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) 855 { 856 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 857 int i; 858 d->ZMM_S(0) = float32_div(float32_one, 859 float32_sqrt(s->ZMM_S(0), &env->sse_status), 860 &env->sse_status); 861 set_float_exception_flags(old_flags, &env->sse_status); 862 for (i = 1; i < 2 << SHIFT; i++) { 863 d->ZMM_L(i) = 
v->ZMM_L(i); 864 } 865 } 866 #endif 867 868 void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 869 { 870 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 871 int i; 872 for (i = 0; i < 2 << SHIFT; i++) { 873 d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status); 874 } 875 set_float_exception_flags(old_flags, &env->sse_status); 876 } 877 878 #if SHIFT == 1 879 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s) 880 { 881 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 882 int i; 883 d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); 884 for (i = 1; i < 2 << SHIFT; i++) { 885 d->ZMM_L(i) = v->ZMM_L(i); 886 } 887 set_float_exception_flags(old_flags, &env->sse_status); 888 } 889 #endif 890 891 #if SHIFT == 1 892 static inline uint64_t helper_extrq(uint64_t src, int shift, int len) 893 { 894 uint64_t mask; 895 896 if (len == 0) { 897 mask = ~0LL; 898 } else { 899 mask = (1ULL << len) - 1; 900 } 901 return (src >> shift) & mask; 902 } 903 904 void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 905 { 906 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63); 907 } 908 909 void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length) 910 { 911 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length); 912 } 913 914 static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len) 915 { 916 uint64_t mask; 917 918 if (len == 0) { 919 mask = ~0ULL; 920 } else { 921 mask = (1ULL << len) - 1; 922 } 923 return (dest & ~(mask << shift)) | ((src & mask) << shift); 924 } 925 926 void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s) 927 { 928 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63); 929 } 930 931 void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length) 932 { 933 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length); 934 
} 935 #endif 936 937 #define SSE_HELPER_HPS(name, F) \ 938 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 939 { \ 940 float32 r[2 << SHIFT]; \ 941 int i, j, k; \ 942 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ 943 for (i = j = 0; j < 4; i++, j += 2) { \ 944 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ 945 } \ 946 for (j = 0; j < 4; i++, j += 2) { \ 947 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ 948 } \ 949 } \ 950 for (i = 0; i < 2 << SHIFT; i++) { \ 951 d->ZMM_S(i) = r[i]; \ 952 } \ 953 } 954 955 SSE_HELPER_HPS(haddps, float32_add) 956 SSE_HELPER_HPS(hsubps, float32_sub) 957 958 #define SSE_HELPER_HPD(name, F) \ 959 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 960 { \ 961 float64 r[1 << SHIFT]; \ 962 int i, j, k; \ 963 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ 964 for (i = j = 0; j < 2; i++, j += 2) { \ 965 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ 966 } \ 967 for (j = 0; j < 2; i++, j += 2) { \ 968 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ 969 } \ 970 } \ 971 for (i = 0; i < 1 << SHIFT; i++) { \ 972 d->ZMM_D(i) = r[i]; \ 973 } \ 974 } 975 976 SSE_HELPER_HPD(haddpd, float64_add) 977 SSE_HELPER_HPD(hsubpd, float64_sub) 978 979 void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 980 { 981 int i; 982 for (i = 0; i < 2 << SHIFT; i += 2) { 983 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 984 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 985 } 986 } 987 988 void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 989 { 990 int i; 991 for (i = 0; i < 1 << SHIFT; i += 2) { 992 d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status); 993 d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status); 994 } 995 } 996 997 #define 
SSE_HELPER_CMP_P(name, F, C) \ 998 void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ 999 Reg *d, Reg *v, Reg *s) \ 1000 { \ 1001 int i; \ 1002 for (i = 0; i < 2 << SHIFT; i++) { \ 1003 d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \ 1004 } \ 1005 } \ 1006 \ 1007 void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ 1008 Reg *d, Reg *v, Reg *s) \ 1009 { \ 1010 int i; \ 1011 for (i = 0; i < 1 << SHIFT; i++) { \ 1012 d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \ 1013 } \ 1014 } 1015 1016 #if SHIFT == 1 1017 #define SSE_HELPER_CMP(name, F, C) \ 1018 SSE_HELPER_CMP_P(name, F, C) \ 1019 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 1020 { \ 1021 int i; \ 1022 d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \ 1023 for (i = 1; i < 2 << SHIFT; i++) { \ 1024 d->ZMM_L(i) = v->ZMM_L(i); \ 1025 } \ 1026 } \ 1027 \ 1028 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 1029 { \ 1030 int i; \ 1031 d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? 
-1 : 0; \ 1032 for (i = 1; i < 1 << SHIFT; i++) { \ 1033 d->ZMM_Q(i) = v->ZMM_Q(i); \ 1034 } \ 1035 } 1036 1037 static inline bool FPU_EQU(FloatRelation x) 1038 { 1039 return (x == float_relation_equal || x == float_relation_unordered); 1040 } 1041 static inline bool FPU_GE(FloatRelation x) 1042 { 1043 return (x == float_relation_equal || x == float_relation_greater); 1044 } 1045 #define FPU_EQ(x) (x == float_relation_equal) 1046 #define FPU_LT(x) (x == float_relation_less) 1047 #define FPU_LE(x) (x <= float_relation_equal) 1048 #define FPU_GT(x) (x == float_relation_greater) 1049 #define FPU_UNORD(x) (x == float_relation_unordered) 1050 /* We must make sure we evaluate the argument in case it is a signalling NAN */ 1051 #define FPU_FALSE(x) (x == float_relation_equal && 0) 1052 1053 #define FPU_CMPQ(size, a, b) \ 1054 float ## size ## _compare_quiet(a, b, &env->sse_status) 1055 #define FPU_CMPS(size, a, b) \ 1056 float ## size ## _compare(a, b, &env->sse_status) 1057 1058 #else 1059 #define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C) 1060 #endif 1061 1062 SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) 1063 SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) 1064 SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) 1065 SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) 1066 SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) 1067 SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) 1068 SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) 1069 SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) 1070 1071 SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU) 1072 SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE) 1073 SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT) 1074 SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE) 1075 SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU) 1076 SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE) 1077 SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT) 1078 SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE) 1079 1080 SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ) 1081 SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT) 1082 SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE) 1083 
SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD) 1084 SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ) 1085 SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT) 1086 SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE) 1087 SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD) 1088 1089 SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU) 1090 SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE) 1091 SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT) 1092 SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE) 1093 SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU) 1094 SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE) 1095 SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT) 1096 SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE) 1097 1098 #undef SSE_HELPER_CMP 1099 1100 #if SHIFT == 1 1101 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; 1102 1103 void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s) 1104 { 1105 FloatRelation ret; 1106 float32 s0, s1; 1107 1108 s0 = d->ZMM_S(0); 1109 s1 = s->ZMM_S(0); 1110 ret = float32_compare_quiet(s0, s1, &env->sse_status); 1111 CC_SRC = comis_eflags[ret + 1]; 1112 } 1113 1114 void helper_comiss(CPUX86State *env, Reg *d, Reg *s) 1115 { 1116 FloatRelation ret; 1117 float32 s0, s1; 1118 1119 s0 = d->ZMM_S(0); 1120 s1 = s->ZMM_S(0); 1121 ret = float32_compare(s0, s1, &env->sse_status); 1122 CC_SRC = comis_eflags[ret + 1]; 1123 } 1124 1125 void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s) 1126 { 1127 FloatRelation ret; 1128 float64 d0, d1; 1129 1130 d0 = d->ZMM_D(0); 1131 d1 = s->ZMM_D(0); 1132 ret = float64_compare_quiet(d0, d1, &env->sse_status); 1133 CC_SRC = comis_eflags[ret + 1]; 1134 } 1135 1136 void helper_comisd(CPUX86State *env, Reg *d, Reg *s) 1137 { 1138 FloatRelation ret; 1139 float64 d0, d1; 1140 1141 d0 = d->ZMM_D(0); 1142 d1 = s->ZMM_D(0); 1143 ret = float64_compare(d0, d1, &env->sse_status); 1144 CC_SRC = comis_eflags[ret + 1]; 1145 } 1146 #endif 1147 1148 uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s) 1149 { 1150 uint32_t mask; 1151 int i; 1152 1153 mask = 0; 1154 for (i 
= 0; i < 2 << SHIFT; i++) { 1155 mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i); 1156 } 1157 return mask; 1158 } 1159 1160 uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s) 1161 { 1162 uint32_t mask; 1163 int i; 1164 1165 mask = 0; 1166 for (i = 0; i < 1 << SHIFT; i++) { 1167 mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i); 1168 } 1169 return mask; 1170 } 1171 1172 #endif 1173 1174 #define PACK_HELPER_B(name, F) \ 1175 void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ 1176 Reg *d, Reg *v, Reg *s) \ 1177 { \ 1178 uint8_t r[PACK_WIDTH * 2]; \ 1179 int j, k; \ 1180 for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \ 1181 for (k = 0; k < PACK_WIDTH; k++) { \ 1182 r[k] = F((int16_t)v->W(j + k)); \ 1183 } \ 1184 for (k = 0; k < PACK_WIDTH; k++) { \ 1185 r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \ 1186 } \ 1187 for (k = 0; k < PACK_WIDTH * 2; k++) { \ 1188 d->B(2 * j + k) = r[k]; \ 1189 } \ 1190 } \ 1191 } 1192 1193 PACK_HELPER_B(sswb, satsb) 1194 PACK_HELPER_B(uswb, satub) 1195 1196 void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 1197 { 1198 uint16_t r[PACK_WIDTH]; 1199 int j, k; 1200 1201 for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) { 1202 for (k = 0; k < PACK_WIDTH / 2; k++) { 1203 r[k] = satsw(v->L(j + k)); 1204 } 1205 for (k = 0; k < PACK_WIDTH / 2; k++) { 1206 r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k)); 1207 } 1208 for (k = 0; k < PACK_WIDTH; k++) { 1209 d->W(2 * j + k) = r[k]; 1210 } 1211 } 1212 } 1213 1214 #define UNPCK_OP(base_name, base) \ 1215 \ 1216 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ 1217 Reg *d, Reg *v, Reg *s) \ 1218 { \ 1219 uint8_t r[PACK_WIDTH * 2]; \ 1220 int j, i; \ 1221 \ 1222 for (j = 0; j < 8 << SHIFT; ) { \ 1223 int k = j + base * PACK_WIDTH; \ 1224 for (i = 0; i < PACK_WIDTH; i++) { \ 1225 r[2 * i] = v->B(k + i); \ 1226 r[2 * i + 1] = s->B(k + i); \ 1227 } \ 1228 for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \ 1229 d->B(j) = r[i]; \ 1230 } \ 1231 } \ 1232 } \ 1233 
\ 1234 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ 1235 Reg *d, Reg *v, Reg *s) \ 1236 { \ 1237 uint16_t r[PACK_WIDTH]; \ 1238 int j, i; \ 1239 \ 1240 for (j = 0; j < 4 << SHIFT; ) { \ 1241 int k = j + base * PACK_WIDTH / 2; \ 1242 for (i = 0; i < PACK_WIDTH / 2; i++) { \ 1243 r[2 * i] = v->W(k + i); \ 1244 r[2 * i + 1] = s->W(k + i); \ 1245 } \ 1246 for (i = 0; i < PACK_WIDTH; i++, j++) { \ 1247 d->W(j) = r[i]; \ 1248 } \ 1249 } \ 1250 } \ 1251 \ 1252 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ 1253 Reg *d, Reg *v, Reg *s) \ 1254 { \ 1255 uint32_t r[PACK_WIDTH / 2]; \ 1256 int j, i; \ 1257 \ 1258 for (j = 0; j < 2 << SHIFT; ) { \ 1259 int k = j + base * PACK_WIDTH / 4; \ 1260 for (i = 0; i < PACK_WIDTH / 4; i++) { \ 1261 r[2 * i] = v->L(k + i); \ 1262 r[2 * i + 1] = s->L(k + i); \ 1263 } \ 1264 for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \ 1265 d->L(j) = r[i]; \ 1266 } \ 1267 } \ 1268 } \ 1269 \ 1270 XMM_ONLY( \ 1271 void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \ 1272 CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 1273 { \ 1274 uint64_t r[2]; \ 1275 int i; \ 1276 \ 1277 for (i = 0; i < 1 << SHIFT; i += 2) { \ 1278 r[0] = v->Q(base + i); \ 1279 r[1] = s->Q(base + i); \ 1280 d->Q(i) = r[0]; \ 1281 d->Q(i + 1) = r[1]; \ 1282 } \ 1283 } \ 1284 ) 1285 1286 UNPCK_OP(l, 0) 1287 UNPCK_OP(h, 1) 1288 1289 #undef PACK_WIDTH 1290 #undef PACK_HELPER_B 1291 #undef UNPCK_OP 1292 1293 1294 /* 3DNow! 
float ops */ 1295 #if SHIFT == 0 1296 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) 1297 { 1298 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status); 1299 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status); 1300 } 1301 1302 void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s) 1303 { 1304 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status); 1305 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status); 1306 } 1307 1308 void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s) 1309 { 1310 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status); 1311 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status); 1312 } 1313 1314 void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s) 1315 { 1316 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), 1317 &env->mmx_status)); 1318 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), 1319 &env->mmx_status)); 1320 } 1321 1322 void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1323 { 1324 float32 r; 1325 1326 r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1327 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1328 d->MMX_S(0) = r; 1329 } 1330 1331 void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s) 1332 { 1333 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1334 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1335 } 1336 1337 void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s) 1338 { 1339 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0), 1340 &env->mmx_status) ? -1 : 0; 1341 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1), 1342 &env->mmx_status) ? -1 : 0; 1343 } 1344 1345 void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s) 1346 { 1347 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), 1348 &env->mmx_status) ? 
-1 : 0; 1349 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), 1350 &env->mmx_status) ? -1 : 0; 1351 } 1352 1353 void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s) 1354 { 1355 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), 1356 &env->mmx_status) ? -1 : 0; 1357 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), 1358 &env->mmx_status) ? -1 : 0; 1359 } 1360 1361 void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s) 1362 { 1363 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) { 1364 d->MMX_S(0) = s->MMX_S(0); 1365 } 1366 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) { 1367 d->MMX_S(1) = s->MMX_S(1); 1368 } 1369 } 1370 1371 void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s) 1372 { 1373 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) { 1374 d->MMX_S(0) = s->MMX_S(0); 1375 } 1376 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) { 1377 d->MMX_S(1) = s->MMX_S(1); 1378 } 1379 } 1380 1381 void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s) 1382 { 1383 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1384 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1385 } 1386 1387 void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1388 { 1389 float32 r; 1390 1391 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1392 d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1393 d->MMX_S(0) = r; 1394 } 1395 1396 void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s) 1397 { 1398 float32 r; 1399 1400 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); 1401 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); 1402 d->MMX_S(0) = r; 1403 } 1404 1405 void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s) 1406 { 1407 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status); 1408 d->MMX_S(1) = d->MMX_S(0); 1409 } 1410 1411 void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s) 1412 { 1413 
d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff; 1414 d->MMX_S(1) = float32_div(float32_one, 1415 float32_sqrt(d->MMX_S(1), &env->mmx_status), 1416 &env->mmx_status); 1417 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000; 1418 d->MMX_L(0) = d->MMX_L(1); 1419 } 1420 1421 void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s) 1422 { 1423 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); 1424 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); 1425 } 1426 1427 void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s) 1428 { 1429 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status); 1430 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status); 1431 } 1432 1433 void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s) 1434 { 1435 uint32_t r; 1436 1437 r = s->MMX_L(0); 1438 d->MMX_L(0) = s->MMX_L(1); 1439 d->MMX_L(1) = r; 1440 } 1441 #endif 1442 1443 /* SSSE3 op helpers */ 1444 void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 1445 { 1446 int i; 1447 #if SHIFT == 0 1448 uint8_t r[8]; 1449 1450 for (i = 0; i < 8; i++) { 1451 r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7)); 1452 } 1453 for (i = 0; i < 8; i++) { 1454 d->B(i) = r[i]; 1455 } 1456 #else 1457 uint8_t r[8 << SHIFT]; 1458 1459 for (i = 0; i < 8 << SHIFT; i++) { 1460 int j = i & ~0xf; 1461 r[i] = (s->B(i) & 0x80) ? 
0 : v->B(j | (s->B(i) & 0xf)); 1462 } 1463 for (i = 0; i < 8 << SHIFT; i++) { 1464 d->B(i) = r[i]; 1465 } 1466 #endif 1467 } 1468 1469 #define SSE_HELPER_HW(name, F) \ 1470 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 1471 { \ 1472 uint16_t r[4 << SHIFT]; \ 1473 int i, j, k; \ 1474 for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \ 1475 for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ 1476 r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \ 1477 } \ 1478 for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ 1479 r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \ 1480 } \ 1481 } \ 1482 for (i = 0; i < 4 << SHIFT; i++) { \ 1483 d->W(i) = r[i]; \ 1484 } \ 1485 } 1486 1487 #define SSE_HELPER_HL(name, F) \ 1488 void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \ 1489 { \ 1490 uint32_t r[2 << SHIFT]; \ 1491 int i, j, k; \ 1492 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ 1493 for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ 1494 r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \ 1495 } \ 1496 for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ 1497 r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \ 1498 } \ 1499 } \ 1500 for (i = 0; i < 2 << SHIFT; i++) { \ 1501 d->L(i) = r[i]; \ 1502 } \ 1503 } 1504 1505 SSE_HELPER_HW(phaddw, FADD) 1506 SSE_HELPER_HW(phsubw, FSUB) 1507 SSE_HELPER_HW(phaddsw, FADDSW) 1508 SSE_HELPER_HW(phsubsw, FSUBSW) 1509 SSE_HELPER_HL(phaddd, FADD) 1510 SSE_HELPER_HL(phsubd, FSUB) 1511 1512 #undef SSE_HELPER_HW 1513 #undef SSE_HELPER_HL 1514 1515 void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 1516 { 1517 int i; 1518 for (i = 0; i < 4 << SHIFT; i++) { 1519 d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) + 1520 (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1)); 1521 } 1522 } 1523 1524 #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) 1525 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) 1526 1527 #define FSIGNB(d, s) (s <= INT8_MAX ? s ? 
d : 0 : -(int8_t)d) 1528 #define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d) 1529 #define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d) 1530 SSE_HELPER_B(helper_psignb, FSIGNB) 1531 SSE_HELPER_W(helper_psignw, FSIGNW) 1532 SSE_HELPER_L(helper_psignd, FSIGNL) 1533 1534 void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 1535 uint32_t imm) 1536 { 1537 int i; 1538 1539 /* XXX could be checked during translation */ 1540 if (imm >= (SHIFT ? 32 : 16)) { 1541 for (i = 0; i < (1 << SHIFT); i++) { 1542 d->Q(i) = 0; 1543 } 1544 } else { 1545 int shift = imm * 8; 1546 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) 1547 #if SHIFT == 0 1548 d->Q(0) = SHR(s->Q(0), shift - 0) | 1549 SHR(v->Q(0), shift - 64); 1550 #else 1551 for (i = 0; i < (1 << SHIFT); i += 2) { 1552 uint64_t r0, r1; 1553 1554 r0 = SHR(s->Q(i), shift - 0) | 1555 SHR(s->Q(i + 1), shift - 64) | 1556 SHR(v->Q(i), shift - 128) | 1557 SHR(v->Q(i + 1), shift - 192); 1558 r1 = SHR(s->Q(i), shift + 64) | 1559 SHR(s->Q(i + 1), shift - 0) | 1560 SHR(v->Q(i), shift - 64) | 1561 SHR(v->Q(i + 1), shift - 128); 1562 d->Q(i) = r0; 1563 d->Q(i + 1) = r1; 1564 } 1565 #endif 1566 #undef SHR 1567 } 1568 } 1569 1570 #if SHIFT >= 1 1571 1572 #define SSE_HELPER_V(name, elem, num, F) \ 1573 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ 1574 Reg *m) \ 1575 { \ 1576 int i; \ 1577 for (i = 0; i < num; i++) { \ 1578 d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \ 1579 } \ 1580 } 1581 1582 #define SSE_HELPER_I(name, elem, num, F) \ 1583 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \ 1584 uint32_t imm) \ 1585 { \ 1586 int i; \ 1587 for (i = 0; i < num; i++) { \ 1588 int j = i & 7; \ 1589 d->elem(i) = F(v->elem(i), s->elem(i), (imm >> j) & 1); \ 1590 } \ 1591 } 1592 1593 /* SSE4.1 op helpers */ 1594 #define FBLENDVB(v, s, m) ((m & 0x80) ? s : v) 1595 #define FBLENDVPS(v, s, m) ((m & 0x80000000) ? 
s : v) 1596 #define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v) 1597 SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB) 1598 SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS) 1599 SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD) 1600 1601 void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1602 { 1603 uint64_t zf = 0, cf = 0; 1604 int i; 1605 1606 for (i = 0; i < 1 << SHIFT; i++) { 1607 zf |= (s->Q(i) & d->Q(i)); 1608 cf |= (s->Q(i) & ~d->Q(i)); 1609 } 1610 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); 1611 } 1612 1613 #define FMOVSLDUP(i) s->L((i) & ~1) 1614 #define FMOVSHDUP(i) s->L((i) | 1) 1615 #define FMOVDLDUP(i) s->Q((i) & ~1) 1616 1617 #define SSE_HELPER_F(name, elem, num, F) \ 1618 void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 1619 { \ 1620 int n = num; \ 1621 for (int i = n; --i >= 0; ) { \ 1622 d->elem(i) = F(i); \ 1623 } \ 1624 } 1625 1626 #if SHIFT > 0 1627 SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B) 1628 SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B) 1629 SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B) 1630 SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W) 1631 SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W) 1632 SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L) 1633 SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B) 1634 SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B) 1635 SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B) 1636 SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W) 1637 SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W) 1638 SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L) 1639 SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP) 1640 SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP) 1641 SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP) 1642 #endif 1643 1644 void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 1645 { 1646 int i; 1647 1648 for (i = 0; i < 1 << 
SHIFT; i++) { 1649 d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i); 1650 } 1651 } 1652 1653 void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 1654 { 1655 uint16_t r[8]; 1656 int i, j, k; 1657 1658 for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) { 1659 r[0] = satuw(v->L(j)); 1660 r[1] = satuw(v->L(j + 1)); 1661 r[2] = satuw(v->L(j + 2)); 1662 r[3] = satuw(v->L(j + 3)); 1663 r[4] = satuw(s->L(j)); 1664 r[5] = satuw(s->L(j + 1)); 1665 r[6] = satuw(s->L(j + 2)); 1666 r[7] = satuw(s->L(j + 3)); 1667 for (k = 0; k < 8; k++) { 1668 d->W(i + k) = r[k]; 1669 } 1670 } 1671 } 1672 1673 #if SHIFT == 1 1674 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 1675 { 1676 int idx = 0; 1677 1678 if (s->W(1) < s->W(idx)) { 1679 idx = 1; 1680 } 1681 if (s->W(2) < s->W(idx)) { 1682 idx = 2; 1683 } 1684 if (s->W(3) < s->W(idx)) { 1685 idx = 3; 1686 } 1687 if (s->W(4) < s->W(idx)) { 1688 idx = 4; 1689 } 1690 if (s->W(5) < s->W(idx)) { 1691 idx = 5; 1692 } 1693 if (s->W(6) < s->W(idx)) { 1694 idx = 6; 1695 } 1696 if (s->W(7) < s->W(idx)) { 1697 idx = 7; 1698 } 1699 1700 d->W(0) = s->W(idx); 1701 d->W(1) = idx; 1702 d->L(1) = 0; 1703 d->Q(1) = 0; 1704 } 1705 #endif 1706 1707 void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1708 uint32_t mode) 1709 { 1710 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1711 signed char prev_rounding_mode; 1712 int i; 1713 1714 prev_rounding_mode = env->sse_status.float_rounding_mode; 1715 if (!(mode & (1 << 2))) { 1716 set_x86_rounding_mode(mode & 3, &env->sse_status); 1717 } 1718 1719 for (i = 0; i < 2 << SHIFT; i++) { 1720 d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status); 1721 } 1722 1723 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1724 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1725 ~float_flag_inexact, 1726 &env->sse_status); 1727 } 1728 env->sse_status.float_rounding_mode = 
prev_rounding_mode; 1729 } 1730 1731 void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 1732 uint32_t mode) 1733 { 1734 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1735 signed char prev_rounding_mode; 1736 int i; 1737 1738 prev_rounding_mode = env->sse_status.float_rounding_mode; 1739 if (!(mode & (1 << 2))) { 1740 set_x86_rounding_mode(mode & 3, &env->sse_status); 1741 } 1742 1743 for (i = 0; i < 1 << SHIFT; i++) { 1744 d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status); 1745 } 1746 1747 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1748 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1749 ~float_flag_inexact, 1750 &env->sse_status); 1751 } 1752 env->sse_status.float_rounding_mode = prev_rounding_mode; 1753 } 1754 1755 #if SHIFT == 1 1756 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 1757 uint32_t mode) 1758 { 1759 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1760 signed char prev_rounding_mode; 1761 int i; 1762 1763 prev_rounding_mode = env->sse_status.float_rounding_mode; 1764 if (!(mode & (1 << 2))) { 1765 set_x86_rounding_mode(mode & 3, &env->sse_status); 1766 } 1767 1768 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); 1769 for (i = 1; i < 2 << SHIFT; i++) { 1770 d->ZMM_L(i) = v->ZMM_L(i); 1771 } 1772 1773 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1774 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1775 ~float_flag_inexact, 1776 &env->sse_status); 1777 } 1778 env->sse_status.float_rounding_mode = prev_rounding_mode; 1779 } 1780 1781 void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 1782 uint32_t mode) 1783 { 1784 uint8_t old_flags = get_float_exception_flags(&env->sse_status); 1785 signed char prev_rounding_mode; 1786 int i; 1787 1788 prev_rounding_mode = env->sse_status.float_rounding_mode; 1789 if (!(mode & (1 << 2))) { 1790 
set_x86_rounding_mode(mode & 3, &env->sse_status); 1791 } 1792 1793 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); 1794 for (i = 1; i < 1 << SHIFT; i++) { 1795 d->ZMM_Q(i) = v->ZMM_Q(i); 1796 } 1797 1798 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { 1799 set_float_exception_flags(get_float_exception_flags(&env->sse_status) & 1800 ~float_flag_inexact, 1801 &env->sse_status); 1802 } 1803 env->sse_status.float_rounding_mode = prev_rounding_mode; 1804 } 1805 #endif 1806 1807 #define FBLENDP(v, s, m) (m ? s : v) 1808 SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP) 1809 SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP) 1810 SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP) 1811 1812 void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 1813 uint32_t mask) 1814 { 1815 float32 prod1, prod2, temp2, temp3, temp4; 1816 int i; 1817 1818 for (i = 0; i < 2 << SHIFT; i += 4) { 1819 /* 1820 * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D 1821 * to correctly round the intermediate results 1822 */ 1823 if (mask & (1 << 4)) { 1824 prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 1825 } else { 1826 prod1 = float32_zero; 1827 } 1828 if (mask & (1 << 5)) { 1829 prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 1830 } else { 1831 prod2 = float32_zero; 1832 } 1833 temp2 = float32_add(prod1, prod2, &env->sse_status); 1834 if (mask & (1 << 6)) { 1835 prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status); 1836 } else { 1837 prod1 = float32_zero; 1838 } 1839 if (mask & (1 << 7)) { 1840 prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status); 1841 } else { 1842 prod2 = float32_zero; 1843 } 1844 temp3 = float32_add(prod1, prod2, &env->sse_status); 1845 temp4 = float32_add(temp2, temp3, &env->sse_status); 1846 1847 d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero; 1848 d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero; 1849 d->ZMM_S(i+2) = (mask & (1 << 2)) ? 
temp4 : float32_zero; 1850 d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero; 1851 } 1852 } 1853 1854 #if SHIFT == 1 1855 /* Oddly, there is no ymm version of dppd */ 1856 void glue(helper_dppd, SUFFIX)(CPUX86State *env, 1857 Reg *d, Reg *v, Reg *s, uint32_t mask) 1858 { 1859 float64 prod1, prod2, temp2; 1860 1861 if (mask & (1 << 4)) { 1862 prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status); 1863 } else { 1864 prod1 = float64_zero; 1865 } 1866 if (mask & (1 << 5)) { 1867 prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status); 1868 } else { 1869 prod2 = float64_zero; 1870 } 1871 temp2 = float64_add(prod1, prod2, &env->sse_status); 1872 d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero; 1873 d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero; 1874 } 1875 #endif 1876 1877 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 1878 uint32_t offset) 1879 { 1880 int i, j; 1881 uint16_t r[8]; 1882 1883 for (j = 0; j < 4 << SHIFT; ) { 1884 int s0 = (j * 2) + ((offset & 3) << 2); 1885 int d0 = (j * 2) + ((offset & 4) << 0); 1886 for (i = 0; i < LANE_WIDTH / 2; i++, d0++) { 1887 r[i] = 0; 1888 r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0)); 1889 r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1)); 1890 r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2)); 1891 r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3)); 1892 } 1893 for (i = 0; i < LANE_WIDTH / 2; i++, j++) { 1894 d->W(j) = r[i]; 1895 } 1896 offset >>= 3; 1897 } 1898 } 1899 1900 /* SSE4.2 op helpers */ 1901 #if SHIFT == 1 1902 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) 1903 { 1904 target_long val, limit; 1905 1906 /* Presence of REX.W is indicated by a bit higher than 7 set */ 1907 if (ctrl >> 8) { 1908 val = (target_long)env->regs[reg]; 1909 } else { 1910 val = (int32_t)env->regs[reg]; 1911 } 1912 if (ctrl & 1) { 1913 limit = 8; 1914 } else { 1915 limit = 16; 1916 } 1917 if ((val > limit) || (val < -limit)) { 1918 return limit; 1919 } 1920 return abs1(val); 1921 } 
1922 1923 static inline int pcmp_ilen(Reg *r, uint8_t ctrl) 1924 { 1925 int val = 0; 1926 1927 if (ctrl & 1) { 1928 while (val < 8 && r->W(val)) { 1929 val++; 1930 } 1931 } else { 1932 while (val < 16 && r->B(val)) { 1933 val++; 1934 } 1935 } 1936 1937 return val; 1938 } 1939 1940 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) 1941 { 1942 switch ((ctrl >> 0) & 3) { 1943 case 0: 1944 return r->B(i); 1945 case 1: 1946 return r->W(i); 1947 case 2: 1948 return (int8_t)r->B(i); 1949 case 3: 1950 default: 1951 return (int16_t)r->W(i); 1952 } 1953 } 1954 1955 static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s, 1956 uint8_t ctrl, int valids, int validd) 1957 { 1958 unsigned int res = 0; 1959 int v; 1960 int j, i; 1961 int upper = (ctrl & 1) ? 7 : 15; 1962 1963 valids--; 1964 validd--; 1965 1966 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); 1967 1968 switch ((ctrl >> 2) & 3) { 1969 case 0: 1970 for (j = valids; j >= 0; j--) { 1971 res <<= 1; 1972 v = pcmp_val(s, ctrl, j); 1973 for (i = validd; i >= 0; i--) { 1974 res |= (v == pcmp_val(d, ctrl, i)); 1975 } 1976 } 1977 break; 1978 case 1: 1979 for (j = valids; j >= 0; j--) { 1980 res <<= 1; 1981 v = pcmp_val(s, ctrl, j); 1982 for (i = ((validd - 1) | 1); i >= 0; i -= 2) { 1983 res |= (pcmp_val(d, ctrl, i - 0) >= v && 1984 pcmp_val(d, ctrl, i - 1) <= v); 1985 } 1986 } 1987 break; 1988 case 2: 1989 res = (1 << (upper - MAX(valids, validd))) - 1; 1990 res <<= MAX(valids, validd) - MIN(valids, validd); 1991 for (i = MIN(valids, validd); i >= 0; i--) { 1992 res <<= 1; 1993 v = pcmp_val(s, ctrl, i); 1994 res |= (v == pcmp_val(d, ctrl, i)); 1995 } 1996 break; 1997 case 3: 1998 if (validd == -1) { 1999 res = (2 << upper) - 1; 2000 break; 2001 } 2002 for (j = valids == upper ? 
valids : valids - validd; j >= 0; j--) { 2003 res <<= 1; 2004 v = 1; 2005 for (i = MIN(valids - j, validd); i >= 0; i--) { 2006 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); 2007 } 2008 res |= v; 2009 } 2010 break; 2011 } 2012 2013 switch ((ctrl >> 4) & 3) { 2014 case 1: 2015 res ^= (2 << upper) - 1; 2016 break; 2017 case 3: 2018 res ^= (1 << (valids + 1)) - 1; 2019 break; 2020 } 2021 2022 if (res) { 2023 CC_SRC |= CC_C; 2024 } 2025 if (res & 1) { 2026 CC_SRC |= CC_O; 2027 } 2028 2029 return res; 2030 } 2031 2032 void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2033 uint32_t ctrl) 2034 { 2035 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2036 pcmp_elen(env, R_EDX, ctrl), 2037 pcmp_elen(env, R_EAX, ctrl)); 2038 2039 if (res) { 2040 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res); 2041 } else { 2042 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2043 } 2044 } 2045 2046 void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2047 uint32_t ctrl) 2048 { 2049 int i; 2050 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2051 pcmp_elen(env, R_EDX, ctrl), 2052 pcmp_elen(env, R_EAX, ctrl)); 2053 2054 if ((ctrl >> 6) & 1) { 2055 if (ctrl & 1) { 2056 for (i = 0; i < 8; i++, res >>= 1) { 2057 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2058 } 2059 } else { 2060 for (i = 0; i < 16; i++, res >>= 1) { 2061 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2062 } 2063 } 2064 } else { 2065 env->xmm_regs[0].Q(1) = 0; 2066 env->xmm_regs[0].Q(0) = res; 2067 } 2068 } 2069 2070 void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2071 uint32_t ctrl) 2072 { 2073 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2074 pcmp_ilen(s, ctrl), 2075 pcmp_ilen(d, ctrl)); 2076 2077 if (res) { 2078 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 
31 - clz32(res) : ctz32(res); 2079 } else { 2080 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); 2081 } 2082 } 2083 2084 void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2085 uint32_t ctrl) 2086 { 2087 int i; 2088 unsigned int res = pcmpxstrx(env, d, s, ctrl, 2089 pcmp_ilen(s, ctrl), 2090 pcmp_ilen(d, ctrl)); 2091 2092 if ((ctrl >> 6) & 1) { 2093 if (ctrl & 1) { 2094 for (i = 0; i < 8; i++, res >>= 1) { 2095 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0; 2096 } 2097 } else { 2098 for (i = 0; i < 16; i++, res >>= 1) { 2099 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0; 2100 } 2101 } 2102 } else { 2103 env->xmm_regs[0].Q(1) = 0; 2104 env->xmm_regs[0].Q(0) = res; 2105 } 2106 } 2107 2108 #define CRCPOLY 0x1edc6f41 2109 #define CRCPOLY_BITREV 0x82f63b78 2110 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) 2111 { 2112 target_ulong crc = (msg & ((target_ulong) -1 >> 2113 (TARGET_LONG_BITS - len))) ^ crc1; 2114 2115 while (len--) { 2116 crc = (crc >> 1) ^ ((crc & 1) ? 
CRCPOLY_BITREV : 0); 2117 } 2118 2119 return crc; 2120 } 2121 2122 #endif 2123 2124 #if SHIFT == 1 2125 static void clmulq(uint64_t *dest_l, uint64_t *dest_h, 2126 uint64_t a, uint64_t b) 2127 { 2128 uint64_t al, ah, resh, resl; 2129 2130 ah = 0; 2131 al = a; 2132 resh = resl = 0; 2133 2134 while (b) { 2135 if (b & 1) { 2136 resl ^= al; 2137 resh ^= ah; 2138 } 2139 ah = (ah << 1) | (al >> 63); 2140 al <<= 1; 2141 b >>= 1; 2142 } 2143 2144 *dest_l = resl; 2145 *dest_h = resh; 2146 } 2147 #endif 2148 2149 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, 2150 uint32_t ctrl) 2151 { 2152 uint64_t a, b; 2153 int i; 2154 2155 for (i = 0; i < 1 << SHIFT; i += 2) { 2156 a = v->Q(((ctrl & 1) != 0) + i); 2157 b = s->Q(((ctrl & 16) != 0) + i); 2158 clmulq(&d->Q(i), &d->Q(i + 1), a, b); 2159 } 2160 } 2161 2162 void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2163 { 2164 int i; 2165 Reg st = *v; 2166 Reg rk = *s; 2167 2168 for (i = 0 ; i < 2 << SHIFT ; i++) { 2169 int j = i & 3; 2170 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^ 2171 AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^ 2172 AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^ 2173 AES_Td3[st.B(AES_ishifts[4 * j + 3])]); 2174 } 2175 } 2176 2177 void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2178 { 2179 int i; 2180 Reg st = *v; 2181 Reg rk = *s; 2182 2183 for (i = 0; i < 8 << SHIFT; i++) { 2184 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]); 2185 } 2186 } 2187 2188 void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2189 { 2190 int i; 2191 Reg st = *v; 2192 Reg rk = *s; 2193 2194 for (i = 0 ; i < 2 << SHIFT ; i++) { 2195 int j = i & 3; 2196 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^ 2197 AES_Te1[st.B(AES_shifts[4 * j + 1])] ^ 2198 AES_Te2[st.B(AES_shifts[4 * j + 2])] ^ 2199 AES_Te3[st.B(AES_shifts[4 * j + 3])]); 2200 } 2201 } 2202 2203 void 
glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2204 { 2205 int i; 2206 Reg st = *v; 2207 Reg rk = *s; 2208 2209 for (i = 0; i < 8 << SHIFT; i++) { 2210 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]); 2211 } 2212 } 2213 2214 #if SHIFT == 1 2215 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2216 { 2217 int i; 2218 Reg tmp = *s; 2219 2220 for (i = 0 ; i < 4 ; i++) { 2221 d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ 2222 AES_imc[tmp.B(4 * i + 1)][1] ^ 2223 AES_imc[tmp.B(4 * i + 2)][2] ^ 2224 AES_imc[tmp.B(4 * i + 3)][3]); 2225 } 2226 } 2227 2228 void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, 2229 uint32_t ctrl) 2230 { 2231 int i; 2232 Reg tmp = *s; 2233 2234 for (i = 0 ; i < 4 ; i++) { 2235 d->B(i) = AES_sbox[tmp.B(i + 4)]; 2236 d->B(i + 8) = AES_sbox[tmp.B(i + 12)]; 2237 } 2238 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl; 2239 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; 2240 } 2241 #endif 2242 #endif 2243 2244 #if SHIFT >= 1 2245 void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2246 { 2247 uint64_t r0, r1; 2248 int i; 2249 2250 for (i = 0; i < 1 << SHIFT; i += 2) { 2251 r0 = v->Q(i + ((s->Q(i) >> 1) & 1)); 2252 r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1)); 2253 d->Q(i) = r0; 2254 d->Q(i+1) = r1; 2255 } 2256 } 2257 2258 void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2259 { 2260 uint32_t r0, r1, r2, r3; 2261 int i; 2262 2263 for (i = 0; i < 2 << SHIFT; i += 4) { 2264 r0 = v->L(i + (s->L(i) & 3)); 2265 r1 = v->L(i + (s->L(i+1) & 3)); 2266 r2 = v->L(i + (s->L(i+2) & 3)); 2267 r3 = v->L(i + (s->L(i+3) & 3)); 2268 d->L(i) = r0; 2269 d->L(i+1) = r1; 2270 d->L(i+2) = r2; 2271 d->L(i+3) = r3; 2272 } 2273 } 2274 2275 void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order) 2276 { 2277 uint64_t r0, r1; 2278 int i; 2279 2280 for (i = 0; i < 1 << SHIFT; i += 2) { 2281 r0 = s->Q(i + ((order >> 0) & 1)); 
2282 r1 = s->Q(i + ((order >> 1) & 1)); 2283 d->Q(i) = r0; 2284 d->Q(i+1) = r1; 2285 2286 order >>= 2; 2287 } 2288 } 2289 2290 void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order) 2291 { 2292 uint32_t r0, r1, r2, r3; 2293 int i; 2294 2295 for (i = 0; i < 2 << SHIFT; i += 4) { 2296 r0 = s->L(i + ((order >> 0) & 3)); 2297 r1 = s->L(i + ((order >> 2) & 3)); 2298 r2 = s->L(i + ((order >> 4) & 3)); 2299 r3 = s->L(i + ((order >> 6) & 3)); 2300 d->L(i) = r0; 2301 d->L(i+1) = r1; 2302 d->L(i+2) = r2; 2303 d->L(i+3) = r3; 2304 } 2305 } 2306 2307 #if SHIFT == 1 2308 #define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0) 2309 #define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0) 2310 #define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31)) 2311 #define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63)) 2312 #define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0) 2313 #define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0) 2314 #endif 2315 2316 SSE_HELPER_L(helper_vpsrlvd, FPSRLVD) 2317 SSE_HELPER_L(helper_vpsravd, FPSRAVD) 2318 SSE_HELPER_L(helper_vpsllvd, FPSLLVD) 2319 2320 SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ) 2321 SSE_HELPER_Q(helper_vpsravq, FPSRAVQ) 2322 SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ) 2323 2324 void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2325 { 2326 uint32_t zf = 0, cf = 0; 2327 int i; 2328 2329 for (i = 0; i < 2 << SHIFT; i++) { 2330 zf |= (s->L(i) & d->L(i)); 2331 cf |= (s->L(i) & ~d->L(i)); 2332 } 2333 CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C); 2334 } 2335 2336 void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 2337 { 2338 uint64_t zf = 0, cf = 0; 2339 int i; 2340 2341 for (i = 0; i < 1 << SHIFT; i++) { 2342 zf |= (s->Q(i) & d->Q(i)); 2343 cf |= (s->Q(i) & ~d->Q(i)); 2344 } 2345 CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 
0 : CC_C); 2346 } 2347 2348 void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env, 2349 Reg *v, Reg *s, target_ulong a0) 2350 { 2351 int i; 2352 2353 for (i = 0; i < (2 << SHIFT); i++) { 2354 if (v->L(i) >> 31) { 2355 cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC()); 2356 } 2357 } 2358 } 2359 2360 void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env, 2361 Reg *v, Reg *s, target_ulong a0) 2362 { 2363 int i; 2364 2365 for (i = 0; i < (1 << SHIFT); i++) { 2366 if (v->Q(i) >> 63) { 2367 cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC()); 2368 } 2369 } 2370 } 2371 2372 void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2373 { 2374 int i; 2375 2376 for (i = 0; i < (2 << SHIFT); i++) { 2377 d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0; 2378 } 2379 } 2380 2381 void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) 2382 { 2383 int i; 2384 2385 for (i = 0; i < (1 << SHIFT); i++) { 2386 d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0; 2387 } 2388 } 2389 2390 void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env, 2391 Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) 2392 { 2393 int i; 2394 for (i = 0; i < (2 << SHIFT); i++) { 2395 if (v->L(i) >> 31) { 2396 target_ulong addr = a0 2397 + ((target_ulong)(int32_t)s->L(i) << scale); 2398 d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); 2399 } 2400 v->L(i) = 0; 2401 } 2402 } 2403 2404 void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env, 2405 Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) 2406 { 2407 int i; 2408 for (i = 0; i < (1 << SHIFT); i++) { 2409 if (v->Q(i) >> 63) { 2410 target_ulong addr = a0 2411 + ((target_ulong)(int32_t)s->L(i) << scale); 2412 d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); 2413 } 2414 v->Q(i) = 0; 2415 } 2416 } 2417 2418 void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env, 2419 Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) 2420 { 2421 int i; 2422 for (i = 0; i < (1 << SHIFT); i++) { 2423 if (v->L(i) >> 31) { 2424 target_ulong 
addr = a0 2425 + ((target_ulong)(int64_t)s->Q(i) << scale); 2426 d->L(i) = cpu_ldl_data_ra(env, addr, GETPC()); 2427 } 2428 v->L(i) = 0; 2429 } 2430 for (i /= 2; i < 1 << SHIFT; i++) { 2431 d->Q(i) = 0; 2432 v->Q(i) = 0; 2433 } 2434 } 2435 2436 void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env, 2437 Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale) 2438 { 2439 int i; 2440 for (i = 0; i < (1 << SHIFT); i++) { 2441 if (v->Q(i) >> 63) { 2442 target_ulong addr = a0 2443 + ((target_ulong)(int64_t)s->Q(i) << scale); 2444 d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC()); 2445 } 2446 v->Q(i) = 0; 2447 } 2448 } 2449 #endif 2450 2451 #if SHIFT >= 2 2452 void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order) 2453 { 2454 uint64_t r0, r1, r2, r3; 2455 2456 switch (order & 3) { 2457 case 0: 2458 r0 = v->Q(0); 2459 r1 = v->Q(1); 2460 break; 2461 case 1: 2462 r0 = v->Q(2); 2463 r1 = v->Q(3); 2464 break; 2465 case 2: 2466 r0 = s->Q(0); 2467 r1 = s->Q(1); 2468 break; 2469 case 3: 2470 r0 = s->Q(2); 2471 r1 = s->Q(3); 2472 break; 2473 } 2474 switch ((order >> 4) & 3) { 2475 case 0: 2476 r2 = v->Q(0); 2477 r3 = v->Q(1); 2478 break; 2479 case 1: 2480 r2 = v->Q(2); 2481 r3 = v->Q(3); 2482 break; 2483 case 2: 2484 r2 = s->Q(0); 2485 r3 = s->Q(1); 2486 break; 2487 case 3: 2488 r2 = s->Q(2); 2489 r3 = s->Q(3); 2490 break; 2491 } 2492 d->Q(0) = r0; 2493 d->Q(1) = r1; 2494 d->Q(2) = r2; 2495 d->Q(3) = r3; 2496 } 2497 2498 void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order) 2499 { 2500 uint64_t r0, r1, r2, r3; 2501 r0 = s->Q(order & 3); 2502 r1 = s->Q((order >> 2) & 3); 2503 r2 = s->Q((order >> 4) & 3); 2504 r3 = s->Q((order >> 6) & 3); 2505 d->Q(0) = r0; 2506 d->Q(1) = r1; 2507 d->Q(2) = r2; 2508 d->Q(3) = r3; 2509 } 2510 2511 void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s) 2512 { 2513 uint32_t r[8]; 2514 int i; 2515 2516 for (i = 0; i < 8; i++) { 2517 r[i] = s->L(v->L(i) & 7); 2518 } 2519 for (i = 0; i < 8; i++) { 2520 d->L(i) = r[i]; 2521 } 2522 } 2523 #endif 2524 2525 
/* FMA3 op helpers */
#if SHIFT == 1
/*
 * Scalar fused multiply-add: defines @name, which writes only element 0
 * of d with F(a[0], b[0], c[0], flags, &env->sse_status).
 */
#define SSE_HELPER_FMAS(name, elem, F)                                         \
    void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c, int flags)     \
    {                                                                          \
        d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0),                     \
                       flags, &env->sse_status);                               \
    }
/*
 * Packed fused multiply-add: defines @name with the current SUFFIX,
 * applying F to each of the @num elements.  @flags is XORed with @flip
 * after every element, so consecutive elements can use different flags.
 */
#define SSE_HELPER_FMAP(name, elem, num, F)                                    \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c,  \
                            int flags, int flip)                               \
    {                                                                          \
        for (int lane = 0; lane < num; lane++, flags ^= flip) {                \
            d->elem(lane) = F(a->elem(lane), b->elem(lane), c->elem(lane),     \
                              flags, &env->sse_status);                        \
        }                                                                      \
    }

SSE_HELPER_FMAS(helper_fma4ss,  ZMM_S, float32_muladd)
SSE_HELPER_FMAS(helper_fma4sd,  ZMM_D, float64_muladd)
#endif

/* The packed helpers are instantiated once per vector width. */
#if SHIFT >= 1
SSE_HELPER_FMAP(helper_fma4ps,  ZMM_S, 2 << SHIFT, float32_muladd)
SSE_HELPER_FMAP(helper_fma4pd,  ZMM_D, 1 << SHIFT, float64_muladd)
#endif

#undef SSE_HELPER_S

/* Drop the per-width template parameters defined at the top of the file. */
#undef LANE_WIDTH
#undef SHIFT
#undef XMM_ONLY
#undef Reg
#undef B
#undef W
#undef L
#undef Q
#undef SUFFIX