/*
 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
 *
 * Copyright (c) 2005 Fabrice Bellard
 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "crypto/aes.h"

#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#define SUFFIX _mmx
#else
#define Reg ZMMReg
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
#if SHIFT == 1
#define SUFFIX _xmm
#else
#define SUFFIX _ymm
#endif
#endif

#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)

#if SHIFT == 0
#define FPSRL(x, c) ((x) >> shift)
#define FPSRAW(x, c) ((int16_t)(x) >> shift)
#define FPSRAL(x, c) ((int32_t)(x) >> shift)
#define FPSLL(x, c) ((x) << shift)
#endif

void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 15) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 4 << SHIFT; i++) {
            d->W(i) = FPSRL(s->W(i), shift);
        }
    }
}

void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 15) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 4 << SHIFT; i++) {
            d->W(i) = FPSLL(s->W(i), shift);
        }
    }
}

void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 15) {
        shift = 15;
    } else {
        shift = c->B(0);
    }
    for (int i = 0; i < 4 << SHIFT; i++) {
        d->W(i) = FPSRAW(s->W(i), shift);
    }
}

void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 31) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 2 << SHIFT; i++) {
            d->L(i) = FPSRL(s->L(i), shift);
        }
    }
}

void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 31) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 2 << SHIFT; i++) {
            d->L(i) = FPSLL(s->L(i), shift);
        }
    }
}

void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 31) {
        shift = 31;
    } else {
        shift = c->B(0);
    }
    for (int i = 0; i < 2 << SHIFT; i++) {
        d->L(i) = FPSRAL(s->L(i), shift);
    }
}
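
/*
 * This file is compiled once per vector register width: SHIFT == 0
 * produces the 64-bit MMX helpers (SUFFIX _mmx), SHIFT == 1 the 128-bit
 * SSE helpers (_xmm) and SHIFT == 2 the 256-bit AVX helpers (_ymm).
 * Loop bounds are written as an element count per 64 bits scaled by the
 * width, e.g. "4 << SHIFT" words or "1 << SHIFT" quadwords.  In the
 * shift helpers above, the count is taken from the low 64 bits of *c;
 * an out-of-range count zeroes the destination for logical shifts and
 * is clamped to (element width - 1) for arithmetic shifts, matching the
 * architected PSRL/PSLL/PSRA behaviour.
 */
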
void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 63) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = FPSRL(s->Q(i), shift);
        }
    }
}

void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift;
    if (c->Q(0) > 63) {
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = 0;
        }
    } else {
        shift = c->B(0);
        for (int i = 0; i < 1 << SHIFT; i++) {
            d->Q(i) = FPSLL(s->Q(i), shift);
        }
    }
}

#if SHIFT >= 1
void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift, i, j;

    shift = c->L(0);
    if (shift > 16) {
        shift = 16;
    }
    for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
        for (i = 0; i < 16 - shift; i++) {
            d->B(j + i) = s->B(j + i + shift);
        }
        for (i = 16 - shift; i < 16; i++) {
            d->B(j + i) = 0;
        }
    }
}

void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
{
    int shift, i, j;

    shift = c->L(0);
    if (shift > 16) {
        shift = 16;
    }
    for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
        for (i = 15; i >= shift; i--) {
            d->B(j + i) = s->B(j + i - shift);
        }
        for (i = 0; i < shift; i++) {
            d->B(j + i) = 0;
        }
    }
}
#endif

#define SSE_HELPER_1(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
    { \
        int n = num; \
        for (int i = 0; i < n; i++) { \
            d->elem(i) = F(s->elem(i)); \
        } \
    }

#define SSE_HELPER_2(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        int n = num; \
        for (int i = 0; i < n; i++) { \
            d->elem(i) = F(v->elem(i), s->elem(i)); \
        } \
    }

#define SSE_HELPER_B(name, F) \
    SSE_HELPER_2(name, B, 8 << SHIFT, F)

#define SSE_HELPER_W(name, F) \
    SSE_HELPER_2(name, W, 4 << SHIFT, F)

#define SSE_HELPER_L(name, F) \
    SSE_HELPER_2(name, L, 2 << SHIFT, F)

#define SSE_HELPER_Q(name, F) \
    SSE_HELPER_2(name, Q, 1 << SHIFT, F)

#if SHIFT == 0
static inline int satub(int x)
{
    if (x < 0) {
        return 0;
    } else if (x > 255) {
        return 255;
    } else {
        return x;
    }
}

static inline int satuw(int x)
{
    if (x < 0) {
        return 0;
    } else if (x > 65535) {
        return 65535;
    } else {
        return x;
    }
}

static inline int satsb(int x)
{
    if (x < -128) {
        return -128;
    } else if (x > 127) {
        return 127;
    } else {
        return x;
    }
}

static inline int satsw(int x)
{
    if (x < -32768) {
        return -32768;
    } else if (x > 32767) {
        return 32767;
    } else {
        return x;
    }
}

#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
#define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
#define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
#define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
#define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)

#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) ((a) * (b) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

#define FAVG(a, b) (((a) + (b) + 1) >> 1)
#endif

SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

#if SHIFT == 0
void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    d->W(0) = FMULHRW(d->W(0), s->W(0));
    d->W(1) = FMULHRW(d->W(1), s->W(1));
    d->W(2) = FMULHRW(d->W(2), s->W(2));
    d->W(3) = FMULHRW(d->W(3), s->W(3));
}
#endif

SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)

void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < (1 << SHIFT); i++) {
        d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
    }
}

void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < (2 << SHIFT); i++) {
        d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
                  (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
    }
}

#if SHIFT == 0
static inline int abs1(int a)
{
    if (a < 0) {
        return -a;
    } else {
        return a;
    }
}
#endif
void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < (1 << SHIFT); i++) {
        unsigned int val = 0;
        val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
        val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
        val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
        val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
        val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
        val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
        val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
        val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
        d->Q(i) = val;
    }
}

#if SHIFT < 2
void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  target_ulong a0)
{
    int i;

    for (i = 0; i < (8 << SHIFT); i++) {
        if (s->B(i) & 0x80) {
            cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
        }
    }
}
#endif

#define SHUFFLE4(F, a, b, offset) do { \
    r0 = a->F((order & 3) + offset); \
    r1 = a->F(((order >> 2) & 3) + offset); \
    r2 = b->F(((order >> 4) & 3) + offset); \
    r3 = b->F(((order >> 6) & 3) + offset); \
    d->F(offset) = r0; \
    d->F(offset + 1) = r1; \
    d->F(offset + 2) = r2; \
    d->F(offset + 3) = r3; \
    } while (0)

#if SHIFT == 0
void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
{
    uint16_t r0, r1, r2, r3;

    SHUFFLE4(W, s, s, 0);
}
#else
void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
{
    uint32_t r0, r1, r2, r3;
    int i;

    for (i = 0; i < 2 << SHIFT; i += 4) {
        SHUFFLE4(L, v, s, i);
    }
}

void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
{
    uint64_t r0, r1;
    int i;

    for (i = 0; i < 1 << SHIFT; i += 2) {
        r0 = v->Q(((order & 1) & 1) + i);
        r1 = s->Q(((order >> 1) & 1) + i);
        d->Q(i) = r0;
        d->Q(i + 1) = r1;
        order >>= 2;
    }
}

void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{
    uint32_t r0, r1, r2, r3;
    int i;

    for (i = 0; i < 2 << SHIFT; i += 4) {
        SHUFFLE4(L, s, s, i);
    }
}
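
/*
 * The "order" immediate consumed by SHUFFLE4 supplies two bits per
 * destination element: bits 1:0 select the element copied into slot 0,
 * bits 3:2 slot 1, and so on.  For example, order == 0x1b reverses the
 * four elements of a lane, while order == 0 broadcasts element 0.
 */
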
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
{
    uint16_t r0, r1, r2, r3;
    int i, j;

    for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
        SHUFFLE4(W, s, s, i);
        d->Q(j) = s->Q(j);
    }
}

void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
{
    uint16_t r0, r1, r2, r3;
    int i, j;

    for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
        d->Q(j) = s->Q(j);
        SHUFFLE4(W, s, s, i);
    }
}
#endif

#if SHIFT >= 1
/* FPU ops */
/* XXX: not accurate */

#define SSE_HELPER_P(name, F) \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        for (i = 0; i < 2 << SHIFT; i++) { \
            d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i)); \
        } \
    } \
    \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        for (i = 0; i < 1 << SHIFT; i++) { \
            d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \
        } \
    }

#if SHIFT == 1

#define SSE_HELPER_S(name, F) \
    SSE_HELPER_P(name, F) \
    \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    { \
        int i; \
        d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \
        for (i = 1; i < 2 << SHIFT; i++) { \
            d->ZMM_L(i) = v->ZMM_L(i); \
        } \
    } \
    \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    { \
        int i; \
        d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \
        for (i = 1; i < 1 << SHIFT; i++) { \
            d->ZMM_Q(i) = v->ZMM_Q(i); \
        } \
    }

#else

#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#endif

#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)

/* Note that the choice of comparison op here is important to get the
 * special cases right: for min and max Intel specifies that (-0,0),
 * (NaN, anything) and (anything, NaN) return the second argument.
 */
#define FPU_MIN(size, a, b) \
    (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
#define FPU_MAX(size, a, b) \
    (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))

SSE_HELPER_S(add, FPU_ADD)
SSE_HELPER_S(sub, FPU_SUB)
SSE_HELPER_S(mul, FPU_MUL)
SSE_HELPER_S(div, FPU_DIV)
SSE_HELPER_S(min, FPU_MIN)
SSE_HELPER_S(max, FPU_MAX)

void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
    }
}

void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 0; i < 1 << SHIFT; i++) {
        d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
    }
}

#if SHIFT == 1
void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
}

void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
    for (i = 1; i < 1 << SHIFT; i++) {
        d->ZMM_Q(i) = v->ZMM_Q(i);
    }
}
#endif

/* float to float conversions */
void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 1 << SHIFT; --i >= 0; ) {
        d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
    }
}

void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 0; i < 1 << SHIFT; i++) {
        d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
    }
    for (i >>= 1; i < 1 << SHIFT; i++) {
        d->Q(i) = 0;
    }
}

#if SHIFT == 1
void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
    for (i = 1; i < 1 << SHIFT; i++) {
        d->ZMM_Q(i) = v->ZMM_Q(i);
    }
}

void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
}
#endif

/* integer to float */
void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
    }
}

void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    for (i = 1 << SHIFT; --i >= 0; ) {
        int32_t l = s->ZMM_L(i);
        d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
    }
}

#if SHIFT == 1
void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
    d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
}

void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
    d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
}

void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
}

void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
}

#ifdef TARGET_X86_64
void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
}

void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
}
#endif

#endif

/* float to integer */

#if SHIFT == 1
/*
 * x86 mandates that we return the indefinite integer value for the result
 * of any float-to-integer conversion that raises the 'invalid' exception.
 * Wrap the softfloat functions to get this behaviour.
 */
#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE) \
    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s) \
    { \
        int oldflags, newflags; \
        RETTYPE r; \
        \
        oldflags = get_float_exception_flags(s); \
        set_float_exception_flags(0, s); \
        r = FN(a, s); \
        newflags = get_float_exception_flags(s); \
        if (newflags & float_flag_invalid) { \
            r = INDEFVALUE; \
        } \
        set_float_exception_flags(newflags | oldflags, s); \
        return r; \
    }

WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
#endif

void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
    }
}

void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int i;
    for (i = 0; i < 1 << SHIFT; i++) {
        d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
    }
    for (i >>= 1; i < 1 << SHIFT; i++) {
        d->Q(i) = 0;
    }
}

#if SHIFT == 1
void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
    d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
}

void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
    d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
}

int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
{
    return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
}

int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
{
    return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
}

#ifdef TARGET_X86_64
int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
{
    return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
}

int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
{
    return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
}
#endif
#endif

/* float to integer truncated */
void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
                                                         &env->sse_status);
    }
}

void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int i;
    for (i = 0; i < 1 << SHIFT; i++) {
        d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
                                                         &env->sse_status);
    }
    for (i >>= 1; i < 1 << SHIFT; i++) {
        d->Q(i) = 0;
    }
}

#if SHIFT == 1
void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
    d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
}

void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
    d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
}

int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
{
    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
}

int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
{
    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
}

#ifdef TARGET_X86_64
int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
{
    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
}

int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
{
    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
}
#endif
#endif

void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_S(i) = float32_div(float32_one,
                                  float32_sqrt(s->ZMM_S(i), &env->sse_status),
                                  &env->sse_status);
    }
    set_float_exception_flags(old_flags, &env->sse_status);
}

#if SHIFT == 1
void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    int i;
    d->ZMM_S(0) = float32_div(float32_one,
                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
                              &env->sse_status);
    set_float_exception_flags(old_flags, &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
}
#endif

void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    int i;
    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
    }
    set_float_exception_flags(old_flags, &env->sse_status);
}

#if SHIFT == 1
void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    int i;
    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
    set_float_exception_flags(old_flags, &env->sse_status);
}
#endif

#if SHIFT == 1
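/*
 * SSE4a bit-field helpers (EXTRQ/INSERTQ): "len" selects how many bits of
 * the low quadword are extracted or inserted starting at bit "shift"; a
 * length of 0 is treated as the full 64 bits.  The register forms pull the
 * length and index out of byte fields of the source operand.
 */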
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    if (len == 0) {
        mask = ~0LL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}

void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63);
}

void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
{
    d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
}

static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len)
{
    uint64_t mask;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (dest & ~(mask << shift)) | ((src & mask) << shift);
}

void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63);
}

void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length)
{
    d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length);
}
#endif

#define SSE_HELPER_HPS(name, F) \
    void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        float32 r[2 << SHIFT]; \
        int i, j, k; \
        for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
            for (i = j = 0; j < 4; i++, j += 2) { \
                r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
            } \
            for (j = 0; j < 4; i++, j += 2) { \
                r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
            } \
        } \
        for (i = 0; i < 2 << SHIFT; i++) { \
            d->ZMM_S(i) = r[i]; \
        } \
    }

SSE_HELPER_HPS(haddps, float32_add)
SSE_HELPER_HPS(hsubps, float32_sub)

#define SSE_HELPER_HPD(name, F) \
    void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        float64 r[1 << SHIFT]; \
        int i, j, k; \
        for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
            for (i = j = 0; j < 2; i++, j += 2) { \
                r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
            } \
            for (j = 0; j < 2; i++, j += 2) { \
                r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
            } \
        } \
        for (i = 0; i < 1 << SHIFT; i++) { \
            d->ZMM_D(i) = r[i]; \
        } \
    }

SSE_HELPER_HPD(haddpd, float64_add)
SSE_HELPER_HPD(hsubpd, float64_sub)

void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    for (i = 0; i < 2 << SHIFT; i += 2) {
        d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
        d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
    }
}

void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    for (i = 0; i < 1 << SHIFT; i += 2) {
        d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
        d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
    }
}

#define SSE_HELPER_CMP_P(name, F, C) \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        for (i = 0; i < 2 << SHIFT; i++) { \
            d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \
        } \
    } \
    \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        for (i = 0; i < 1 << SHIFT; i++) { \
            d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \
        } \
    }

#if SHIFT == 1
#define SSE_HELPER_CMP(name, F, C) \
    SSE_HELPER_CMP_P(name, F, C) \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \
        for (i = 1; i < 2 << SHIFT; i++) { \
            d->ZMM_L(i) = v->ZMM_L(i); \
        } \
    } \
    \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        int i; \
        d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \
        for (i = 1; i < 1 << SHIFT; i++) { \
            d->ZMM_Q(i) = v->ZMM_Q(i); \
        } \
    }

static inline bool FPU_EQU(FloatRelation x)
{
    return (x == float_relation_equal || x == float_relation_unordered);
}
static inline bool FPU_GE(FloatRelation x)
{
    return (x == float_relation_equal || x == float_relation_greater);
}
#define FPU_EQ(x) (x == float_relation_equal)
#define FPU_LT(x) (x == float_relation_less)
#define FPU_LE(x) (x <= float_relation_equal)
#define FPU_GT(x) (x == float_relation_greater)
#define FPU_UNORD(x) (x == float_relation_unordered)
/* We must make sure we evaluate the argument in case it is a signalling NAN */
#define FPU_FALSE(x) (x == float_relation_equal && 0)

#define FPU_CMPQ(size, a, b) \
    float ## size ## _compare_quiet(a, b, &env->sse_status)
#define FPU_CMPS(size, a, b) \
    float ## size ## _compare(a, b, &env->sse_status)

#else
#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
#endif

SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD)
SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)

SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE)
SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE)

SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD)
SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)

SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE)
SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE)

#undef SSE_HELPER_CMP

#if SHIFT == 1
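/*
 * COMISS/COMISD set ZF/PF/CF from the comparison result: less-than sets
 * CF, equal sets ZF, greater-than clears all three, and an unordered
 * result (a NaN operand) sets ZF, PF and CF.  FloatRelation values are
 * -1, 0, 1 and 2, so "ret + 1" indexes the table below.
 */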
static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};

void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation ret;
    float32 s0, s1;

    s0 = d->ZMM_S(0);
    s1 = s->ZMM_S(0);
    ret = float32_compare_quiet(s0, s1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation ret;
    float32 s0, s1;

    s0 = d->ZMM_S(0);
    s1 = s->ZMM_S(0);
    ret = float32_compare(s0, s1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation ret;
    float64 d0, d1;

    d0 = d->ZMM_D(0);
    d1 = s->ZMM_D(0);
    ret = float64_compare_quiet(d0, d1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}

void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation ret;
    float64 d0, d1;

    d0 = d->ZMM_D(0);
    d1 = s->ZMM_D(0);
    ret = float64_compare(d0, d1, &env->sse_status);
    CC_SRC = comis_eflags[ret + 1];
}
#endif

uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
{
    uint32_t mask;
    int i;

    mask = 0;
    for (i = 0; i < 2 << SHIFT; i++) {
        mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
    }
    return mask;
}

uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
{
    uint32_t mask;
    int i;

    mask = 0;
    for (i = 0; i < 1 << SHIFT; i++) {
        mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
    }
    return mask;
}

#endif

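/*
 * The pack/unpack helpers below assemble their result in a temporary
 * array before writing it out, since the destination register may alias
 * one of the source operands.
 */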
#define PACK_HELPER_B(name, F) \
    void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
                                           Reg *d, Reg *v, Reg *s) \
    { \
        uint8_t r[PACK_WIDTH * 2]; \
        int j, k; \
        for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
            for (k = 0; k < PACK_WIDTH; k++) { \
                r[k] = F((int16_t)v->W(j + k)); \
            } \
            for (k = 0; k < PACK_WIDTH; k++) { \
                r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
            } \
            for (k = 0; k < PACK_WIDTH * 2; k++) { \
                d->B(2 * j + k) = r[k]; \
            } \
        } \
    }

PACK_HELPER_B(sswb, satsb)
PACK_HELPER_B(uswb, satub)

void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    uint16_t r[PACK_WIDTH];
    int j, k;

    for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
        for (k = 0; k < PACK_WIDTH / 2; k++) {
            r[k] = satsw(v->L(j + k));
        }
        for (k = 0; k < PACK_WIDTH / 2; k++) {
            r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
        }
        for (k = 0; k < PACK_WIDTH; k++) {
            d->W(2 * j + k) = r[k];
        }
    }
}

#define UNPCK_OP(base_name, base) \
    \
    void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *v, Reg *s) \
    { \
        uint8_t r[PACK_WIDTH * 2]; \
        int j, i; \
        \
        for (j = 0; j < 8 << SHIFT; ) { \
            int k = j + base * PACK_WIDTH; \
            for (i = 0; i < PACK_WIDTH; i++) { \
                r[2 * i] = v->B(k + i); \
                r[2 * i + 1] = s->B(k + i); \
            } \
            for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
                d->B(j) = r[i]; \
            } \
        } \
    } \
    \
    void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *v, Reg *s) \
    { \
        uint16_t r[PACK_WIDTH]; \
        int j, i; \
        \
        for (j = 0; j < 4 << SHIFT; ) { \
            int k = j + base * PACK_WIDTH / 2; \
            for (i = 0; i < PACK_WIDTH / 2; i++) { \
                r[2 * i] = v->W(k + i); \
                r[2 * i + 1] = s->W(k + i); \
            } \
            for (i = 0; i < PACK_WIDTH; i++, j++) { \
                d->W(j) = r[i]; \
            } \
        } \
    } \
    \
    void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
                                                        Reg *d, Reg *v, Reg *s) \
    { \
        uint32_t r[PACK_WIDTH / 2]; \
        int j, i; \
        \
        for (j = 0; j < 2 << SHIFT; ) { \
            int k = j + base * PACK_WIDTH / 4; \
            for (i = 0; i < PACK_WIDTH / 4; i++) { \
                r[2 * i] = v->L(k + i); \
                r[2 * i + 1] = s->L(k + i); \
            } \
            for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
                d->L(j) = r[i]; \
            } \
        } \
    } \
    \
    XMM_ONLY( \
        void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
            CPUX86State *env, Reg *d, Reg *v, Reg *s) \
        { \
            uint64_t r[2]; \
            int i; \
            \
            for (i = 0; i < 1 << SHIFT; i += 2) { \
                r[0] = v->Q(base + i); \
                r[1] = s->Q(base + i); \
                d->Q(i) = r[0]; \
                d->Q(i + 1) = r[1]; \
            } \
        } \
        )

UNPCK_OP(l, 0)
UNPCK_OP(h, 1)

#undef PACK_WIDTH
#undef PACK_HELPER_B
#undef UNPCK_OP


/* 3DNow! float ops */
#if SHIFT == 0
void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
    d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
}

void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
    d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
}

void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
    d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
}

void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
                                                       &env->mmx_status));
    d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
                                                       &env->mmx_status));
}

void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    float32 r;

    r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    d->MMX_S(0) = r;
}

void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}

void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
                                   &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
                                   &env->mmx_status) ? -1 : 0;
}

void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
                             &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
                             &env->mmx_status) ? -1 : 0;
}

void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
                             &env->mmx_status) ? -1 : 0;
    d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
                             &env->mmx_status) ? -1 : 0;
}

void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
        d->MMX_S(0) = s->MMX_S(0);
    }
    if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
        d->MMX_S(1) = s->MMX_S(1);
    }
}

void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
        d->MMX_S(0) = s->MMX_S(0);
    }
    if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
        d->MMX_S(1) = s->MMX_S(1);
    }
}

void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}

void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    float32 r;

    r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    d->MMX_S(0) = r;
}

void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    float32 r;

    r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
    d->MMX_S(0) = r;
}

void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = d->MMX_S(0);
}

void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
    d->MMX_S(1) = float32_div(float32_one,
                              float32_sqrt(d->MMX_S(1), &env->mmx_status),
                              &env->mmx_status);
    d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
    d->MMX_L(0) = d->MMX_L(1);
}

void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
}

void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
    d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
}

void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
{
    uint32_t r;

    r = s->MMX_L(0);
    d->MMX_L(0) = s->MMX_L(1);
    d->MMX_L(1) = r;
}
#endif

/* SSSE3 op helpers */
void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
#if SHIFT == 0
    uint8_t r[8];

    for (i = 0; i < 8; i++) {
        r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
    }
    for (i = 0; i < 8; i++) {
        d->B(i) = r[i];
    }
#else
    uint8_t r[8 << SHIFT];

    for (i = 0; i < 8 << SHIFT; i++) {
        int j = i & ~0xf;
        r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
    }
    for (i = 0; i < 8 << SHIFT; i++) {
        d->B(i) = r[i];
    }
#endif
}

#define SSE_HELPER_HW(name, F) \
    void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        uint16_t r[4 << SHIFT]; \
        int i, j, k; \
        for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
            for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
                r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
            } \
            for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
                r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
            } \
        } \
        for (i = 0; i < 4 << SHIFT; i++) { \
            d->W(i) = r[i]; \
        } \
    }

#define SSE_HELPER_HL(name, F) \
    void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
    { \
        uint32_t r[2 << SHIFT]; \
        int i, j, k; \
        for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
            for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
                r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
            } \
            for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
                r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
            } \
        } \
        for (i = 0; i < 2 << SHIFT; i++) { \
            d->L(i) = r[i]; \
        } \
    }

SSE_HELPER_HW(phaddw, FADD)
SSE_HELPER_HW(phsubw, FSUB)
SSE_HELPER_HW(phaddsw, FADDSW)
SSE_HELPER_HW(phsubsw, FSUBSW)
SSE_HELPER_HL(phaddd, FADD)
SSE_HELPER_HL(phsubd, FSUB)

#undef SSE_HELPER_HW
#undef SSE_HELPER_HL

void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    for (i = 0; i < 4 << SHIFT; i++) {
        d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
                        (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
    }
}

#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)

#define FSIGNB(d, s) (s <= INT8_MAX ? s ? d : 0 : -(int8_t)d)
#define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
#define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
SSE_HELPER_B(helper_psignb, FSIGNB)
SSE_HELPER_W(helper_psignw, FSIGNW)
SSE_HELPER_L(helper_psignd, FSIGNL)

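/*
 * PALIGNR concatenates the two source operands (per 128-bit lane for the
 * SSE/AVX forms) and extracts a byte-aligned window starting "imm" bytes
 * into the pair.  The SHR macro below builds that window from 64-bit
 * pieces: it shifts a quadword right (left for a negative count) and
 * yields 0 once the count is out of range, so the ORed terms never
 * overlap.
 */
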
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                  uint32_t imm)
{
    int i;

    /* XXX could be checked during translation */
    if (imm >= (SHIFT ? 32 : 16)) {
        for (i = 0; i < (1 << SHIFT); i++) {
            d->Q(i) = 0;
        }
    } else {
        int shift = imm * 8;
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0
        d->Q(0) = SHR(s->Q(0), shift - 0) |
                  SHR(v->Q(0), shift - 64);
#else
        for (i = 0; i < (1 << SHIFT); i += 2) {
            uint64_t r0, r1;

            r0 = SHR(s->Q(i), shift - 0) |
                 SHR(s->Q(i + 1), shift - 64) |
                 SHR(v->Q(i), shift - 128) |
                 SHR(v->Q(i + 1), shift - 192);
            r1 = SHR(s->Q(i), shift + 64) |
                 SHR(s->Q(i + 1), shift - 0) |
                 SHR(v->Q(i), shift - 64) |
                 SHR(v->Q(i + 1), shift - 128);
            d->Q(i) = r0;
            d->Q(i + 1) = r1;
        }
#endif
#undef SHR
    }
}

#if SHIFT >= 1

#define SSE_HELPER_V(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \
                            Reg *m) \
    { \
        int i; \
        for (i = 0; i < num; i++) { \
            d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \
        } \
    }

#define SSE_HELPER_I(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, \
                            uint32_t imm) \
    { \
        int i; \
        for (i = 0; i < num; i++) { \
            int j = i & 7; \
            d->elem(i) = F(v->elem(i), s->elem(i), (imm >> j) & 1); \
        } \
    }

/* SSE4.1 op helpers */
#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v)
#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v)
#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v)
SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)

void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    uint64_t zf = 0, cf = 0;
    int i;

    for (i = 0; i < 1 << SHIFT; i++) {
        zf |= (s->Q(i) & d->Q(i));
        cf |= (s->Q(i) & ~d->Q(i));
    }
    CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
}

#define FMOVSLDUP(i) s->L((i) & ~1)
#define FMOVSHDUP(i) s->L((i) | 1)
#define FMOVDLDUP(i) s->Q((i) & ~1)

#define SSE_HELPER_F(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
    { \
        int n = num; \
        for (int i = n; --i >= 0; ) { \
            d->elem(i) = F(i); \
        } \
    }

#if SHIFT > 0
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif

void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < 1 << SHIFT; i++) {
        d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
    }
}

void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    uint16_t r[8];
    int i, j, k;

    for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
        r[0] = satuw(v->L(j));
        r[1] = satuw(v->L(j + 1));
        r[2] = satuw(v->L(j + 2));
        r[3] = satuw(v->L(j + 3));
        r[4] = satuw(s->L(j));
        r[5] = satuw(s->L(j + 1));
        r[6] = satuw(s->L(j + 2));
        r[7] = satuw(s->L(j + 3));
        for (k = 0; k < 8; k++) {
            d->W(i + k) = r[k];
        }
    }
}

#if SHIFT == 1
void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int idx = 0;

    if (s->W(1) < s->W(idx)) {
        idx = 1;
    }
    if (s->W(2) < s->W(idx)) {
        idx = 2;
    }
    if (s->W(3) < s->W(idx)) {
        idx = 3;
    }
    if (s->W(4) < s->W(idx)) {
        idx = 4;
    }
    if (s->W(5) < s->W(idx)) {
        idx = 5;
    }
    if (s->W(6) < s->W(idx)) {
        idx = 6;
    }
    if (s->W(7) < s->W(idx)) {
        idx = 7;
    }

    d->W(0) = s->W(idx);
    d->W(1) = idx;
    d->L(1) = 0;
    d->Q(1) = 0;
}
#endif

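/*
 * ROUNDPS/PD/SS/SD rounding-control immediate: bit 2 set means "use the
 * current MXCSR rounding mode", otherwise bits 1:0 select nearest-even,
 * down, up or toward zero; bit 3 suppresses the precision (inexact)
 * exception, emulated below by clearing the inexact flag again unless it
 * was already set on entry.
 */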
void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  uint32_t mode)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    signed char prev_rounding_mode;
    int i;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    for (i = 0; i < 2 << SHIFT; i++) {
        d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
    }

    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}

void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  uint32_t mode)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    signed char prev_rounding_mode;
    int i;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    for (i = 0; i < 1 << SHIFT; i++) {
        d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
    }

    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}

#if SHIFT == 1
void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                  uint32_t mode)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    signed char prev_rounding_mode;
    int i;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }

    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}

void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                  uint32_t mode)
{
    uint8_t old_flags = get_float_exception_flags(&env->sse_status);
    signed char prev_rounding_mode;
    int i;

    prev_rounding_mode = env->sse_status.float_rounding_mode;
    if (!(mode & (1 << 2))) {
        switch (mode & 3) {
        case 0:
            set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
            break;
        case 1:
            set_float_rounding_mode(float_round_down, &env->sse_status);
            break;
        case 2:
            set_float_rounding_mode(float_round_up, &env->sse_status);
            break;
        case 3:
            set_float_rounding_mode(float_round_to_zero, &env->sse_status);
            break;
        }
    }

    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
    for (i = 1; i < 1 << SHIFT; i++) {
        d->ZMM_Q(i) = v->ZMM_Q(i);
    }

    if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = prev_rounding_mode;
}
#endif

#define FBLENDP(v, s, m) (m ? s : v)
SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)

void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                               uint32_t mask)
{
    float32 prod1, prod2, temp2, temp3, temp4;
    int i;

    for (i = 0; i < 2 << SHIFT; i += 4) {
        /*
         * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
         * to correctly round the intermediate results
         */
        if (mask & (1 << 4)) {
            prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
        } else {
            prod1 = float32_zero;
        }
        if (mask & (1 << 5)) {
            prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
        } else {
            prod2 = float32_zero;
        }
        temp2 = float32_add(prod1, prod2, &env->sse_status);
        if (mask & (1 << 6)) {
            prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
        } else {
            prod1 = float32_zero;
        }
        if (mask & (1 << 7)) {
            prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
        } else {
            prod2 = float32_zero;
        }
        temp3 = float32_add(prod1, prod2, &env->sse_status);
        temp4 = float32_add(temp2, temp3, &env->sse_status);

        d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
        d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
        d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
        d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
    }
}

#if SHIFT == 1
/* Oddly, there is no ymm version of dppd */
void glue(helper_dppd, SUFFIX)(CPUX86State *env,
                               Reg *d, Reg *v, Reg *s, uint32_t mask)
{
    float64 prod1, prod2, temp2;

    if (mask & (1 << 4)) {
        prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
    } else {
        prod1 = float64_zero;
    }
    if (mask & (1 << 5)) {
        prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
    } else {
        prod2 = float64_zero;
    }
    temp2 = float64_add(prod1, prod2, &env->sse_status);
    d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
    d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
}
#endif

void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                  uint32_t offset)
{
    int i, j;
    uint16_t r[8];

    for (j = 0; j < 4 << SHIFT; ) {
        int s0 = (j * 2) + ((offset & 3) << 2);
        int d0 = (j * 2) + ((offset & 4) << 0);
        for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
            r[i] = 0;
            r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
            r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
            r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
            r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
        }
        for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
            d->W(j) = r[i];
        }
        offset >>= 3;
    }
}

/* SSE4.2 op helpers */
#if SHIFT == 1
static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
{
    target_long val, limit;

    /* Presence of REX.W is indicated by a bit higher than 7 set */
    if (ctrl >> 8) {
        val = (target_long)env->regs[reg];
    } else {
        val = (int32_t)env->regs[reg];
    }
    if (ctrl & 1) {
        limit = 8;
    } else {
        limit = 16;
    }
    if ((val > limit) || (val < -limit)) {
        return limit;
    }
    return abs1(val);
}

static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
{
    int val = 0;

    if (ctrl & 1) {
        while (val < 8 && r->W(val)) {
            val++;
        }
    } else {
        while (val < 16 && r->B(val)) {
            val++;
        }
    }

    return val;
}

static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
{
    switch ((ctrl >> 0) & 3) {
    case 0:
        return r->B(i);
    case 1:
        return r->W(i);
    case 2:
        return (int8_t)r->B(i);
    case 3:
    default:
        return (int16_t)r->W(i);
    }
}

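/*
 * Common core of the PCMPESTR and PCMPISTR helpers.  Bits 3:2 of the
 * control byte select the aggregation: 0 = "equal any" (character-set
 * match), 1 = "ranges", 2 = "equal each" (string compare), 3 = "equal
 * ordered" (substring search).  Bits 5:4 select the polarity applied to
 * the intermediate result; the flags are then derived from the operand
 * lengths and the final mask as coded below.
 */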
static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
                                 uint8_t ctrl, int valids, int validd)
{
    unsigned int res = 0;
    int v;
    int j, i;
    int upper = (ctrl & 1) ? 7 : 15;

    valids--;
    validd--;

    CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);

    switch ((ctrl >> 2) & 3) {
    case 0:
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = validd; i >= 0; i--) {
                res |= (v == pcmp_val(d, ctrl, i));
            }
        }
        break;
    case 1:
        for (j = valids; j >= 0; j--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, j);
            for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
                res |= (pcmp_val(d, ctrl, i - 0) >= v &&
                        pcmp_val(d, ctrl, i - 1) <= v);
            }
        }
        break;
    case 2:
        res = (1 << (upper - MAX(valids, validd))) - 1;
        res <<= MAX(valids, validd) - MIN(valids, validd);
        for (i = MIN(valids, validd); i >= 0; i--) {
            res <<= 1;
            v = pcmp_val(s, ctrl, i);
            res |= (v == pcmp_val(d, ctrl, i));
        }
        break;
    case 3:
        if (validd == -1) {
            res = (2 << upper) - 1;
            break;
        }
        for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
            res <<= 1;
            v = 1;
            for (i = MIN(valids - j, validd); i >= 0; i--) {
                v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
            }
            res |= v;
        }
        break;
    }

    switch ((ctrl >> 4) & 3) {
    case 1:
        res ^= (2 << upper) - 1;
        break;
    case 3:
        res ^= (1 << (valids + 1)) - 1;
        break;
    }

    if (res) {
        CC_SRC |= CC_C;
    }
    if (res & 1) {
        CC_SRC |= CC_O;
    }

    return res;
}

void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                    uint32_t ctrl)
{
    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                 pcmp_elen(env, R_EDX, ctrl),
                                 pcmp_elen(env, R_EAX, ctrl));

    if (res) {
        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
    } else {
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
    }
}

void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                    uint32_t ctrl)
{
    int i;
    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                 pcmp_elen(env, R_EDX, ctrl),
                                 pcmp_elen(env, R_EAX, ctrl));

    if ((ctrl >> 6) & 1) {
        if (ctrl & 1) {
            for (i = 0; i < 8; i++, res >>= 1) {
                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
            }
        } else {
            for (i = 0; i < 16; i++, res >>= 1) {
                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
            }
        }
    } else {
        env->xmm_regs[0].Q(1) = 0;
        env->xmm_regs[0].Q(0) = res;
    }
}

void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                    uint32_t ctrl)
{
    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                 pcmp_ilen(s, ctrl),
                                 pcmp_ilen(d, ctrl));

    if (res) {
        env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
    } else {
        env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
    }
}

void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                    uint32_t ctrl)
{
    int i;
    unsigned int res = pcmpxstrx(env, d, s, ctrl,
                                 pcmp_ilen(s, ctrl),
                                 pcmp_ilen(d, ctrl));

    if ((ctrl >> 6) & 1) {
        if (ctrl & 1) {
            for (i = 0; i < 8; i++, res >>= 1) {
                env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
            }
        } else {
            for (i = 0; i < 16; i++, res >>= 1) {
                env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
            }
        }
    } else {
        env->xmm_regs[0].Q(1) = 0;
        env->xmm_regs[0].Q(0) = res;
    }
}
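/*
 * The SSE4.2 CRC32 instruction uses the Castagnoli polynomial.  The
 * helper below is a bit-serial sketch of that computation: "len" is the
 * width of the message chunk in bits, the mask keeps only those low bits
 * of "msg", and each iteration shifts the accumulator right by one and
 * XORs in the bit-reflected polynomial (CRCPOLY_BITREV) whenever the
 * dropped bit was set.
 */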
#define CRCPOLY 0x1edc6f41
#define CRCPOLY_BITREV 0x82f63b78
target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
{
    target_ulong crc = (msg & ((target_ulong) -1 >>
                               (TARGET_LONG_BITS - len))) ^ crc1;

    while (len--) {
        crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
    }

    return crc;
}

#endif
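/*
 * Carry-less multiplication, as a quick reminder, is polynomial
 * multiplication over GF(2): partial products are combined with XOR
 * instead of addition, so no carries propagate.  For example
 * clmul(0b101, 0b11) = 0b1111, since 0b101 ^ (0b101 << 1) = 0b1111.
 * clmulq() below implements this with the classic shift-and-XOR loop,
 * keeping the 128-bit product in a high/low pair of 64-bit halves.
 */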
#if SHIFT == 1
static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
                   uint64_t a, uint64_t b)
{
    uint64_t al, ah, resh, resl;

    ah = 0;
    al = a;
    resh = resl = 0;

    while (b) {
        if (b & 1) {
            resl ^= al;
            resh ^= ah;
        }
        ah = (ah << 1) | (al >> 63);
        al <<= 1;
        b >>= 1;
    }

    *dest_l = resl;
    *dest_h = resh;
}
#endif

void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                    uint32_t ctrl)
{
    uint64_t a, b;
    int i;

    for (i = 0; i < 1 << SHIFT; i += 2) {
        a = v->Q(((ctrl & 1) != 0) + i);
        b = s->Q(((ctrl & 16) != 0) + i);
        clmulq(&d->Q(i), &d->Q(i + 1), a, b);
    }
}

void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    Reg st = *v;
    Reg rk = *s;

    for (i = 0 ; i < 2 << SHIFT ; i++) {
        int j = i & 3;
        d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
                                    AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^
                                    AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^
                                    AES_Td3[st.B(AES_ishifts[4 * j + 3])]);
    }
}

void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    Reg st = *v;
    Reg rk = *s;

    for (i = 0; i < 8 << SHIFT; i++) {
        d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
    }
}

void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    Reg st = *v;
    Reg rk = *s;

    for (i = 0 ; i < 2 << SHIFT ; i++) {
        int j = i & 3;
        d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
                                    AES_Te1[st.B(AES_shifts[4 * j + 1])] ^
                                    AES_Te2[st.B(AES_shifts[4 * j + 2])] ^
                                    AES_Te3[st.B(AES_shifts[4 * j + 3])]);
    }
}

void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;
    Reg st = *v;
    Reg rk = *s;

    for (i = 0; i < 8 << SHIFT; i++) {
        d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
    }
}

#if SHIFT == 1
void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int i;
    Reg tmp = *s;

    for (i = 0 ; i < 4 ; i++) {
        d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
                          AES_imc[tmp.B(4 * i + 1)][1] ^
                          AES_imc[tmp.B(4 * i + 2)][2] ^
                          AES_imc[tmp.B(4 * i + 3)][3]);
    }
}

void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                          uint32_t ctrl)
{
    int i;
    Reg tmp = *s;

    for (i = 0 ; i < 4 ; i++) {
        d->B(i) = AES_sbox[tmp.B(i + 4)];
        d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
    }
    d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
    d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
}
#endif
#endif

#if SHIFT >= 1
void glue(helper_vpermilpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    uint64_t r0, r1;
    int i;

    for (i = 0; i < 1 << SHIFT; i += 2) {
        r0 = v->Q(i + ((s->Q(i) >> 1) & 1));
        r1 = v->Q(i + ((s->Q(i+1) >> 1) & 1));
        d->Q(i) = r0;
        d->Q(i+1) = r1;
    }
}

void glue(helper_vpermilps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    uint32_t r0, r1, r2, r3;
    int i;

    for (i = 0; i < 2 << SHIFT; i += 4) {
        r0 = v->L(i + (s->L(i) & 3));
        r1 = v->L(i + (s->L(i+1) & 3));
        r2 = v->L(i + (s->L(i+2) & 3));
        r3 = v->L(i + (s->L(i+3) & 3));
        d->L(i) = r0;
        d->L(i+1) = r1;
        d->L(i+2) = r2;
        d->L(i+3) = r3;
    }
}

void glue(helper_vpermilpd_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
{
    uint64_t r0, r1;
    int i;

    for (i = 0; i < 1 << SHIFT; i += 2) {
        r0 = s->Q(i + ((order >> 0) & 1));
        r1 = s->Q(i + ((order >> 1) & 1));
        d->Q(i) = r0;
        d->Q(i+1) = r1;

        order >>= 2;
    }
}

void glue(helper_vpermilps_imm, SUFFIX)(Reg *d, Reg *s, uint32_t order)
{
    uint32_t r0, r1, r2, r3;
    int i;

    for (i = 0; i < 2 << SHIFT; i += 4) {
        r0 = s->L(i + ((order >> 0) & 3));
        r1 = s->L(i + ((order >> 2) & 3));
        r2 = s->L(i + ((order >> 4) & 3));
        r3 = s->L(i + ((order >> 6) & 3));
        d->L(i) = r0;
        d->L(i+1) = r1;
        d->L(i+2) = r2;
        d->L(i+3) = r3;
    }
}

#if SHIFT == 1
#define FPSRLVD(x, c) (c < 32 ? ((x) >> c) : 0)
#define FPSRLVQ(x, c) (c < 64 ? ((x) >> c) : 0)
#define FPSRAVD(x, c) ((int32_t)(x) >> (c < 32 ? c : 31))
#define FPSRAVQ(x, c) ((int64_t)(x) >> (c < 64 ? c : 63))
#define FPSLLVD(x, c) (c < 32 ? ((x) << c) : 0)
#define FPSLLVQ(x, c) (c < 64 ? ((x) << c) : 0)
#endif

SSE_HELPER_L(helper_vpsrlvd, FPSRLVD)
SSE_HELPER_L(helper_vpsravd, FPSRAVD)
SSE_HELPER_L(helper_vpsllvd, FPSLLVD)

SSE_HELPER_Q(helper_vpsrlvq, FPSRLVQ)
SSE_HELPER_Q(helper_vpsravq, FPSRAVQ)
SSE_HELPER_Q(helper_vpsllvq, FPSLLVQ)

void glue(helper_vtestps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    uint32_t zf = 0, cf = 0;
    int i;

    for (i = 0; i < 2 << SHIFT; i++) {
        zf |= (s->L(i) & d->L(i));
        cf |= (s->L(i) & ~d->L(i));
    }
    CC_SRC = ((zf >> 31) ? 0 : CC_Z) | ((cf >> 31) ? 0 : CC_C);
}

void glue(helper_vtestpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    uint64_t zf = 0, cf = 0;
    int i;

    for (i = 0; i < 1 << SHIFT; i++) {
        zf |= (s->Q(i) & d->Q(i));
        cf |= (s->Q(i) & ~d->Q(i));
    }
    CC_SRC = ((zf >> 63) ? 0 : CC_Z) | ((cf >> 63) ? 0 : CC_C);
}

void glue(helper_vpmaskmovd_st, SUFFIX)(CPUX86State *env,
                                        Reg *v, Reg *s, target_ulong a0)
{
    int i;

    for (i = 0; i < (2 << SHIFT); i++) {
        if (v->L(i) >> 31) {
            cpu_stl_data_ra(env, a0 + i * 4, s->L(i), GETPC());
        }
    }
}

void glue(helper_vpmaskmovq_st, SUFFIX)(CPUX86State *env,
                                        Reg *v, Reg *s, target_ulong a0)
{
    int i;

    for (i = 0; i < (1 << SHIFT); i++) {
        if (v->Q(i) >> 63) {
            cpu_stq_data_ra(env, a0 + i * 8, s->Q(i), GETPC());
        }
    }
}

void glue(helper_vpmaskmovd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < (2 << SHIFT); i++) {
        d->L(i) = (v->L(i) >> 31) ? s->L(i) : 0;
    }
}

void glue(helper_vpmaskmovq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int i;

    for (i = 0; i < (1 << SHIFT); i++) {
        d->Q(i) = (v->Q(i) >> 63) ? s->Q(i) : 0;
    }
}
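/*
 * The VPGATHER helpers below clear each mask element after handling the
 * corresponding load.  This mirrors the architectural behaviour: the mask
 * register is consumed element by element, so if a load faults part-way
 * through, the instruction can be restarted and only the remaining
 * (still-masked) elements are gathered; elements whose mask bit is clear
 * keep whatever is already in the destination.  helper_vpgatherqd
 * additionally zeroes the upper half of the destination and mask, since
 * 64-bit indices produce only half a register's worth of 32-bit results.
 */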
void glue(helper_vpgatherdd, SUFFIX)(CPUX86State *env,
        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
{
    int i;
    for (i = 0; i < (2 << SHIFT); i++) {
        if (v->L(i) >> 31) {
            target_ulong addr = a0
                + ((target_ulong)(int32_t)s->L(i) << scale);
            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
        }
        v->L(i) = 0;
    }
}

void glue(helper_vpgatherdq, SUFFIX)(CPUX86State *env,
        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
{
    int i;
    for (i = 0; i < (1 << SHIFT); i++) {
        if (v->Q(i) >> 63) {
            target_ulong addr = a0
                + ((target_ulong)(int32_t)s->L(i) << scale);
            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
        }
        v->Q(i) = 0;
    }
}

void glue(helper_vpgatherqd, SUFFIX)(CPUX86State *env,
        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
{
    int i;
    for (i = 0; i < (1 << SHIFT); i++) {
        if (v->L(i) >> 31) {
            target_ulong addr = a0
                + ((target_ulong)(int64_t)s->Q(i) << scale);
            d->L(i) = cpu_ldl_data_ra(env, addr, GETPC());
        }
        v->L(i) = 0;
    }
    for (i /= 2; i < 1 << SHIFT; i++) {
        d->Q(i) = 0;
        v->Q(i) = 0;
    }
}

void glue(helper_vpgatherqq, SUFFIX)(CPUX86State *env,
        Reg *d, Reg *v, Reg *s, target_ulong a0, unsigned scale)
{
    int i;
    for (i = 0; i < (1 << SHIFT); i++) {
        if (v->Q(i) >> 63) {
            target_ulong addr = a0
                + ((target_ulong)(int64_t)s->Q(i) << scale);
            d->Q(i) = cpu_ldq_data_ra(env, addr, GETPC());
        }
        v->Q(i) = 0;
    }
}
#endif

#if SHIFT >= 2
void helper_vpermdq_ymm(Reg *d, Reg *v, Reg *s, uint32_t order)
{
    uint64_t r0, r1, r2, r3;

    switch (order & 3) {
    case 0:
        r0 = v->Q(0);
        r1 = v->Q(1);
        break;
    case 1:
        r0 = v->Q(2);
        r1 = v->Q(3);
        break;
    case 2:
        r0 = s->Q(0);
        r1 = s->Q(1);
        break;
    case 3:
        r0 = s->Q(2);
        r1 = s->Q(3);
        break;
    }
    switch ((order >> 4) & 3) {
    case 0:
        r2 = v->Q(0);
        r3 = v->Q(1);
        break;
    case 1:
        r2 = v->Q(2);
        r3 = v->Q(3);
        break;
    case 2:
        r2 = s->Q(0);
        r3 = s->Q(1);
        break;
    case 3:
        r2 = s->Q(2);
        r3 = s->Q(3);
        break;
    }
    d->Q(0) = r0;
    d->Q(1) = r1;
    d->Q(2) = r2;
    d->Q(3) = r3;
}

void helper_vpermq_ymm(Reg *d, Reg *s, uint32_t order)
{
    uint64_t r0, r1, r2, r3;
    r0 = s->Q(order & 3);
    r1 = s->Q((order >> 2) & 3);
    r2 = s->Q((order >> 4) & 3);
    r3 = s->Q((order >> 6) & 3);
    d->Q(0) = r0;
    d->Q(1) = r1;
    d->Q(2) = r2;
    d->Q(3) = r3;
}

void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s)
{
    uint32_t r[8];
    int i;

    for (i = 0; i < 8; i++) {
        r[i] = s->L(v->L(i) & 7);
    }
    for (i = 0; i < 8; i++) {
        d->L(i) = r[i];
    }
}
#endif

#undef SSE_HELPER_S

#undef LANE_WIDTH
#undef SHIFT
#undef XMM_ONLY
#undef Reg
#undef B
#undef W
#undef L
#undef Q
#undef SUFFIX