ops_sse.h (6f218d6e994bd8b229d6522899b6ac6cd98bdb47) | ops_sse.h (6567ffb4f259d9937ff74f21e96cdac905440620) |
---|---|
1/* 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support 3 * 4 * Copyright (c) 2005 Fabrice Bellard 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public --- 8 unchanged lines hidden (view full) --- 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21#include "crypto/aes.h" 22 23#if SHIFT == 0 24#define Reg MMXReg | 1/* 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support 3 * 4 * Copyright (c) 2005 Fabrice Bellard 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public --- 8 unchanged lines hidden (view full) --- 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19 */ 20 21#include "crypto/aes.h" 22 23#if SHIFT == 0 24#define Reg MMXReg |
25#define SIZE 8 | |
26#define XMM_ONLY(...) 27#define B(n) MMX_B(n) 28#define W(n) MMX_W(n) 29#define L(n) MMX_L(n) 30#define Q(n) MMX_Q(n) 31#define SUFFIX _mmx 32#else 33#define Reg ZMMReg | 25#define XMM_ONLY(...) 26#define B(n) MMX_B(n) 27#define W(n) MMX_W(n) 28#define L(n) MMX_L(n) 29#define Q(n) MMX_Q(n) 30#define SUFFIX _mmx 31#else 32#define Reg ZMMReg |
34#define SIZE 16 | |
35#define XMM_ONLY(...) __VA_ARGS__ 36#define B(n) ZMM_B(n) 37#define W(n) ZMM_W(n) 38#define L(n) ZMM_L(n) 39#define Q(n) ZMM_Q(n) 40#define SUFFIX _xmm 41#endif 42 43#define LANE_WIDTH (SHIFT ? 16 : 8) 44#define PACK_WIDTH (LANE_WIDTH / 2) 45 | 33#define XMM_ONLY(...) __VA_ARGS__ 34#define B(n) ZMM_B(n) 35#define W(n) ZMM_W(n) 36#define L(n) ZMM_L(n) 37#define Q(n) ZMM_Q(n) 38#define SUFFIX _xmm 39#endif 40 41#define LANE_WIDTH (SHIFT ? 16 : 8) 42#define PACK_WIDTH (LANE_WIDTH / 2) 43 |
46/* 47 * Copy the relevant parts of a Reg value around. In the case where 48 * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of 49 * a 64 byte ZMMReg, so we must copy only those and keep the top bytes 50 * untouched in the guest-visible destination register. 51 * Note that the "lower bytes" are placed last in memory on big-endian 52 * hosts, which store the vector backwards in memory. In that case the 53 * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of 54 * the little-endian case. 55 */ 56#if HOST_BIG_ENDIAN 57#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE) 58#else 59#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) 60#endif 61 | |
62#if SHIFT == 0 63#define FPSRL(x, c) ((x) >> shift) 64#define FPSRAW(x, c) ((int16_t)(x) >> shift) 65#define FPSRAL(x, c) ((int32_t)(x) >> shift) 66#define FPSLL(x, c) ((x) << shift) 67#endif 68 69void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) --- 870 unchanged lines hidden (view full) --- 940 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8)); 941} 942 943void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) 944{ 945 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); 946} 947 | 44#if SHIFT == 0 45#define FPSRL(x, c) ((x) >> shift) 46#define FPSRAW(x, c) ((int16_t)(x) >> shift) 47#define FPSRAL(x, c) ((int32_t)(x) >> shift) 48#define FPSLL(x, c) ((x) << shift) 49#endif 50 51void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) --- 870 unchanged lines hidden (view full) --- 922 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8)); 923} 924 925void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) 926{ 927 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); 928} 929 |
948void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 949{ 950 ZMMReg r; 951 952 r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); 953 r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); 954 r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); 955 r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); 956 MOVE(*d, r); | 930#define SSE_HELPER_HPS(name, F) \ 931void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 932{ \ 933 Reg *v = d; \ 934 float32 r[2 << SHIFT]; \ 935 int i, j, k; \ 936 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ 937 for (i = j = 0; j < 4; i++, j += 2) { \ 938 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ 939 } \ 940 for (j = 0; j < 4; i++, j += 2) { \ 941 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ 942 } \ 943 } \ 944 for (i = 0; i < 2 << SHIFT; i++) { \ 945 d->ZMM_S(i) = r[i]; \ 946 } \ |
957} 958 | 947} 948 |
959void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 960{ 961 ZMMReg r; | 949SSE_HELPER_HPS(haddps, float32_add) 950SSE_HELPER_HPS(hsubps, float32_sub) |
962 | 951 |
963 r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); 964 r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); 965 MOVE(*d, r); | 952#define SSE_HELPER_HPD(name, F) \ 953void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ 954{ \ 955 Reg *v = d; \ 956 float64 r[1 << SHIFT]; \ 957 int i, j, k; \ 958 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ 959 for (i = j = 0; j < 2; i++, j += 2) { \ 960 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ 961 } \ 962 for (j = 0; j < 2; i++, j += 2) { \ 963 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ 964 } \ 965 } \ 966 for (i = 0; i < 1 << SHIFT; i++) { \ 967 d->ZMM_D(i) = r[i]; \ 968 } \ |
966} 967 | 969} 970 |
968void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 969{ 970 ZMMReg r; | 971SSE_HELPER_HPD(haddpd, float64_add) 972SSE_HELPER_HPD(hsubpd, float64_sub) |
971 | 973 |
972 r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); 973 r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); 974 r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); 975 r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); 976 MOVE(*d, r); 977} 978 979void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) 980{ 981 ZMMReg r; 982 983 r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); 984 r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); 985 MOVE(*d, r); 986} 987 | |
988void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 989{ 990 Reg *v = d; 991 int i; 992 for (i = 0; i < 2 << SHIFT; i += 2) { 993 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 994 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 995 } --- 1330 unchanged lines hidden (view full) --- 2326#undef SHIFT 2327#undef XMM_ONLY 2328#undef Reg 2329#undef B 2330#undef W 2331#undef L 2332#undef Q 2333#undef SUFFIX | 974void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) 975{ 976 Reg *v = d; 977 int i; 978 for (i = 0; i < 2 << SHIFT; i += 2) { 979 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); 980 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); 981 } --- 1330 unchanged lines hidden (view full) --- 2312#undef SHIFT 2313#undef XMM_ONLY 2314#undef Reg 2315#undef B 2316#undef W 2317#undef L 2318#undef Q 2319#undef SUFFIX |
2334#undef SIZE | |