ops_sse.h (6f218d6e994bd8b229d6522899b6ac6cd98bdb47) ops_sse.h (6567ffb4f259d9937ff74f21e96cdac905440620)
1/*
2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 *
4 * Copyright (c) 2005 Fabrice Bellard
5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public

--- 8 unchanged lines hidden (view full) ---

17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include "crypto/aes.h"
22
23#if SHIFT == 0
24#define Reg MMXReg
1/*
2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
3 *
4 * Copyright (c) 2005 Fabrice Bellard
5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public

--- 8 unchanged lines hidden (view full) ---

17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include "crypto/aes.h"
22
23#if SHIFT == 0
24#define Reg MMXReg
25#define SIZE 8
26#define XMM_ONLY(...)
27#define B(n) MMX_B(n)
28#define W(n) MMX_W(n)
29#define L(n) MMX_L(n)
30#define Q(n) MMX_Q(n)
31#define SUFFIX _mmx
32#else
33#define Reg ZMMReg
25#define XMM_ONLY(...)
26#define B(n) MMX_B(n)
27#define W(n) MMX_W(n)
28#define L(n) MMX_L(n)
29#define Q(n) MMX_Q(n)
30#define SUFFIX _mmx
31#else
32#define Reg ZMMReg
34#define SIZE 16
35#define XMM_ONLY(...) __VA_ARGS__
36#define B(n) ZMM_B(n)
37#define W(n) ZMM_W(n)
38#define L(n) ZMM_L(n)
39#define Q(n) ZMM_Q(n)
40#define SUFFIX _xmm
41#endif
42
43#define LANE_WIDTH (SHIFT ? 16 : 8)
44#define PACK_WIDTH (LANE_WIDTH / 2)
45
33#define XMM_ONLY(...) __VA_ARGS__
34#define B(n) ZMM_B(n)
35#define W(n) ZMM_W(n)
36#define L(n) ZMM_L(n)
37#define Q(n) ZMM_Q(n)
38#define SUFFIX _xmm
39#endif
40
41#define LANE_WIDTH (SHIFT ? 16 : 8)
42#define PACK_WIDTH (LANE_WIDTH / 2)
43
46/*
47 * Copy the relevant parts of a Reg value around. In the case where
48 * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of
49 * a 64 byte ZMMReg, so we must copy only those and keep the top bytes
50 * untouched in the guest-visible destination register.
51 * Note that the "lower bytes" are placed last in memory on big-endian
52 * hosts, which store the vector backwards in memory. In that case the
53 * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of
54 * the little-endian case.
55 */
56#if HOST_BIG_ENDIAN
/* Big-endian host: memcpy() is handed the addresses of byte element B(SIZE - 1) of d and r; SIZE bytes are copied. */
57#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE)
58#else
/* Otherwise: memcpy() is handed the addresses of byte element B(0) of d and r; SIZE bytes are copied. */
59#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
60#endif
61
/*
 * Shift expression macros, defined only when SHIFT == 0.
 *
 * NOTE: the second macro parameter (c) does not appear in any of the
 * expansions. Each expansion instead refers to an identifier named
 * "shift", which therefore has to be in scope wherever these macros
 * are expanded.
 *
 * FPSRAW and FPSRAL cast x to int16_t / int32_t respectively before the
 * right shift; FPSRL and FPSLL shift x without a cast.
 */
62#if SHIFT == 0
63#define FPSRL(x, c) ((x) >> shift)
64#define FPSRAW(x, c) ((int16_t)(x) >> shift)
65#define FPSRAL(x, c) ((int32_t)(x) >> shift)
66#define FPSLL(x, c) ((x) << shift)
67#endif
68
69void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)

--- 870 unchanged lines hidden (view full) ---

940 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
941}
942
/*
 * Overwrite quadword element 0 of *d (ZMM_Q(0)) with the value returned
 * by helper_insertq(), which is passed the current ZMM_Q(0) of *d together
 * with the caller-supplied index and length.
 *
 * index and length arrive here as plain int arguments. env is not
 * referenced in this body.
 */
943void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
944{
945 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
946}
947
/*
 * Shift expression macros, defined only when SHIFT == 0.
 *
 * NOTE: the second macro parameter (c) does not appear in any of the
 * expansions. Each expansion instead refers to an identifier named
 * "shift", which therefore has to be in scope wherever these macros
 * are expanded.
 *
 * FPSRAW and FPSRAL cast x to int16_t / int32_t respectively before the
 * right shift; FPSRL and FPSLL shift x without a cast.
 */
44#if SHIFT == 0
45#define FPSRL(x, c) ((x) >> shift)
46#define FPSRAW(x, c) ((int16_t)(x) >> shift)
47#define FPSRAL(x, c) ((int32_t)(x) >> shift)
48#define FPSLL(x, c) ((x) << shift)
49#endif
50
51void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)

--- 870 unchanged lines hidden (view full) ---

922 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
923}
924
/*
 * Overwrite quadword element 0 of *d (ZMM_Q(0)) with the value returned
 * by helper_insertq(), which is passed the current ZMM_Q(0) of *d together
 * with the caller-supplied index and length.
 *
 * index and length arrive here as plain int arguments. env is not
 * referenced in this body.
 */
925void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
926{
927 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
928}
929
948void glue(helper_haddps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
949{
950 ZMMReg r;
951
952 r.ZMM_S(0) = float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
953 r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
954 r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
955 r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
956 MOVE(*d, r);
930#define SSE_HELPER_HPS(name, F) \
931void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
932{ \
933 Reg *v = d; \
934 float32 r[2 << SHIFT]; \
935 int i, j, k; \
936 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
937 for (i = j = 0; j < 4; i++, j += 2) { \
938 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
939 } \
940 for (j = 0; j < 4; i++, j += 2) { \
941 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
942 } \
943 } \
944 for (i = 0; i < 2 << SHIFT; i++) { \
945 d->ZMM_S(i) = r[i]; \
946 } \
957}
958
947}
948
959void glue(helper_haddpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
960{
961 ZMMReg r;
949SSE_HELPER_HPS(haddps, float32_add)
950SSE_HELPER_HPS(hsubps, float32_sub)
962
951
963 r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
964 r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
965 MOVE(*d, r);
952#define SSE_HELPER_HPD(name, F) \
953void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
954{ \
955 Reg *v = d; \
956 float64 r[1 << SHIFT]; \
957 int i, j, k; \
958 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
959 for (i = j = 0; j < 2; i++, j += 2) { \
960 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
961 } \
962 for (j = 0; j < 2; i++, j += 2) { \
963 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
964 } \
965 } \
966 for (i = 0; i < 1 << SHIFT; i++) { \
967 d->ZMM_D(i) = r[i]; \
968 } \
966}
967
969}
970
968void glue(helper_hsubps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
969{
970 ZMMReg r;
971SSE_HELPER_HPD(haddpd, float64_add)
972SSE_HELPER_HPD(hsubpd, float64_sub)
971
973
972 r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status);
973 r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status);
974 r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status);
975 r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status);
976 MOVE(*d, r);
977}
978
979void glue(helper_hsubpd, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
980{
981 ZMMReg r;
982
983 r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status);
984 r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status);
985 MOVE(*d, r);
986}
987
988void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
989{
990 Reg *v = d;
991 int i;
992 for (i = 0; i < 2 << SHIFT; i += 2) {
993 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
994 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
995 }

--- 1330 unchanged lines hidden (view full) ---

2326#undef SHIFT
2327#undef XMM_ONLY
2328#undef Reg
2329#undef B
2330#undef W
2331#undef L
2332#undef Q
2333#undef SUFFIX
974void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
975{
976 Reg *v = d;
977 int i;
978 for (i = 0; i < 2 << SHIFT; i += 2) {
979 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
980 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
981 }

--- 1330 unchanged lines hidden (view full) ---

2312#undef SHIFT
2313#undef XMM_ONLY
2314#undef Reg
2315#undef B
2316#undef W
2317#undef L
2318#undef Q
2319#undef SUFFIX
2334#undef SIZE