/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Shared glue code for 128-bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

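/*
 * All macros below work on sixteen 128-bit blocks at a time, held in eight
 * 256-bit ymm registers (two blocks per register).  A cipher's AVX2
 * implementation would use them roughly like this (illustrative sketch
 * only; the register names are examples, not fixed by these macros):
 *
 *	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 *	... run the cipher rounds on the loaded registers ...
 *	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 */

/* Load 16 consecutive 128-bit blocks from src into eight ymm registers. */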
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*32)(src), x0; \
	vmovdqu (1*32)(src), x1; \
	vmovdqu (2*32)(src), x2; \
	vmovdqu (3*32)(src), x3; \
	vmovdqu (4*32)(src), x4; \
	vmovdqu (5*32)(src), x5; \
	vmovdqu (6*32)(src), x6; \
	vmovdqu (7*32)(src), x7;

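/* Store 16 consecutive 128-bit blocks from eight ymm registers to dst. */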
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*32)(dst); \
	vmovdqu x1, (1*32)(dst); \
	vmovdqu x2, (2*32)(dst); \
	vmovdqu x3, (3*32)(dst); \
	vmovdqu x4, (4*32)(dst); \
	vmovdqu x5, (5*32)(dst); \
	vmovdqu x6, (6*32)(dst); \
	vmovdqu x7, (7*32)(dst);

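/*
 * CBC decryption output: XOR every 128-bit block except the first with the
 * preceding ciphertext block from src (the first block's IV XOR is left to
 * the caller), then store all 16 blocks to dst.
 */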
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	vpxor t0, t0, t0; \
	vinserti128 $1, (src), t0, t0; \
	vpxor t0, x0, x0; \
	vpxor (0*32+16)(src), x1, x1; \
	vpxor (1*32+16)(src), x2, x2; \
	vpxor (2*32+16)(src), x3, x3; \
	vpxor (3*32+16)(src), x4, x4; \
	vpxor (4*32+16)(src), x5, x5; \
	vpxor (5*32+16)(src), x6, x6; \
	vpxor (6*32+16)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

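/*
 * Increment a 128-bit little-endian counter in an xmm register by one,
 * carrying from the low 64-bit half into the high half.
 */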
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

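/*
 * Add two to each of the two 128-bit little-endian counters in a ymm
 * register, carrying within each 128-bit lane.
 */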
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

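/*
 * CTR setup: load the 128-bit little-endian counter from (iv), generate 16
 * consecutive counter values, byte-swap each into big-endian block form via
 * the bswap constant (results in x0..x7), and store the counter for the
 * following block back to (iv).
 */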
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */ \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), t2x; \
	vmovdqa t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128 bswap, t1; \
	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
	vpshufb t1, t2, x0; \
	\
	/* construct IVs */ \
	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
	vpshufb t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb t1, t2, x7; \
	vextracti128 $1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu t2x, (iv);

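/*
 * CTR output: XOR the 16 encrypted counter blocks (the keystream) in
 * x0..x7 with the blocks at src and store the result to dst.
 */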
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(src), x0, x0; \
	vpxor (1*32)(src), x1, x1; \
	vpxor (2*32)(src), x2, x2; \
	vpxor (3*32)(src), x3, x3; \
	vpxor (4*32)(src), x4, x4; \
	vpxor (5*32)(src), x5, x5; \
	vpxor (6*32)(src), x6, x6; \
	vpxor (7*32)(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

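/*
 * Multiply a 128-bit XTS tweak (block little-endian order) by x in
 * GF(2^128), advancing the tweak by one block; mask supplies the carry and
 * reduction bits.
 */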
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

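/*
 * Multiply each of the two 128-bit XTS tweaks in a ymm register by x^2 in
 * GF(2^128), advancing both tweaks by two blocks; mask1/mask2 supply the
 * carry and reduction bits for the two shift stages.
 */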
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

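/*
 * XTS input: load the current tweak from (iv), derive the 16 tweaks for
 * this chunk, XOR them with the source blocks into x0..x7, stash the tweaks
 * at dst (store_xts_16way XORs them back in after the cipher pass), and
 * write the tweak for the next chunk back to (iv).
 */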
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* load IV and construct second IV */ \
	vmovdqu (iv), tivx; \
	vmovdqa tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
	vinserti128 $1, tivx, t0, tiv; \
	vpxor (0*32)(src), tiv, x0; \
	vmovdqu tiv, (0*32)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (1*32)(src), tiv, x1; \
	vmovdqu tiv, (1*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (2*32)(src), tiv, x2; \
	vmovdqu tiv, (2*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (3*32)(src), tiv, x3; \
	vmovdqu tiv, (3*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (4*32)(src), tiv, x4; \
	vmovdqu tiv, (4*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (5*32)(src), tiv, x5; \
	vmovdqu tiv, (5*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (6*32)(src), tiv, x6; \
	vmovdqu tiv, (6*32)(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor (7*32)(src), tiv, x7; \
	vmovdqu tiv, (7*32)(dst); \
	\
	vextracti128 $1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu tivx, (iv);

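/*
 * XTS output: XOR the cipher output with the tweaks previously stashed at
 * dst by load_xts_16way, then store the final 16 blocks to dst.
 */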
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*32)(dst), x0, x0; \
	vpxor (1*32)(dst), x1, x1; \
	vpxor (2*32)(dst), x2, x2; \
	vpxor (3*32)(dst), x3, x3; \
	vpxor (4*32)(dst), x4, x4; \
	vpxor (5*32)(dst), x5, x5; \
	vpxor (6*32)(dst), x6, x6; \
	vpxor (7*32)(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);