1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
20 * USA
21 *
22 */
23
24.file "twofish-avx-x86_64-asm_64.S"
25.text
26
/*
 * Layout of the crypto context, as byte offsets from CTX.
 *
 * s0..s3 are the bases of four lookup tables that are indexed by one
 * byte and read as 32-bit entries; w is read as two 16-byte vectors
 * (at w and w+16); k is read as an array of 32-bit round subkeys.
 */
#define s0	0x0000
#define s1	0x0400
#define s2	0x0800
#define s3	0x0c00
#define w	0x1000
#define k	0x1020
34
35/**********************************************************************
36  8-way AVX twofish
37 **********************************************************************/
/* Crypto context pointer; arrives in %rdi as the first argument. */
#define CTX	%rdi

/*
 * Cipher state.  The suffixes 1 and 2 name two independent register
 * groups; the round macros pick a group by pasting the suffix onto
 * RA/RB/RC/RD.
 */
#define RA1	%xmm0
#define RB1	%xmm1
#define RC1	%xmm2
#define RD1	%xmm3

#define RA2	%xmm4
#define RB2	%xmm5
#define RC2	%xmm6
#define RD2	%xmm7

/* Vector scratch handed to the round and transpose macros. */
#define RX	%xmm8
#define RY	%xmm9

/*
 * Key material: the round macros broadcast the two subkeys of a round
 * into RK1/RK2.  Outside the rounds RK1 carries the 16-byte key that is
 * XORed into the blocks and RK2 serves as a transpose temporary.
 */
#define RK1	%xmm10
#define RK2	%xmm11

/*
 * Table index registers.  Only their low byte is written by
 * lookup_32bit, so they must be zeroed before the first lookup.
 * RID2 is %rbx, which is callee-saved in the SysV ABI.
 */
#define RID1	%rax
#define RID1b	%al
#define RID2	%rbx
#define RID2b	%bl

/*
 * The two 64-bit halves of the vector fed to G().  Both the low (bl)
 * and second (bh) byte must be addressable.  Note that these are the
 * registers carrying the third and fourth function arguments.
 */
#define RGI1	%rdx
#define RGI1bl	%dl
#define RGI1bh	%dh
#define RGI2	%rcx
#define RGI2bl	%cl
#define RGI2bh	%ch

/* 32-bit lookup results, later packed pairwise into 64-bit values. */
#define RGS1	%r8
#define RGS1d	%r8d
#define RGS2	%r9
#define RGS2d	%r9d
#define RGS3	%r10
#define RGS3d	%r10d
74
75
/*
 * lookup_32bit - XOR together four table entries selected by the four
 * low bytes of a general-purpose register.
 *
 *	tbl0..tbl3: context offsets of the tables indexed by byte 0..3
 *	gpr:        64-bit register alias that has "bl" and "bh" byte
 *	            variants (RGI1 or RGI2)
 *	acc:        register alias that has a "d" 32-bit variant; receives
 *	            tbl0[byte0] ^ tbl1[byte1] ^ tbl2[byte2] ^ tbl3[byte3]
 *
 * Side effects: gpr is left shifted right by 16 bits.  Only the low
 * bytes of RID1/RID2 are written, so their upper bits must already be
 * zero for the scaled index to be correct.
 */
#define lookup_32bit(tbl0, tbl1, tbl2, tbl3, gpr, acc) \
	/* bytes 0 and 1 select from the first two tables */ \
	movb		gpr ## bl,          RID1b;     \
	movb		gpr ## bh,          RID2b;     \
	movl		tbl0(CTX, RID1, 4), acc ## d;  \
	xorl		tbl1(CTX, RID2, 4), acc ## d;  \
	/* move bytes 2 and 3 down into the byte registers */ \
	shrq $16,	gpr;                           \
	movb		gpr ## bl,          RID1b;     \
	movb		gpr ## bh,          RID2b;     \
	xorl		tbl2(CTX, RID1, 4), acc ## d;  \
	xorl		tbl3(CTX, RID2, 4), acc ## d;
86
/*
 * G - apply the four-table lookup to each of the four 32-bit lanes of
 * a vector register.
 *
 *	vin:        input vector (left unmodified)
 *	vout:       output vector; also used as a temporary, so it must be
 *	            a different register than vin
 *	tbl0..tbl3: context offsets of the tables for byte 0..3 of a lane
 *
 * Clobbers RGI1, RGI2, RGS1, RGS2, RGS3 and the low bytes of RID1/RID2.
 */
#define G(vin, vout, tbl0, tbl1, tbl2, tbl3) \
	/* split the vector: lanes 0-1 to RGI1, lanes 2-3 to RGI2 */ \
	vmovq		vin,  RGI1;                       \
	vpsrldq $8,	vin,  vout;                       \
	vmovq		vout, RGI2;                       \
	\
	/* lanes 0 and 1; each lookup already shifts its source by 16, */ \
	/* so one more 16-bit shift exposes the following lane */ \
	lookup_32bit(tbl0, tbl1, tbl2, tbl3, RGI1, RGS1); \
	shrq $16,	RGI1;                             \
	lookup_32bit(tbl0, tbl1, tbl2, tbl3, RGI1, RGS2); \
	shlq $32,	RGS2;                             \
	orq		RGS1, RGS2;                       \
	\
	/* lanes 2 and 3, packed the same way into RGS3 */ \
	lookup_32bit(tbl0, tbl1, tbl2, tbl3, RGI2, RGS1); \
	shrq $16,	RGI2;                             \
	lookup_32bit(tbl0, tbl1, tbl2, tbl3, RGI2, RGS3); \
	shlq $32,	RGS3;                             \
	orq		RGS1, RGS3;                       \
	\
	/* reassemble the four results into one vector */ \
	vmovq		RGS2, vout;                       \
	vpinsrq $1,	RGS3, vout, vout;
106
/*
 * encround - one encryption round on four blocks held lane-wise.
 *
 *	sa, sb: state words that feed the table lookups (unmodified)
 *	sc, sd: state words that are updated
 *	f0, f1: vector temporaries
 *
 * Expects the two round subkeys broadcast in RK1 and RK2.
 */
#define encround(sa, sb, sc, sd, f0, f1) \
	G(sa, f0, s0, s1, s2, s3);           \
	G(sb, f1, s1, s2, s3, s0);           \
	/* f0 = G(sa) + G(sb) + RK1, f1 = G(sa) + 2*G(sb) + RK2 */ \
	vpaddd			f0, f1,  f0; \
	vpaddd			f1, f0,  f1; \
	vpaddd			f0, RK1, f0; \
	vpaddd			f1, RK2, f1; \
	/* sc = rotate_right(sc ^ f0, 1); f0 is reused as a temporary */ \
	vpxor			f0, sc,  sc; \
	vpsrld $1,		sc, f0;      \
	vpslld $(32 - 1),	sc, sc;      \
	vpor			sc, f0,  sc; \
	/* sd = rotate_left(sd, 1) ^ f1 */ \
	vpslld $1,		sd, f0;      \
	vpsrld $(32 - 1),	sd, sd;      \
	vpor			sd, f0,  sd; \
	vpxor			sd, f1,  sd;
122
/*
 * decround - one decryption round on four blocks held lane-wise.
 *
 *	sa, sb: state words that feed the table lookups (unmodified)
 *	sc, sd: state words that are updated
 *	f0, f1: vector temporaries
 *
 * Expects the two round subkeys broadcast in RK1 and RK2.
 */
#define decround(sa, sb, sc, sd, f0, f1) \
	G(sa, f0, s0, s1, s2, s3);           \
	G(sb, f1, s1, s2, s3, s0);           \
	/* f0 = G(sa) + G(sb), f1 = G(sa) + 2*G(sb) + RK2 */ \
	vpaddd			f0, f1,  f0; \
	vpaddd			f1, f0,  f1; \
	vpaddd			f1, RK2, f1; \
	/* sd = rotate_right(sd ^ f1, 1); f1 is reused as a temporary */ \
	vpxor			sd, f1,  sd; \
	vpsrld $1,		sd, f1;      \
	vpslld $(32 - 1),	sd, sd;      \
	vpor			sd, f1,  sd; \
	/* sc = rotate_left(sc, 1) ^ (f0 + RK1) */ \
	vpslld $1,		sc, f1;      \
	vpsrld $(32 - 1),	sc, sc;      \
	vpor			sc, f1,  sc; \
	vpaddd			f0, RK1, f0; \
	vpxor			f0, sc,  sc;
138
/*
 * encrypt_round - encryption round number rnd on both register groups.
 *
 *	rnd:            round index; subkeys 2*rnd and 2*rnd+1 of the k
 *	                array are broadcast into RK1 and RK2
 *	wa, wb, wc, wd: register name stems (RA, RB, RC, RD in some
 *	                order); the group suffix 1 or 2 is pasted on
 */
#define encrypt_round(rnd, wa, wb, wc, wd) \
	vbroadcastss (k+4*(2*(rnd)))(CTX),   RK1;             \
	vbroadcastss (k+4*(2*(rnd)+1))(CTX), RK2;             \
	encround(wa ## 1, wb ## 1, wc ## 1, wd ## 1, RX, RY); \
	encround(wa ## 2, wb ## 2, wc ## 2, wd ## 2, RX, RY);
144
/*
 * decrypt_round - decryption round number rnd on both register groups.
 *
 *	rnd:            round index; subkeys 2*rnd and 2*rnd+1 of the k
 *	                array are broadcast into RK1 and RK2
 *	wa, wb, wc, wd: register name stems (RA, RB, RC, RD in some
 *	                order); the group suffix 1 or 2 is pasted on
 */
#define decrypt_round(rnd, wa, wb, wc, wd) \
	vbroadcastss (k+4*(2*(rnd)))(CTX),   RK1;             \
	vbroadcastss (k+4*(2*(rnd)+1))(CTX), RK2;             \
	decround(wa ## 1, wb ## 1, wc ## 1, wd ## 1, RX, RY); \
	decround(wa ## 2, wb ## 2, wc ## 2, wd ## 2, RX, RY);
150
/*
 * encrypt_cycle - two consecutive encryption rounds, 2*n and 2*n+1.
 *
 * The second round runs with the register pairs exchanged (RC/RD feed
 * the lookups, RA/RB are updated), so after a full cycle the registers
 * are back in their original roles.
 *
 * n is parenthesized in the expansion so that an expression argument
 * (for example i+1) is doubled as a whole.
 */
#define encrypt_cycle(n) \
	encrypt_round((2*(n)), RA, RB, RC, RD);       \
	encrypt_round(((2*(n)) + 1), RC, RD, RA, RB);
154
/*
 * decrypt_cycle - two consecutive decryption rounds, 2*n+1 then 2*n.
 *
 * The rounds of a cycle are executed in the reverse order of
 * encrypt_cycle, with the same register assignment per round number.
 *
 * n is parenthesized in the expansion so that an expression argument
 * (for example i+1) is doubled as a whole.
 */
#define decrypt_cycle(n) \
	decrypt_round(((2*(n)) + 1), RC, RD, RA, RB); \
	decrypt_round((2*(n)), RA, RB, RC, RD);
158
159
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit words held in four
 * vector registers, in place.
 *
 *	r0..r3: matrix rows on entry, matrix columns on exit
 *	lo01:   temporary, low  halves of r0/r1 interleaved
 *	lo23:   temporary, low  halves of r2/r3 interleaved
 *	hi01:   temporary, high halves of r0/r1 interleaved
 *
 * The high halves of r2/r3 are interleaved directly into r3.
 */
#define transpose_4x4(r0, r1, r2, r3, lo01, lo23, hi01) \
	/* stage 1: interleave 32-bit words of row pairs */ \
	vpunpckldq		r1, r0, lo01; \
	vpunpckhdq		r1, r0, hi01; \
	vpunpckldq		r3, r2, lo23; \
	vpunpckhdq		r3, r2, r3;   \
	\
	/* stage 2: combine 64-bit halves into the final columns */ \
	vpunpcklqdq		lo23, lo01, r0; \
	vpunpckhqdq		lo23, lo01, r1; \
	vpunpcklqdq		r3,   hi01, r2; \
	vpunpckhqdq		r3,   hi01, r3;
170
/*
 * inpack_blocks - load four 16-byte blocks, XOR each with a key vector
 * and transpose them so that every register holds the same 32-bit word
 * of all four blocks.
 *
 *	blk:        register pointing at the four consecutive blocks
 *	r0..r3:     destination vector registers
 *	keyvec:     16-byte key vector XORed into every block
 *	t0, t1, t2: vector temporaries for the transpose
 */
#define inpack_blocks(blk, r0, r1, r2, r3, keyvec, t0, t1, t2) \
	vpxor (0*4*4)(blk),	keyvec, r0; \
	vpxor (1*4*4)(blk),	keyvec, r1; \
	vpxor (2*4*4)(blk),	keyvec, r2; \
	vpxor (3*4*4)(blk),	keyvec, r3; \
	\
	transpose_4x4(r0, r1, r2, r3, t0, t1, t2)
178
/*
 * outunpack_blocks - transpose four word-sliced registers back into
 * four 16-byte blocks, XOR each with a key vector and store them.
 *
 *	dstp:       register pointing at the 64-byte destination
 *	r0..r3:     vector registers to write out (modified)
 *	keyvec:     16-byte key vector XORed into every block
 *	t0, t1, t2: vector temporaries for the transpose
 *
 * The stores are unaligned-safe (vmovdqu).
 */
#define outunpack_blocks(dstp, r0, r1, r2, r3, keyvec, t0, t1, t2) \
	transpose_4x4(r0, r1, r2, r3, t0, t1, t2) \
	\
	vpxor		r0, keyvec, r0;    \
	vmovdqu		r0, (0*4*4)(dstp); \
	vpxor		r1, keyvec, r1;    \
	vmovdqu		r1, (1*4*4)(dstp); \
	vpxor		r2, keyvec, r2;    \
	vmovdqu		r2, (2*4*4)(dstp); \
	vpxor		r3, keyvec, r3;    \
	vmovdqu		r3, (3*4*4)(dstp);
190
/*
 * outunpack_xor_blocks - like outunpack_blocks, but each resulting
 * block is additionally XORed with the 16 bytes already present at its
 * destination before it is stored there.
 *
 *	dstp:       register pointing at the 64-byte destination, which is
 *	            both read and written
 *	r0..r3:     vector registers to write out (modified)
 *	keyvec:     16-byte key vector XORed into every block
 *	t0, t1, t2: vector temporaries for the transpose
 */
#define outunpack_xor_blocks(dstp, r0, r1, r2, r3, keyvec, t0, t1, t2) \
	transpose_4x4(r0, r1, r2, r3, t0, t1, t2) \
	\
	vpxor		r0, keyvec, r0;        \
	vpxor		(0*4*4)(dstp), r0, r0; \
	vmovdqu		r0, (0*4*4)(dstp);     \
	vpxor		r1, keyvec, r1;        \
	vpxor		(1*4*4)(dstp), r1, r1; \
	vmovdqu		r1, (1*4*4)(dstp);     \
	vpxor		r2, keyvec, r2;        \
	vpxor		(2*4*4)(dstp), r2, r2; \
	vmovdqu		r2, (2*4*4)(dstp);     \
	vpxor		r3, keyvec, r3;        \
	vpxor		(3*4*4)(dstp), r3, r3; \
	vmovdqu		r3, (3*4*4)(dstp);
206
.align 8
.global __twofish_enc_blk_8way
.type   __twofish_enc_blk_8way,@function;

/*
 * __twofish_enc_blk_8way - encrypt eight 16-byte blocks in parallel.
 *
 * The blocks are processed as two groups of four (register sets 1 and
 * 2); the second group starts 64 bytes after the first in both src and
 * dst.  Clobbers %rax, %rcx, %rdx, %r8-%r10, %xmm0-%xmm11 and flags;
 * %rbx is preserved.
 */
__twofish_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	/* %rbx (RID2) is callee-saved; %rcx (the xor flag) doubles as
	 * RGI2 and is overwritten by the rounds, so keep a copy. */
	pushq %rbx;
	pushq %rcx;

	/* key XORed into every input block */
	vmovdqu w(CTX), RK1;

	/* %rax is free to address the second group until RID1 is set up */
	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);

	/* lookup_32bit writes only the low byte of the index registers */
	xorq RID1, RID1;
	xorq RID2, RID2;

	/* 8 cycles of 2 rounds each */
	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle(7);

	/* key XORed into every output block */
	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz .L__enc_xor8;

	/* plain store; note the output word order is C, D, A, B */
	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	ret;

.L__enc_xor8:
	/* xor flag set: combine the result with the current dst contents */
	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	ret;

.size __twofish_enc_blk_8way,.-__twofish_enc_blk_8way
260
.align 8
.global twofish_dec_blk_8way
.type   twofish_dec_blk_8way,@function;

/*
 * twofish_dec_blk_8way - decrypt eight 16-byte blocks in parallel.
 *
 * The blocks are processed as two groups of four (register sets 1 and
 * 2); the second group starts 64 bytes after the first in both src and
 * dst.  Clobbers %rax, %rcx, %rdx, %r8-%r10, %xmm0-%xmm11 and flags;
 * %rbx is preserved.
 */
twofish_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	/* %rbx (RID2) is callee-saved */
	pushq %rbx;

	/* key XORed into every input block (the one the encryption
	 * routine applies on output) */
	vmovdqu (w+4*4)(CTX), RK1;

	/* input word order is C, D, A, B, mirroring the encryption output;
	 * %rax is free to address the second group until RID1 is set up */
	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);

	/* lookup_32bit writes only the low byte of the index registers */
	xorq RID1, RID1;
	xorq RID2, RID2;

	/* the 8 cycles in reverse order */
	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle(0);

	/* key XORed into every output block */
	vmovdqu (w)(CTX), RK1;

	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;
	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);

	ret;

.size twofish_dec_blk_8way,.-twofish_dec_blk_8way
301