1/***************************************************************************
2*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
3*                                                                         *
4*   This program is free software; you can redistribute it and/or modify  *
5*   it under the terms of the GNU General Public License as published by  *
6*   the Free Software Foundation; either version 2 of the License, or     *
7*   (at your option) any later version.                                   *
8*                                                                         *
9*   This program is distributed in the hope that it will be useful,       *
10*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
11*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
12*   GNU General Public License for more details.                          *
13*                                                                         *
14*   You should have received a copy of the GNU General Public License     *
15*   along with this program; if not, write to the                         *
16*   Free Software Foundation, Inc.,                                       *
17*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
18***************************************************************************/
19
20.file "twofish-i586-asm.S"
21.text
22
23#include <linux/linkage.h>
24#include <asm/asm-offsets.h>
25
26/*
 * Stack offsets of the three arguments relative to %esp at function entry;
 * the return address is at offset 0.  Both entry points in this file push
 * four registers before reading the arguments, which is why every use
 * adds 16 to these values.
 */
27
28#define in_blk    12  /* address of the input block (four 32-bit words are read) */
29#define out_blk   8  /* address of the output block (four 32-bit words are written) */
30#define ctx       4  /* address of the Twofish context structure */
31
/*
 * Byte offsets of the four 32-bit words a, b, c, d inside a block.  The
 * same values select the matching whitening key word in the
 * input_whitening/output_whitening macros.
 */
32#define a_offset	0
33#define b_offset	4
34#define c_offset	8
35#define d_offset	12
36
37/*
 * Byte offsets into the crypto context structure that %ebp points to.
 * These are used directly as displacements (e.g. s1(%ebp,%edi,4)), so
 * they must stay plain constants.
 * NOTE(review): the values have to match the C-side layout of the context
 * structure, which is not visible from this file - confirm when changing.
 */
38
39#define s0	0	/* S0 array, 256 32-bit words; the code addresses it as (%ebp,idx,4) */
40#define s1	1024	/* S1 array, 256 32-bit words */
41#define s2	2048	/* S2 array, 256 32-bit words */
42#define s3	3072	/* S3 array, 256 32-bit words */
43#define w	4096	/* 8 whitening key words: w+0..w+12 input, w+16..w+28 output */
44#define k	4128	/* round key words; each round uses k+round and k+4+round */
45
46/*
 * Register aliases that allow macro substitution.
 *
 * The round macros receive only the base name (R0..R3) and paste a suffix
 * onto it with ##:
 *   D - the full 32-bit register
 *   B - its low byte  (bits 0-7)
 *   H - its high byte (bits 8-15)
 */
47
48#define R0D    %eax
49#define R0B    %al
50#define R0H    %ah
51
52#define R1D    %ebx
53#define R1B    %bl
54#define R1H    %bh
55
56#define R2D    %ecx
57#define R2B    %cl
58#define R2H    %ch
59
60#define R3D    %edx
61#define R3B    %dl
62#define R3H    %dh
63
64
65/*
 * input_whitening(reg,base,slot) - XOR one input whitening key word into reg
 *
 * reg  : 32-bit register holding a block word; updated in place
 * base : register pointing to the context structure
 * slot : byte offset of the key word within the first four whitening
 *        words (a_offset, b_offset, c_offset or d_offset)
 */
66#define input_whitening(reg,base,slot)\
67	xor	w+slot(base),	reg;
68
69/*
 * output_whitening(reg,base,slot) - XOR one output whitening key word into reg
 *
 * reg  : 32-bit register holding a block word; updated in place
 * base : register pointing to the context structure
 * slot : byte offset of the key word within the second group of four
 *        whitening words, which starts 16 bytes after w
 *        (a_offset, b_offset, c_offset or d_offset)
 */
70#define output_whitening(reg,base,slot)\
71	xor	w+16+slot(base),	reg;
72
73/*
 * encrypt_round(a,b,c,d,round) - one encryption round
 *
 * a, b, c, d are register alias base names (R0..R3); the D/B/H suffixes
 * are pasted on inside the macro.  round is the byte offset, relative to
 * k, of the two key words used by this round.
 *
 * On entry:
 *   a     input register containing a (rotated 16)
 *   b     input register containing b
 *   c     input register containing c
 *   d     input register containing d (already rol $1)
 *   %ebp  address of the context structure
 *
 * In the line comments "x.byteN" means bits 8N..8N+7 of the unrotated
 * value x.  Two 32-bit values are assembled from table lookups:
 *   A (in %esi)         = s2[a.byte2] ^ s3[a.byte3] ^ s0[a.byte0] ^ s1[a.byte1]
 *   B (in d's register) = s1[b.byte0] ^ s2[b.byte1] ^ s3[b.byte2] ^ s0[b.byte3]
 * The incoming d is parked on the stack while its register accumulates B.
 * Operations on a and b are interleaved to increase performance.
 *
 * On exit:
 *   a  rotated right by a further 16
 *   b  rotated right by 16 + 15 = 31, i.e. rotated left by 1
 *   c  = rol(c ^ (A + B + key word at k+round), 15)
 *   d  = d ^ (A + 2*B + key word at k+4+round)
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot temporarily.
 */
80#define encrypt_round(a,b,c,d,round)\
81	push	d ## D;	/* park incoming d on the stack */\
82	movzx	b ## B,		%edi;	/* b.byte0 */\
83	mov	s1(%ebp,%edi,4),d ## D;	/* B  = s1[b.byte0] */\
84	movzx	a ## B,		%edi;	/* a.byte2 (a arrives rotated 16) */\
85	mov	s2(%ebp,%edi,4),%esi;	/* A  = s2[a.byte2] */\
86	movzx	b ## H,		%edi;	/* b.byte1 */\
87	ror	$16,		b ## D;	/* bring b.byte2/b.byte3 into B/H */\
88	xor	s2(%ebp,%edi,4),d ## D;	/* B ^= s2[b.byte1] */\
89	movzx	a ## H,		%edi;	/* a.byte3 */\
90	ror	$16,		a ## D;	/* bring a.byte0/a.byte1 into B/H */\
91	xor	s3(%ebp,%edi,4),%esi;	/* A ^= s3[a.byte3] */\
92	movzx	b ## B,		%edi;	/* b.byte2 */\
93	xor	s3(%ebp,%edi,4),d ## D;	/* B ^= s3[b.byte2] */\
94	movzx	a ## B,		%edi;	/* a.byte0 */\
95	xor	(%ebp,%edi,4),	%esi;	/* A ^= s0[a.byte0] (s0 is at offset 0) */\
96	movzx	b ## H,		%edi;	/* b.byte3 */\
97	ror	$15,		b ## D;	/* 16 + 15 = 31: b ends up rotated left by 1 */\
98	xor	(%ebp,%edi,4),	d ## D;	/* B ^= s0[b.byte3] */\
99	movzx	a ## H,		%edi;	/* a.byte1 */\
100	xor	s1(%ebp,%edi,4),%esi;	/* A ^= s1[a.byte1] */\
101	pop	%edi;	/* %edi = incoming d */\
102	add	d ## D,		%esi;	/* %esi = A + B */\
103	add	%esi,		d ## D;	/* d reg = A + 2*B */\
104	add	k+round(%ebp),	%esi;	/* add first key word of the round */\
105	xor	%esi,		c ## D;	/* mix into c */\
106	rol	$15,		c ## D;	/* same as ror $1 followed by ror $16 */\
107	add	k+4+round(%ebp),d ## D;	/* add second key word of the round */\
108	xor	%edi,		d ## D;	/* mix with incoming d */
109
110/*
 * encrypt_last_round(a,b,c,d,round) - final encryption round
 *
 * Same computation as encrypt_round; only the closing rotations differ, so
 * that the registers are left ready for the output preparation.
 *
 * a, b, c, d are register alias base names (R0..R3); round is the byte
 * offset, relative to k, of the two key words used by this round.
 *
 * On entry:
 *   a     input register containing a (rotated 16)
 *   b     input register containing b
 *   c     input register containing c
 *   d     input register containing d (already rol $1)
 *   %ebp  address of the context structure
 *
 * In the line comments "x.byteN" means bits 8N..8N+7 of the unrotated
 * value x.  Two 32-bit values are assembled from table lookups:
 *   A (in %esi)         = s2[a.byte2] ^ s3[a.byte3] ^ s0[a.byte0] ^ s1[a.byte1]
 *   B (in d's register) = s1[b.byte0] ^ s2[b.byte1] ^ s3[b.byte2] ^ s0[b.byte3]
 * Operations on a and b are interleaved to increase performance.
 *
 * On exit:
 *   a  rotated right by a further 16
 *   b  rotated right by 16 + 16 = 32, i.e. unchanged
 *   c  = ror(c ^ (A + B + key word at k+round), 1)
 *   d  = d ^ (A + 2*B + key word at k+4+round)
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot temporarily.
 */
118#define encrypt_last_round(a,b,c,d,round)\
119	push	d ## D;	/* park incoming d on the stack */\
120	movzx	b ## B,		%edi;	/* b.byte0 */\
121	mov	s1(%ebp,%edi,4),d ## D;	/* B  = s1[b.byte0] */\
122	movzx	a ## B,		%edi;	/* a.byte2 (a arrives rotated 16) */\
123	mov	s2(%ebp,%edi,4),%esi;	/* A  = s2[a.byte2] */\
124	movzx	b ## H,		%edi;	/* b.byte1 */\
125	ror	$16,		b ## D;	/* bring b.byte2/b.byte3 into B/H */\
126	xor	s2(%ebp,%edi,4),d ## D;	/* B ^= s2[b.byte1] */\
127	movzx	a ## H,		%edi;	/* a.byte3 */\
128	ror	$16,		a ## D;	/* bring a.byte0/a.byte1 into B/H */\
129	xor	s3(%ebp,%edi,4),%esi;	/* A ^= s3[a.byte3] */\
130	movzx	b ## B,		%edi;	/* b.byte2 */\
131	xor	s3(%ebp,%edi,4),d ## D;	/* B ^= s3[b.byte2] */\
132	movzx	a ## B,		%edi;	/* a.byte0 */\
133	xor	(%ebp,%edi,4),	%esi;	/* A ^= s0[a.byte0] (s0 is at offset 0) */\
134	movzx	b ## H,		%edi;	/* b.byte3 */\
135	ror	$16,		b ## D;	/* second ror 16: b is back to its entry value */\
136	xor	(%ebp,%edi,4),	d ## D;	/* B ^= s0[b.byte3] */\
137	movzx	a ## H,		%edi;	/* a.byte1 */\
138	xor	s1(%ebp,%edi,4),%esi;	/* A ^= s1[a.byte1] */\
139	pop	%edi;	/* %edi = incoming d */\
140	add	d ## D,		%esi;	/* %esi = A + B */\
141	add	%esi,		d ## D;	/* d reg = A + 2*B */\
142	add	k+round(%ebp),	%esi;	/* add first key word of the round */\
143	xor	%esi,		c ## D;	/* mix into c */\
144	ror	$1,		c ## D;	/* plain ror 1; no extra 16-bit pre-rotation here */\
145	add	k+4+round(%ebp),d ## D;	/* add second key word of the round */\
146	xor	%edi,		d ## D;	/* mix with incoming d */
147
148/*
 * decrypt_round(a,b,c,d,round) - one decryption round
 *
 * a, b, c, d are register alias base names (R0..R3); the D/B/H suffixes
 * are pasted on inside the macro.  round is the byte offset, relative to
 * k, of the two key words used by this round.
 *
 * On entry:
 *   a     input register containing a
 *   b     input register containing b (rotated 16)
 *   c     input register containing c (already rol $1)
 *   d     input register containing d
 *   %ebp  address of the context structure
 *
 * In the line comments "x.byteN" means bits 8N..8N+7 of the unrotated
 * value x.  Two 32-bit values are assembled from table lookups:
 *   A (in c's register) = s0[a.byte0] ^ s1[a.byte1] ^ s2[a.byte2] ^ s3[a.byte3]
 *   B (in %esi)         = s3[b.byte2] ^ s0[b.byte3] ^ s1[b.byte0] ^ s2[b.byte1]
 * The incoming c is parked on the stack while its register accumulates A.
 * Operations on a and b are interleaved to increase performance.
 *
 * On exit:
 *   a  rotated right by 16 + 15 = 31, i.e. rotated left by 1
 *   b  rotated right by a further 16
 *   c  = c ^ (A + B + key word at k+round)
 *   d  = rol(d ^ (A + 2*B + key word at k+4+round), 15)
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot temporarily.
 */
155#define decrypt_round(a,b,c,d,round)\
156	push	c ## D;	/* park incoming c on the stack */\
157	movzx	a ## B,		%edi;	/* a.byte0 */\
158	mov	(%ebp,%edi,4),	c ## D;	/* A  = s0[a.byte0] (s0 is at offset 0) */\
159	movzx	b ## B,		%edi;	/* b.byte2 (b arrives rotated 16) */\
160	mov	s3(%ebp,%edi,4),%esi;	/* B  = s3[b.byte2] */\
161	movzx	a ## H,		%edi;	/* a.byte1 */\
162	ror	$16,		a ## D;	/* bring a.byte2/a.byte3 into B/H */\
163	xor	s1(%ebp,%edi,4),c ## D;	/* A ^= s1[a.byte1] */\
164	movzx	b ## H,		%edi;	/* b.byte3 */\
165	ror	$16,		b ## D;	/* bring b.byte0/b.byte1 into B/H */\
166	xor	(%ebp,%edi,4),	%esi;	/* B ^= s0[b.byte3] */\
167	movzx	a ## B,		%edi;	/* a.byte2 */\
168	xor	s2(%ebp,%edi,4),c ## D;	/* A ^= s2[a.byte2] */\
169	movzx	b ## B,		%edi;	/* b.byte0 */\
170	xor	s1(%ebp,%edi,4),%esi;	/* B ^= s1[b.byte0] */\
171	movzx	a ## H,		%edi;	/* a.byte3 */\
172	ror	$15,		a ## D;	/* 16 + 15 = 31: a ends up rotated left by 1 */\
173	xor	s3(%ebp,%edi,4),c ## D;	/* A ^= s3[a.byte3] */\
174	movzx	b ## H,		%edi;	/* b.byte1 */\
175	xor	s2(%ebp,%edi,4),%esi;	/* B ^= s2[b.byte1] */\
176	pop	%edi;	/* %edi = incoming c */\
177	add	%esi,		c ## D;	/* c reg = A + B */\
178	add	c ## D,		%esi;	/* %esi = A + 2*B */\
179	add	k+round(%ebp),	c ## D;	/* add first key word of the round */\
180	xor	%edi,		c ## D;	/* mix with incoming c */\
181	add	k+4+round(%ebp),%esi;	/* add second key word of the round */\
182	xor	%esi,		d ## D;	/* mix into d */\
183	rol	$15,		d ## D;	/* same as ror $1 followed by ror $16 */
184
185/*
 * decrypt_last_round(a,b,c,d,round) - final decryption round
 *
 * Same computation as decrypt_round; only the closing rotations differ, so
 * that the registers are left ready for the output preparation.
 *
 * a, b, c, d are register alias base names (R0..R3); round is the byte
 * offset, relative to k, of the two key words used by this round.
 *
 * On entry:
 *   a     input register containing a
 *   b     input register containing b (rotated 16)
 *   c     input register containing c (already rol $1)
 *   d     input register containing d
 *   %ebp  address of the context structure
 *
 * In the line comments "x.byteN" means bits 8N..8N+7 of the unrotated
 * value x.  Two 32-bit values are assembled from table lookups:
 *   A (in c's register) = s0[a.byte0] ^ s1[a.byte1] ^ s2[a.byte2] ^ s3[a.byte3]
 *   B (in %esi)         = s3[b.byte2] ^ s0[b.byte3] ^ s1[b.byte0] ^ s2[b.byte1]
 * Operations on a and b are interleaved to increase performance.
 *
 * On exit:
 *   a  rotated right by 16 + 16 = 32, i.e. unchanged
 *   b  rotated right by a further 16
 *   c  = c ^ (A + B + key word at k+round)
 *   d  = ror(d ^ (A + 2*B + key word at k+4+round), 1)
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot temporarily.
 */
193#define decrypt_last_round(a,b,c,d,round)\
194	push	c ## D;	/* park incoming c on the stack */\
195	movzx	a ## B,		%edi;	/* a.byte0 */\
196	mov	(%ebp,%edi,4),	c ## D;	/* A  = s0[a.byte0] (s0 is at offset 0) */\
197	movzx	b ## B,		%edi;	/* b.byte2 (b arrives rotated 16) */\
198	mov	s3(%ebp,%edi,4),%esi;	/* B  = s3[b.byte2] */\
199	movzx	a ## H,		%edi;	/* a.byte1 */\
200	ror	$16,		a ## D;	/* bring a.byte2/a.byte3 into B/H */\
201	xor	s1(%ebp,%edi,4),c ## D;	/* A ^= s1[a.byte1] */\
202	movzx	b ## H,		%edi;	/* b.byte3 */\
203	ror	$16,		b ## D;	/* bring b.byte0/b.byte1 into B/H */\
204	xor	(%ebp,%edi,4),	%esi;	/* B ^= s0[b.byte3] */\
205	movzx	a ## B,		%edi;	/* a.byte2 */\
206	xor	s2(%ebp,%edi,4),c ## D;	/* A ^= s2[a.byte2] */\
207	movzx	b ## B,		%edi;	/* b.byte0 */\
208	xor	s1(%ebp,%edi,4),%esi;	/* B ^= s1[b.byte0] */\
209	movzx	a ## H,		%edi;	/* a.byte3 */\
210	ror	$16,		a ## D;	/* second ror 16: a is back to its entry value */\
211	xor	s3(%ebp,%edi,4),c ## D;	/* A ^= s3[a.byte3] */\
212	movzx	b ## H,		%edi;	/* b.byte1 */\
213	xor	s2(%ebp,%edi,4),%esi;	/* B ^= s2[b.byte1] */\
214	pop	%edi;	/* %edi = incoming c */\
215	add	%esi,		c ## D;	/* c reg = A + B */\
216	add	c ## D,		%esi;	/* %esi = A + 2*B */\
217	add	k+round(%ebp),	c ## D;	/* add first key word of the round */\
218	xor	%edi,		c ## D;	/* mix with incoming c */\
219	add	k+4+round(%ebp),%esi;	/* add second key word of the round */\
220	xor	%esi,		d ## D;	/* mix into d */\
221	ror	$1,		d ## D;	/* plain ror 1; no extra 16-bit pre-rotation here */
222
/*
 * twofish_enc_blk - encrypt one block of four 32-bit words
 *
 * Arguments are read from the stack at ctx, out_blk and in_blk (plus 16
 * for the four registers pushed below): the context structure address,
 * the output block address and the input block address.
 *
 * Register use: %ebp = context address, %eax/%ebx/%ecx/%edx = the four
 * block words (aliases R0..R3), %esi/%edi = scratch for the round macros.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored on exit.
 *
 * Returns with %eax = 1.
 */
223ENTRY(twofish_enc_blk)
224	push	%ebp			/* save registers according to calling convention*/
225	push    %ebx
226	push    %esi
227	push    %edi
228
229	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: from here on %ebp
					 * holds the ctx address, not a stack frame */
231	mov     in_blk+16(%esp),%edi	/* input address in edi */
232
	/* load the four input words and XOR in the input whitening keys */
233	mov	(%edi),		%eax
234	mov	b_offset(%edi),	%ebx
235	mov	c_offset(%edi),	%ecx
236	mov	d_offset(%edi),	%edx
237	input_whitening(%eax,%ebp,a_offset)
238	ror	$16,	%eax		/* encrypt_round expects a rotated 16 */
239	input_whitening(%ebx,%ebp,b_offset)
240	input_whitening(%ecx,%ebp,c_offset)
241	input_whitening(%edx,%ebp,d_offset)
242	rol	$1,	%edx		/* encrypt_round expects d already rol $1 */
243
	/*
	 * 16 rounds.  The (a,b) and (c,d) register pairs swap roles on every
	 * round; the last argument is the byte offset of the round's key
	 * pair relative to k (8 bytes per round).
	 */
244	encrypt_round(R0,R1,R2,R3,0);
245	encrypt_round(R2,R3,R0,R1,8);
246	encrypt_round(R0,R1,R2,R3,2*8);
247	encrypt_round(R2,R3,R0,R1,3*8);
248	encrypt_round(R0,R1,R2,R3,4*8);
249	encrypt_round(R2,R3,R0,R1,5*8);
250	encrypt_round(R0,R1,R2,R3,6*8);
251	encrypt_round(R2,R3,R0,R1,7*8);
252	encrypt_round(R0,R1,R2,R3,8*8);
253	encrypt_round(R2,R3,R0,R1,9*8);
254	encrypt_round(R0,R1,R2,R3,10*8);
255	encrypt_round(R2,R3,R0,R1,11*8);
256	encrypt_round(R0,R1,R2,R3,12*8);
257	encrypt_round(R2,R3,R0,R1,13*8);
258	encrypt_round(R0,R1,R2,R3,14*8);
259	encrypt_last_round(R2,R3,R0,R1,15*8);
260
	/*
	 * Output whitening and store.  %ecx/%edx take the first two output
	 * key words and become output words 0/1; %eax/%ebx take the last two
	 * and become output words 2/3.
	 */
261	output_whitening(%eax,%ebp,c_offset)
262	output_whitening(%ebx,%ebp,d_offset)
263	output_whitening(%ecx,%ebp,a_offset)
264	output_whitening(%edx,%ebp,b_offset)
265	mov	out_blk+16(%esp),%edi;	/* output address in edi */
266	mov	%eax,		c_offset(%edi)
267	mov	%ebx,		d_offset(%edi)
268	mov	%ecx,		(%edi)
269	mov	%edx,		b_offset(%edi)
270
	/* restore the saved registers in reverse push order */
271	pop	%edi
272	pop	%esi
273	pop	%ebx
274	pop	%ebp
275	mov	$1,	%eax		/* return value 1 */
276	ret
277ENDPROC(twofish_enc_blk)
278
/*
 * twofish_dec_blk - decrypt one block of four 32-bit words
 *
 * Arguments are read from the stack at ctx, out_blk and in_blk (plus 16
 * for the four registers pushed below): the context structure address,
 * the output block address and the input block address.
 *
 * Register use: %ebp = context address, %eax/%ebx/%ecx/%edx = the four
 * block words (aliases R0..R3), %esi/%edi = scratch for the round macros.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored on exit.
 *
 * Returns with %eax = 1.
 */
279ENTRY(twofish_dec_blk)
280	push	%ebp			/* save registers according to calling convention*/
281	push    %ebx
282	push    %esi
283	push    %edi
284
285
286	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: from here on %ebp
					 * holds the ctx address, not a stack frame */
288	mov     in_blk+16(%esp),%edi	/* input address in edi */
289
	/*
	 * Load the four input words and XOR in the output whitening key
	 * words, i.e. the ones the encrypt path applies last.
	 */
290	mov	(%edi),		%eax
291	mov	b_offset(%edi),	%ebx
292	mov	c_offset(%edi),	%ecx
293	mov	d_offset(%edi),	%edx
294	output_whitening(%eax,%ebp,a_offset)
295	output_whitening(%ebx,%ebp,b_offset)
296	ror	$16,	%ebx		/* decrypt_round expects b rotated 16 */
297	output_whitening(%ecx,%ebp,c_offset)
298	output_whitening(%edx,%ebp,d_offset)
299	rol	$1,	%ecx		/* decrypt_round expects c already rol $1 */
300
	/*
	 * 16 rounds with the key pairs taken in descending order.  The (a,b)
	 * and (c,d) register pairs swap roles on every round; the last
	 * argument is the byte offset of the round's key pair relative to k.
	 */
301	decrypt_round(R0,R1,R2,R3,15*8);
302	decrypt_round(R2,R3,R0,R1,14*8);
303	decrypt_round(R0,R1,R2,R3,13*8);
304	decrypt_round(R2,R3,R0,R1,12*8);
305	decrypt_round(R0,R1,R2,R3,11*8);
306	decrypt_round(R2,R3,R0,R1,10*8);
307	decrypt_round(R0,R1,R2,R3,9*8);
308	decrypt_round(R2,R3,R0,R1,8*8);
309	decrypt_round(R0,R1,R2,R3,7*8);
310	decrypt_round(R2,R3,R0,R1,6*8);
311	decrypt_round(R0,R1,R2,R3,5*8);
312	decrypt_round(R2,R3,R0,R1,4*8);
313	decrypt_round(R0,R1,R2,R3,3*8);
314	decrypt_round(R2,R3,R0,R1,2*8);
315	decrypt_round(R0,R1,R2,R3,1*8);
316	decrypt_last_round(R2,R3,R0,R1,0);
317
	/*
	 * Final XOR with the input whitening key words and store.  %ecx/%edx
	 * take the first two key words and become output words 0/1;
	 * %eax/%ebx take the last two and become output words 2/3.
	 */
318	input_whitening(%eax,%ebp,c_offset)
319	input_whitening(%ebx,%ebp,d_offset)
320	input_whitening(%ecx,%ebp,a_offset)
321	input_whitening(%edx,%ebp,b_offset)
322	mov	out_blk+16(%esp),%edi;	/* output address in edi */
323	mov	%eax,		c_offset(%edi)
324	mov	%ebx,		d_offset(%edi)
325	mov	%ecx,		(%edi)
326	mov	%edx,		b_offset(%edi)
327
	/* restore the saved registers in reverse push order */
328	pop	%edi
329	pop	%esi
330	pop	%ebx
331	pop	%ebp
332	mov	$1,	%eax		/* return value 1 */
333	ret
334ENDPROC(twofish_dec_blk)
335