1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/***************************************************************************
3*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
4*                                                                         *
5***************************************************************************/
6
7.file "twofish-i586-asm.S"
8.text
9
10#include <linux/linkage.h>
11#include <asm/asm-offsets.h>
12
/*
 * Offsets of the stack arguments relative to %esp at function entry, where
 * the return address sits at 0(%esp).  Both entry points push four registers
 * before reading their arguments, so they add 16 to these values.
 */
#define in_blk		12	/* address of the input byte array */
#define out_blk		8	/* address of the output byte array */
#define ctx		4	/* address of the Twofish context structure */

/* Byte offsets of the four 32-bit words a, b, c and d inside a block. */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12
23
/*
 * Byte offsets into the crypto context structure.  Each S-box table holds
 * 256 32-bit words (1024 bytes); the tables are followed by the 8 whitening
 * key words and then by the 32 round key words.
 */
#define s0		0	/* S0 table */
#define s1		1024	/* S1 table */
#define s2		2048	/* S2 table */
#define s3		3072	/* S3 table */
#define w		4096	/* whitening keys (8 words) */
#define k		4128	/* round keys (32 words) */
32
/*
 * Register aliases used for macro substitution.  The round macros receive a
 * bare prefix (R0..R3) and paste one of three suffixes onto it:
 *   D = the full 32-bit register
 *   B = its low byte  (bits 0-7)
 *   H = its high byte (bits 8-15)
 */
#define R0D		%eax
#define R0B		%al
#define R0H		%ah

#define R1D		%ebx
#define R1B		%bl
#define R1H		%bh

#define R2D		%ecx
#define R2B		%cl
#define R2H		%ch

#define R3D		%edx
#define R3B		%dl
#define R3H		%dh
50
51
/*
 * Input whitening: XOR the whitening key word found at w+word_ofs, relative
 * to the context base register, into reg.
 */
#define input_whitening(reg,base,word_ofs)\
	xorl	w+word_ofs(base),	reg;
55
/*
 * Output whitening: XOR the whitening key word found at w+16+offset, relative
 * to the context base register, into src.  The extra 16 bytes step over the
 * first four whitening words (the ones input_whitening addresses), so this
 * macro selects from the second group of four.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;
59
/*
 * One encryption round.
 *
 * a, b, c, d are register prefixes (R0..R3); the D/B/H suffixes are pasted
 * on inside the macro.  round is the byte offset of this round's pair of key
 * words inside the k array.  %ebp must hold the context base.
 *
 * On entry:
 *   a = word a, rotated by 16 bits
 *   b = word b
 *   c = word c
 *   d = word d, already rotated left by 1
 *
 * Ta / Tb below are the XOR of the four S-box lookups indexed by the bytes
 * of a / b.  Ta is built in %esi, Tb in the d register (d itself is parked
 * on the stack meanwhile).  The lookups for a and b are interleaved to
 * increase performance.
 *
 * On exit:
 *   a = word a with the 16-bit rotation undone
 *   b = word b rotated left by 1 (ror 16 followed by ror 15)
 *   c = (c ^ (Ta + Tb + k[round])) rotated left by 15, i.e. rotated right
 *       by 1 and then by a further 16
 *   d = (Ta + 2*Tb + k[round+4]) ^ d
 * which is the entry form the next round expects when it is invoked with
 * the register pairs swapped (c,d,a,b).
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot.
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;				/* park d */\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;		/* ror 16 + ror 15 == rol 1 */\
	xor	s0(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;				/* %edi = parked d */\
	add	d ## D,		%esi;		/* %esi = Ta + Tb */\
	add	%esi,		d ## D;		/* d = Ta + 2*Tb */\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
96
/*
 * Final encryption round.
 *
 * Same computation and same entry conditions as encrypt_round:
 *   a = word a, rotated by 16 bits
 *   b = word b
 *   c = word c
 *   d = word d, already rotated left by 1
 * with round the byte offset of the key pair inside k and %ebp the context
 * base.  Ta / Tb are the XOR of the four S-box lookups indexed by the bytes
 * of a / b; the lookups are interleaved to increase performance.
 *
 * Only the rotations differ, because the results are not fed to another
 * round: b is rotated by 16 twice and therefore leaves unchanged, and c gets
 * a plain ror $1 without the additional 16-bit rotation.
 *
 * On exit:
 *   a = word a with the 16-bit rotation undone
 *   b = word b, unrotated
 *   c = (c ^ (Ta + Tb + k[round])) rotated right by 1
 *   d = (Ta + 2*Tb + k[round+4]) ^ d
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot.
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;				/* park d */\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;		/* second ror 16: b is back as it came in */\
	xor	s0(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;				/* %edi = parked d */\
	add	d ## D,		%esi;		/* %esi = Ta + Tb */\
	add	%esi,		d ## D;		/* d = Ta + 2*Tb */\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;
134
/*
 * One decryption round.
 *
 * a, b, c, d are register prefixes (R0..R3); the D/B/H suffixes are pasted
 * on inside the macro.  round is the byte offset of this round's pair of key
 * words inside the k array.  %ebp must hold the context base.
 *
 * On entry:
 *   a = word a
 *   b = word b, rotated by 16 bits
 *   c = word c, already rotated left by 1
 *   d = word d
 *
 * Ta / Tb below are the XOR of the four S-box lookups indexed by the bytes
 * of a / b.  Ta is built in the c register (c itself is parked on the stack
 * meanwhile), Tb in %esi.  The lookups for a and b are interleaved to
 * increase performance.
 *
 * On exit:
 *   a = word a rotated left by 1 (ror 16 followed by ror 15)
 *   b = word b with the 16-bit rotation undone
 *   c = (Ta + Tb + k[round]) ^ c
 *   d = (d ^ (Ta + 2*Tb + k[round+4])) rotated left by 15, i.e. rotated
 *       right by 1 and then by a further 16
 * which is the entry form the next round expects when it is invoked with
 * the register pairs swapped (c,d,a,b).
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot.
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;				/* park c */\
	movzx	a ## B,		%edi;\
	mov	s0(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;		/* ror 16 + ror 15 == rol 1 */\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;				/* %edi = parked c */\
	add	%esi,		c ## D;		/* c = Ta + Tb */\
	add	c ## D,		%esi;		/* %esi = Ta + 2*Tb */\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;
171
/*
 * Final decryption round.
 *
 * Same computation and same entry conditions as decrypt_round:
 *   a = word a
 *   b = word b, rotated by 16 bits
 *   c = word c, already rotated left by 1
 *   d = word d
 * with round the byte offset of the key pair inside k and %ebp the context
 * base.  Ta / Tb are the XOR of the four S-box lookups indexed by the bytes
 * of a / b; the lookups are interleaved to increase performance.
 *
 * Only the rotations differ, because the results are not fed to another
 * round: a is rotated by 16 twice and therefore leaves unchanged, and d gets
 * a plain ror $1 without the additional 16-bit rotation.
 *
 * On exit:
 *   a = word a, unrotated
 *   b = word b with the 16-bit rotation undone
 *   c = (Ta + Tb + k[round]) ^ c
 *   d = (d ^ (Ta + 2*Tb + k[round+4])) rotated right by 1
 *
 * Clobbers %esi, %edi and the flags; uses one stack slot.
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;				/* park c */\
	movzx	a ## B,		%edi;\
	mov	s0(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s0(%ebp,%edi,4),%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;		/* second ror 16: a is back as it came in */\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;				/* %edi = parked c */\
	add	%esi,		c ## D;		/* c = Ta + Tb */\
	add	c ## D,		%esi;		/* %esi = Ta + 2*Tb */\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;
209
/*
 * twofish_enc_blk - encrypt one 16-byte block.
 *
 * Arguments are taken from the stack (offsets from %esp at entry):
 *   ctx at 4, out_blk at 8, in_blk at 12.
 * Returns 1 in %eax.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored before the
 * return; %ecx and %edx are overwritten.
 */
ENTRY(twofish_enc_blk)
	/* Four pushes: every stack argument is now 16 bytes further away. */
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi

	/* %ebp is repurposed as the context base for the whole function. */
	movl	ctx+16(%esp),		%ebp
	movl	in_blk+16(%esp),	%edi	/* %edi = input block */

	/* Load a, b, c, d into %eax, %ebx, %ecx, %edx. */
	movl	a_offset(%edi),	%eax
	movl	b_offset(%edi),	%ebx
	movl	c_offset(%edi),	%ecx
	movl	d_offset(%edi),	%edx

	/*
	 * Whiten the input and bring a and d into the form encrypt_round
	 * expects: a rotated by 16, d rotated left by 1.
	 */
	input_whitening(%eax,%ebp,a_offset)
	rorl	$16,	%eax
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	roll	$1,	%edx

	/*
	 * 16 rounds.  Consecutive rounds swap the (R0,R1) and (R2,R3) pairs;
	 * the last argument is the byte offset of the round's two key words.
	 */
	encrypt_round(R0,R1,R2,R3,0)
	encrypt_round(R2,R3,R0,R1,8)
	encrypt_round(R0,R1,R2,R3,2*8)
	encrypt_round(R2,R3,R0,R1,3*8)
	encrypt_round(R0,R1,R2,R3,4*8)
	encrypt_round(R2,R3,R0,R1,5*8)
	encrypt_round(R0,R1,R2,R3,6*8)
	encrypt_round(R2,R3,R0,R1,7*8)
	encrypt_round(R0,R1,R2,R3,8*8)
	encrypt_round(R2,R3,R0,R1,9*8)
	encrypt_round(R0,R1,R2,R3,10*8)
	encrypt_round(R2,R3,R0,R1,11*8)
	encrypt_round(R0,R1,R2,R3,12*8)
	encrypt_round(R2,R3,R0,R1,13*8)
	encrypt_round(R0,R1,R2,R3,14*8)
	encrypt_last_round(R2,R3,R0,R1,15*8)

	/*
	 * Whiten and store.  %eax/%ebx are written to the c/d slots and
	 * %ecx/%edx to the a/b slots, each whitened with the key word of the
	 * slot it is stored to.
	 */
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	movl	out_blk+16(%esp),	%edi	/* %edi = output block */
	movl	%eax,		c_offset(%edi)
	movl	%ebx,		d_offset(%edi)
	movl	%ecx,		a_offset(%edi)
	movl	%edx,		b_offset(%edi)

	/* Restore the saved registers in reverse order and return 1. */
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	movl	$1,	%eax
	ret
ENDPROC(twofish_enc_blk)
265
/*
 * twofish_dec_blk - decrypt one 16-byte block.
 *
 * Arguments are taken from the stack (offsets from %esp at entry):
 *   ctx at 4, out_blk at 8, in_blk at 12.
 * Returns 1 in %eax.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored before the
 * return; %ecx and %edx are overwritten.
 */
ENTRY(twofish_dec_blk)
	/* Four pushes: every stack argument is now 16 bytes further away. */
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi

	/* %ebp is repurposed as the context base for the whole function. */
	movl	ctx+16(%esp),		%ebp
	movl	in_blk+16(%esp),	%edi	/* %edi = input block */

	/* Load a, b, c, d into %eax, %ebx, %ecx, %edx. */
	movl	a_offset(%edi),	%eax
	movl	b_offset(%edi),	%ebx
	movl	c_offset(%edi),	%ecx
	movl	d_offset(%edi),	%edx

	/*
	 * Apply the output whitening keys first, then bring b and c into the
	 * form decrypt_round expects: b rotated by 16, c rotated left by 1.
	 */
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	rorl	$16,	%ebx
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	roll	$1,	%ecx

	/*
	 * 16 rounds with the key offsets running from 15*8 down to 0.
	 * Consecutive rounds swap the (R0,R1) and (R2,R3) pairs.
	 */
	decrypt_round(R0,R1,R2,R3,15*8)
	decrypt_round(R2,R3,R0,R1,14*8)
	decrypt_round(R0,R1,R2,R3,13*8)
	decrypt_round(R2,R3,R0,R1,12*8)
	decrypt_round(R0,R1,R2,R3,11*8)
	decrypt_round(R2,R3,R0,R1,10*8)
	decrypt_round(R0,R1,R2,R3,9*8)
	decrypt_round(R2,R3,R0,R1,8*8)
	decrypt_round(R0,R1,R2,R3,7*8)
	decrypt_round(R2,R3,R0,R1,6*8)
	decrypt_round(R0,R1,R2,R3,5*8)
	decrypt_round(R2,R3,R0,R1,4*8)
	decrypt_round(R0,R1,R2,R3,3*8)
	decrypt_round(R2,R3,R0,R1,2*8)
	decrypt_round(R0,R1,R2,R3,1*8)
	decrypt_last_round(R2,R3,R0,R1,0)

	/*
	 * Apply the input whitening keys and store.  %eax/%ebx are written
	 * to the c/d slots and %ecx/%edx to the a/b slots, each whitened with
	 * the key word of the slot it is stored to.
	 */
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	movl	out_blk+16(%esp),	%edi	/* %edi = output block */
	movl	%eax,		c_offset(%edi)
	movl	%ebx,		d_offset(%edi)
	movl	%ecx,		a_offset(%edi)
	movl	%edx,		b_offset(%edi)

	/* Restore the saved registers in reverse order and return 1. */
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	movl	$1,	%eax
	ret
ENDPROC(twofish_dec_blk)
322