/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables used by the round macros. They are defined outside this
 * file and are indexed by a single byte with an 8-byte stride (see the
 * xor2ror16 macro).
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the external tables. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/* Size in bytes of the subkey table at the start of the context. */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx: byte offsets of the fields accessed through CTX.
 * key_table entries are addressed as 8-byte subkeys; key_length sits
 * directly after the table.
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * Register macros.
 *
 * CTX is the context pointer. RIO is the current I/O pointer: the
 * routines point it at the source while loading a block and at the
 * destination while storing the result.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/*
 * Cipher state. Suffix 0 is the first block; suffix 1 is the second
 * block and is only used by the 2-way code. RAB1 is %rbx, which the
 * 2-way routines save and restore.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

/* 32-bit views of the state registers */
#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

/* bits 0-7 of the state registers */
#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

/* bits 8-15 of the state registers */
#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers. RT0 is the same register as RIO, so RIO does not
 * survive any macro that uses RT0. RT1 is %rbp; the routines keep the
 * caller's value in RRBP while RT1 is in use.
 */
#define RT0 %rsi
#define RT1 %rbp
#define RT2 %r8

#define RT0d %esi
#define RT1d %ebp
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR: the "xor" flag in the encrypt routines; in the decrypt routines
 *       it is a temporary and, in the 2-way variant, the %rbx save slot.
 * RRBP: holds the caller's %rbp for the duration of a routine.
 * RDST: holds the destination pointer until the result is written.
 */
#define RXOR %r9
#define RRBP %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consume the two lowest bytes of the 64-bit register ab:
 *
 *	dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]
 *	ab   = ror64(ab, 16)
 *
 * T0 and T1 are tables of 8-byte entries, addressed absolutely with only
 * a scaled index register. tmp1 receives byte 1 (read through the
 * ab ## bh register) and tmp2 receives byte 0; both are overwritten.
 * Because the ab ## bh operand is a legacy high-byte register, tmp1 must
 * be a register that encodes without a REX prefix.
 *
 * Four consecutive invocations on the same ab rotate it by 64 bits in
 * total, leaving ab with its original value.
 *
 * Clobbers: tmp1, tmp2, flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm(ab, subkey, cd): one round on block 0.
 *
 * Loads the 8-byte subkey with index `subkey` into RT2, runs all eight
 * bytes of ab ## 0 through the lookup tables and folds everything into
 * cd ## 0:
 *
 *	cd0 ^= key_table[subkey] ^ (XOR of the eight table entries)
 *
 * The table lookups alternate between cd ## 0 and RT2 as accumulator, so
 * consecutive xor2ror16 steps do not all update the same register; RT2
 * is merged into cd ## 0 at the end. ab ## 0 is rotated by 4 * 16 bits
 * and therefore ends up unchanged.
 *
 * Clobbers: RT0 (and with it RIO), RT1, RT2, flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls(l, r, kl, kr): layer applied between groups of six rounds.
 *
 * Operates on the 64-bit halves l ## 0 and r ## 0 using the 8-byte
 * subkeys with indices kl and kr. Writing lo()/hi() for the low/high
 * 32 bits of a 64-bit value, the macro performs, in this order:
 *
 *	hi(l) ^= rol32(lo(l) & lo(key[kl]), 1)
 *	lo(r) ^= hi(r) | hi(key[kr])
 *	lo(l) ^= hi(l) | hi(key[kl])
 *	hi(r) ^= rol32(lo(r) & lo(key[kr]), 1)
 *
 * Steps three and four read the values produced by steps one and two.
 * The 32-bit and/rol instructions zero the upper half of the scratch
 * register, so the following 64-bit shift leaves only the wanted word.
 *
 * Clobbers: RT0 (and with it RIO), RT1, RT2, flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * enc_rounds(i): six encryption rounds using subkeys i + 2 .. i + 7 in
 * ascending order. The roles of RAB and RCD swap on every round.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/*
 * enc_fls(i): fls layer for encryption; subkey i is paired with RAB and
 * subkey i + 1 with RCD.
 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack(): load the 16-byte input block at RIO for encryption.
 *
 * Bytes 0-7 go to RAB0 and bytes 8-15 to RCD0. bswapq followed by a
 * rotation by 32 bits byte-swaps each 32-bit word in place. RAB0 is
 * then XORed with subkey 0; RCD0 is left as loaded.
 *
 * Clobbers: flags.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack(op, max): write the encrypted block to RIO.
 *
 * op  - instruction stem, "mov" to store or "xor" to XOR the result into
 *       the memory at RIO.
 * max - register holding the index (in 8-byte units) of the last subkey,
 *       which is XORed into RCD0 before output.
 *
 * The halves are written swapped relative to the load: RCD0 goes to
 * bytes 0-7 and RAB0 to bytes 8-15, each with its 32-bit words
 * byte-swapped back.
 *
 * Clobbers: RAB0, RCD0, flags.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/*
 * dec_rounds(i): six decryption rounds; same structure as enc_rounds but
 * the subkeys i + 7 .. i + 2 are applied in descending order.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls(i): fls layer for decryption; the subkey pairing is the
 * reverse of enc_fls (i + 1 with RAB, i with RCD).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack(max): load the 16-byte input block at RIO for decryption.
 *
 * Identical to enc_inpack except that RAB0 is XORed with the subkey
 * whose index (in 8-byte units) is held in register `max`.
 *
 * Clobbers: flags.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack(): store the decrypted block at RIO.
 *
 * RCD0 is XORed with subkey 0 and written to bytes 0-7; RAB0 is written
 * to bytes 8-15. Both have their 32-bit words byte-swapped back first.
 * Unlike enc_outunpack there is no XOR-into-destination variant.
 *
 * Clobbers: RAB0, RCD0, flags.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

ENTRY(__camellia_enc_blk)
	/*
	 * Encrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; when non-zero
	 *	      the result is XORed into dst instead of stored)
	 *
	 * Working registers: %rax, %rcx, %rsi, %rbp, %r8-%r11.
	 * %rbp is kept in RRBP and restored before returning.
	 */
	movq %rbp, RRBP;	/* %rbp is used as RT1 below */

	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RIO / RT0 */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* key_length of 16: done after 18 rounds, final subkey index 24 */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	/* other key lengths: one more fls layer and six more rounds */
	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
ENDPROC(__camellia_enc_blk)

ENTRY(camellia_dec_blk)
	/*
	 * Decrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Working registers: %rax, %rcx, %rsi, %rbp, %r8-%r11.
	 * %rbp is kept in RRBP and restored before returning.
	 */

	/* max = (key_length == 16) ? 24 : 32; RXOR is only a temporary */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;	/* %rbp is used as RT1 below */
	movq %rsi, RDST;	/* %rsi becomes RIO / RT0 */
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* must test max here: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	/* max == 32: undo the extra six rounds and fls layer first */
	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
ENDPROC(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2(ab, subkey, cd): one round on two blocks at once.
 *
 * Net effect for each block n in {0, 1}:
 *
 *	cd<n> ^= key_table[subkey] ^ (XOR of the eight table entries
 *					selected by the bytes of ab<n>)
 *
 * The subkey is loaded into RT2 and applied to cd ## 1 immediately.
 * Block 0 then alternates its lookups between cd ## 0 and RT2, and RT2
 * (subkey plus half of the block 0 lookups) is merged into cd ## 0 while
 * the block 1 lookups, which all target cd ## 1, are already under way.
 * Both ab registers are rotated by 4 * 16 bits and end up unchanged.
 *
 * Clobbers: RT0 (and with it RIO), RT1, RT2, flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr): the fls layer applied to two blocks.
 *
 * For each block n in {0, 1}, with lo()/hi() the low/high 32 bits:
 *
 *	hi(l<n>) ^= rol32(lo(l<n>) & lo(key[kl]), 1)
 *	lo(r<n>) ^= hi(r<n>) | hi(key[kr])
 *	lo(l<n>) ^= hi(l<n>) | hi(key[kl])
 *	hi(r<n>) ^= rol32(lo(r<n>) & lo(key[kr]), 1)
 *
 * The first two steps are done for block 0, then for block 1 (indented),
 * followed by the last two steps for block 0 and then block 1. The
 * scratch registers RT0/RT1/RT2 are used in rotation.
 *
 * Clobbers: RT0 (and with it RIO), RT1, RT2, flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * enc_rounds2(i): six 2-way encryption rounds using subkeys
 * i + 2 .. i + 7 in ascending order, alternating RAB and RCD.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/*
 * enc_fls2(i): 2-way fls layer for encryption; subkey i is paired with
 * RAB and subkey i + 1 with RCD.
 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2(): load two consecutive 16-byte blocks from RIO for
 * encryption.
 *
 * Block 0 (bytes 0-15) goes to RAB0/RCD0 and block 1 (bytes 16-31) to
 * RAB1/RCD1. bswapq plus a rotation by 32 bits byte-swaps each 32-bit
 * word in place. Both RAB registers are XORed with subkey 0.
 *
 * Clobbers: flags.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2(op, max): write two encrypted blocks to RIO.
 *
 * op  - instruction stem, "mov" to store or "xor" to XOR the result into
 *       the memory at RIO.
 * max - register holding the index (in 8-byte units) of the last subkey,
 *       which is XORed into both RCD registers before output.
 *
 * For each block the RCD half is written first and the RAB half second
 * (block 0 at bytes 0-15, block 1 at bytes 16-31), with the 32-bit words
 * byte-swapped back.
 *
 * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);

/*
 * dec_rounds2(i): six 2-way decryption rounds; the subkeys
 * i + 7 .. i + 2 are applied in descending order.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2(i): 2-way fls layer for decryption; the subkey pairing is the
 * reverse of enc_fls2 (i + 1 with RAB, i with RCD).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from RIO for
 * decryption.
 *
 * Identical to enc_inpack2 except that both RAB registers are XORed with
 * the subkey whose index (in 8-byte units) is held in register `max`.
 *
 * Clobbers: flags.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2(): store two decrypted blocks at RIO.
 *
 * Both RCD registers are XORed with subkey 0. For each block the RCD
 * half is written first and the RAB half second (block 0 at bytes 0-15,
 * block 1 at bytes 16-31), with the 32-bit words byte-swapped back.
 *
 * Clobbers: RAB0, RCD0, RAB1, RCD1, flags.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);

ENTRY(__camellia_enc_blk_2way)
	/*
	 * Encrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; when non-zero
	 *	      the result is XORed into dst instead of stored)
	 *
	 * Working registers: %rax, %rbx, %rcx, %rdx, %rsi, %rbp, %r8-%r11.
	 * %rbx is saved on the stack and %rbp in RRBP; both are restored
	 * before returning.
	 */
	pushq %rbx;		/* %rbx is used as RAB1 */

	movq %rbp, RRBP;	/* %rbp is used as RT1 */
	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RIO / RT0 */
	movq %rdx, RIO;		/* %rdx becomes RCD1 once the blocks are loaded */

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* key_length of 16: done after 18 rounds, final subkey index 24 */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	/* other key lengths: one more fls layer and six more rounds */
	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
ENDPROC(__camellia_enc_blk_2way)

ENTRY(camellia_dec_blk_2way)
	/*
	 * Decrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Working registers: %rax, %rbx, %rcx, %rdx, %rsi, %rbp, %r8-%r11.
	 * %rbx is kept in RXOR and %rbp in RRBP; both are restored before
	 * returning.
	 */

	/* max = (key_length == 16) ? 24 : 32; RXOR is a temporary here */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;	/* RXOR is free again; %rbx is used as RAB1 */
	movq %rbp, RRBP;	/* %rbp is used as RT1 */
	movq %rsi, RDST;	/* %rsi becomes RIO / RT0 */
	movq %rdx, RIO;		/* %rdx becomes RCD1 once the blocks are loaded */

	dec_inpack2(RT2);

	/* must test max here: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	/* max == 32: undo the extra six rounds and fls layer first */
	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
ENDPROC(camellia_dec_blk_2way)
