/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables defined outside this file.  They are only read here,
 * indexed by a single byte with a scale of 8, i.e. as arrays of 64-bit
 * entries (see xor2ror16).
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the tables, used by the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/*
 * Byte offset at which key_length is read from the context; by its name
 * this is the size in bytes of the key table that precedes it.
 */
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: byte offsets of the fields accessed from this file */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * NOTE: RIO and RT0 are both %rsi.  RIO is therefore only valid while no
 * round/fls macro is running; the functions keep the destination pointer
 * in RDST and reload RIO from it before writing the result.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* Block state: RAB0/RCD0 hold the first block, RAB1/RCD1 the second (2-way). */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

/* 32-bit views of the state registers */
#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

/* bits 0..7 of the state registers */
#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

/* bits 8..15 of the state registers */
#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/* Temporaries.  RT1 is %r12, which the functions below save in RR12. */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR: 'xor' flag of the encrypt functions; the decrypt functions use it
 *       as a scratch/save register instead.
 * RR12: holds the caller's %r12 for the duration of a function.
 * RDST: holds the destination pointer while %rsi is used as RT0.
 */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Consumes the two lowest bytes of 'ab':
 *	dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]	(64-bit table entries)
 * and rotates 'ab' right by 16 bits so the next two bytes become the
 * lowest ones.
 *
 * 'ab' must be a register macro that has 'bl' and 'bh' variants; tmp1 and
 * tmp2 must have 'd' variants.  tmp1 and tmp2 are clobbered.
 * The tables are addressed without a base register, i.e. by absolute
 * address.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm(ab, subkey, cd): one round on the single block in <ab>0/<cd>0.
 *
 * RT2 is loaded with the 64-bit subkey at index 'subkey'.  The eight bytes
 * of <ab>0 then select table entries that are xored alternately into <cd>0
 * and RT2, and finally RT2 is folded into <cd>0.  <ab>0 is rotated by
 * 4 * 16 = 64 bits in total and so holds its original value afterwards.
 *
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls(l, r, kl, kr): key-dependent mixing of <l>0 and <r>0 with the 64-bit
 * subkeys at indexes 'kl' and 'kr'.  Four steps, in this order:
 *
 *	l ^= (u64)rol32((u32)kl & (u32)l, 1) << 32;
 *	r ^= (kr | r) >> 32;
 *	l ^= (kl | l) >> 32;
 *	r ^= (u64)rol32((u32)kr & (u32)r, 1) << 32;
 *
 * Each step reads the value produced by the previous ones.
 * Clobbers RT0, RT1, RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * enc_rounds(i): six rounds with subkeys i+2 .. i+7 in ascending order,
 * alternating which half (RAB0 / RCD0) is the round input.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* enc_fls(i): fls step with subkey i applied as 'kl' and subkey i+1 as 'kr'. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack(): load one 16-byte block from (RIO) into RAB0/RCD0.
 * Each 8-byte half is byte-swapped and then rotated by 32 bits (which
 * exchanges its two 32-bit words); the subkey at index 0 is xored into
 * RAB0.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack(op, max): write the block in RAB0/RCD0 to (RIO).
 *
 * 'max' is a register holding the index of the final subkey, which is
 * xored into RCD0.  The input transformation (rotate by 32, byte swap) is
 * undone and the halves are written in swapped order: RCD0 to bytes 0..7,
 * RAB0 to bytes 8..15.
 * 'op' is pasted with 'q' to form the store instruction: 'mov' overwrites
 * the destination, 'xor' xors the result into it.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/*
 * dec_rounds(i): six rounds with subkeys i+7 .. i+2 in descending order,
 * i.e. the subkey order of enc_rounds(i) reversed.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls(i): fls step with the two subkeys exchanged relative to
 * enc_fls(i): subkey i+1 is 'kl' and subkey i is 'kr'.
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack(max): load one 16-byte block from (RIO) into RAB0/RCD0 in the
 * same way as enc_inpack(), but xor the subkey at index 'max' (a register)
 * into RAB0 instead of the subkey at index 0.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack(): write the block in RAB0/RCD0 to (RIO).
 * The subkey at index 0 is xored into RCD0, the input transformation is
 * undone, and the halves are stored in swapped order (RCD0 first).
 * Unlike enc_outunpack() the result is always stored with movq.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * Runs three groups of six rounds, or four when key_length != 16, with an
 * fls step between groups.  The result is stored to dst, or xored into dst
 * when the low byte of 'xor' is non-zero.
 *
 * Scratch: %rax, %rcx, %rsi, %r8-%r11, flags.  %r12 is used as a
 * temporary and restored from RR12 before returning.
 */
ENTRY(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %r12, RR12;		/* RT1 is %r12; keep the caller's value */

	movq %rcx, RXOR;		/* %rcx becomes RCD0 */
	movq %rsi, RDST;		/* %rsi becomes RT0 */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* set only now: the round macros use RT1 as a temporary */
	movl $24, RT1d; /* max */

	/* key_length is compared as a 32-bit value, as in camellia_dec_blk */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;			/* movq leaves the flags for jnz intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	ret;
ENDPROC(__camellia_enc_blk)

/*
 * camellia_dec_blk: decrypt one 16-byte block from src to dst.
 *
 * max (index of the subkey applied on input) is 24 when key_length == 16
 * and 32 otherwise; in the latter case one extra group of rounds plus an
 * fls step runs first.
 *
 * Scratch: %rax, %rcx, %rsi, %r8-%r11, flags.  %r12 is used as a
 * temporary and restored from RR12 before returning.
 */
ENTRY(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;		/* RXOR is only a scratch register here */
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;		/* RT1 is %r12; keep the caller's value */
	movq %rsi, RDST;		/* %rsi becomes RT0 */
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* test max before the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	ret;
ENDPROC(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd): one round on two blocks at once
 * (<ab>0/<cd>0 and <ab>1/<cd>1).
 *
 * RT2 is loaded with the 64-bit subkey at index 'subkey'.
 * Block 0: as in roundsm(), lookups are xored alternately into <cd>0 and
 *          RT2, and RT2 is folded into <cd>0 afterwards.
 * Block 1: the subkey is xored into <cd>1 up front and all eight lookups
 *          go straight into <cd>1.
 * The fold of RT2 into <cd>0 is placed after the first block-1 lookup.
 * <ab>0 and <ab>1 are each rotated by 64 bits in total (unchanged).
 *
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr): the fls() step applied to two blocks.
 *
 * For each block n (0 and 1), in this order per block:
 *	l ^= (u64)rol32((u32)kl & (u32)l, 1) << 32;
 *	r ^= (kr | r) >> 32;
 *	l ^= (kl | l) >> 32;
 *	r ^= (u64)rol32((u32)kr & (u32)r, 1) << 32;
 * The first two steps of block 0 are followed by the first two steps of
 * block 1 (the more deeply indented groups), then the same for the last
 * two steps.  Clobbers RT0, RT1, RT2.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * enc_rounds2(i): six two-block rounds with subkeys i+2 .. i+7 in
 * ascending order, alternating which half is the round input.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* enc_fls2(i): two-block fls step, subkey i as 'kl' and subkey i+1 as 'kr'. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2(): load two consecutive 16-byte blocks from (RIO):
 * bytes 0..15 into RAB0/RCD0 and bytes 16..31 into RAB1/RCD1.
 * Each 8-byte half is byte-swapped and rotated by 32 bits (for a 64-bit
 * register ror 32 and rol 32 give the same result); the subkey at index 0
 * is xored into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2(op, max): write both blocks to (RIO).
 *
 * 'max' is a register holding the index of the final subkey, which is
 * xored into RCD0 and RCD1.  The input transformation is undone and each
 * block's halves are written in swapped order (RCD before RAB): block 0
 * to bytes 0..15, block 1 to bytes 16..31.
 * 'op' is pasted with 'q' to form the store instruction: 'mov' overwrites
 * the destination, 'xor' xors the result into it.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);

/*
 * dec_rounds2(i): six two-block rounds with subkeys i+7 .. i+2 in
 * descending order, i.e. the subkey order of enc_rounds2(i) reversed.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2(i): two-block fls step with the subkeys exchanged relative to
 * enc_fls2(i): subkey i+1 is 'kl' and subkey i is 'kr'.
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from (RIO) in the
 * same way as enc_inpack2(), but xor the subkey at index 'max'
 * (a register) into RAB0 and RAB1 instead of the subkey at index 0.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2(): write both blocks to (RIO).
 * The subkey at index 0 is xored into RCD0 and RCD1, the input
 * transformation is undone, and each block's halves are stored with movq
 * in swapped order (RCD before RAB): block 0 to bytes 0..15, block 1 to
 * bytes 16..31.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * Runs three groups of six rounds, or four when key_length != 16, with an
 * fls step between groups.  The 32 result bytes are stored to dst, or
 * xored into dst when the low byte of 'xor' is non-zero.
 *
 * Scratch: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbx (RAB1) is saved
 * on the stack and %r12 (RT1) in RR12; both are restored before returning.
 */
ENTRY(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;			/* RAB1 is %rbx */

	movq %r12, RR12;		/* RT1 is %r12 */
	movq %rcx, RXOR;		/* %rcx becomes RCD0 */
	movq %rsi, RDST;		/* %rsi becomes RT0 */
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* set only now: the round macros use RT2 as a temporary */
	movl $24, RT2d; /* max */

	/* key_length is compared as a 32-bit value, as in camellia_dec_blk_2way */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;			/* movq leaves the flags for jnz intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;
ENDPROC(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks from src
 * to dst.
 *
 * max (index of the subkey applied on input) is 24 when key_length == 16
 * and 32 otherwise; in the latter case one extra group of rounds plus an
 * fls step runs first.
 *
 * Scratch: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbx (RAB1) is kept
 * in RXOR and %r12 (RT1) in RR12; both are restored before returning.
 */
ENTRY(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;		/* scratch use; RXOR is reused just below */
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;		/* RAB1 is %rbx; there is no xor flag here */
	movq %r12, RR12;		/* RT1 is %r12 */
	movq %rsi, RDST;		/* %rsi becomes RT0 */
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* test max before the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	ret;
ENDPROC(camellia_dec_blk_2way)
