1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Camellia Cipher Algorithm (x86_64)
4 *
5 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9
10.file "camellia-x86_64-asm_64.S"
11.text
12
13.extern camellia_sp10011110;
14.extern camellia_sp22000222;
15.extern camellia_sp03303033;
16.extern camellia_sp00444404;
17.extern camellia_sp02220222;
18.extern camellia_sp30333033;
19.extern camellia_sp44044404;
20.extern camellia_sp11101110;
21
22#define sp10011110 camellia_sp10011110
23#define sp22000222 camellia_sp22000222
24#define sp03303033 camellia_sp03303033
25#define sp00444404 camellia_sp00444404
26#define sp02220222 camellia_sp02220222
27#define sp30333033 camellia_sp30333033
28#define sp44044404 camellia_sp44044404
29#define sp11101110 camellia_sp11101110
30
31#define CAMELLIA_TABLE_BYTE_LEN 272
32
33/* struct camellia_ctx: */
34#define key_table 0
35#define key_length CAMELLIA_TABLE_BYTE_LEN
36
37/* register macros */
38#define CTX %rdi
39#define RIO %rsi
40#define RIOd %esi
41
42#define RAB0 %rax
43#define RCD0 %rcx
44#define RAB1 %rbx
45#define RCD1 %rdx
46
47#define RAB0d %eax
48#define RCD0d %ecx
49#define RAB1d %ebx
50#define RCD1d %edx
51
52#define RAB0bl %al
53#define RCD0bl %cl
54#define RAB1bl %bl
55#define RCD1bl %dl
56
57#define RAB0bh %ah
58#define RCD0bh %ch
59#define RAB1bh %bh
60#define RCD1bh %dh
61
62#define RT0 %rsi
63#define RT1 %r12
64#define RT2 %r8
65
66#define RT0d %esi
67#define RT1d %r12d
68#define RT2d %r8d
69
70#define RT2bl %r8b
71
72#define RXOR %r9
73#define RR12 %r10
74#define RDST %r11
75
76#define RXORd %r9d
77#define RXORbl %r9b
78
79#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
80	leaq T0(%rip), 			tmp1; \
81	movzbl ab ## bl,		tmp2 ## d; \
82	xorq (tmp1, tmp2, 8),		dst; \
83	leaq T1(%rip), 			tmp2; \
84	movzbl ab ## bh,		tmp1 ## d; \
85	rorq $16,			ab; \
86	xorq (tmp2, tmp1, 8),		dst;
87
88/**********************************************************************
89  1-way camellia
90 **********************************************************************/
91#define roundsm(ab, subkey, cd) \
92	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
93	\
94	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
95	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
96	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
97	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
98	\
99	xorq RT2,					cd ## 0;
100
101#define fls(l, r, kl, kr) \
102	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
103	andl l ## 0d,					RT0d; \
104	roll $1,					RT0d; \
105	shlq $32,					RT0; \
106	xorq RT0,					l ## 0; \
107	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
108	orq r ## 0,					RT1; \
109	shrq $32,					RT1; \
110	xorq RT1,					r ## 0; \
111	\
112	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
113	orq l ## 0,					RT2; \
114	shrq $32,					RT2; \
115	xorq RT2,					l ## 0; \
116	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
117	andl r ## 0d,					RT0d; \
118	roll $1,					RT0d; \
119	shlq $32,					RT0; \
120	xorq RT0,					r ## 0;
121
122#define enc_rounds(i) \
123	roundsm(RAB, i + 2, RCD); \
124	roundsm(RCD, i + 3, RAB); \
125	roundsm(RAB, i + 4, RCD); \
126	roundsm(RCD, i + 5, RAB); \
127	roundsm(RAB, i + 6, RCD); \
128	roundsm(RCD, i + 7, RAB);
129
130#define enc_fls(i) \
131	fls(RAB, RCD, i + 0, i + 1);
132
133#define enc_inpack() \
134	movq (RIO),			RAB0; \
135	bswapq				RAB0; \
136	rolq $32,			RAB0; \
137	movq 4*2(RIO),			RCD0; \
138	bswapq				RCD0; \
139	rorq $32,			RCD0; \
140	xorq key_table(CTX),		RAB0;
141
142#define enc_outunpack(op, max) \
143	xorq key_table(CTX, max, 8),	RCD0; \
144	rorq $32,			RCD0; \
145	bswapq				RCD0; \
146	op ## q RCD0,			(RIO); \
147	rolq $32,			RAB0; \
148	bswapq				RAB0; \
149	op ## q RAB0,			4*2(RIO);
150
151#define dec_rounds(i) \
152	roundsm(RAB, i + 7, RCD); \
153	roundsm(RCD, i + 6, RAB); \
154	roundsm(RAB, i + 5, RCD); \
155	roundsm(RCD, i + 4, RAB); \
156	roundsm(RAB, i + 3, RCD); \
157	roundsm(RCD, i + 2, RAB);
158
159#define dec_fls(i) \
160	fls(RAB, RCD, i + 1, i + 0);
161
162#define dec_inpack(max) \
163	movq (RIO),			RAB0; \
164	bswapq				RAB0; \
165	rolq $32,			RAB0; \
166	movq 4*2(RIO),			RCD0; \
167	bswapq				RCD0; \
168	rorq $32,			RCD0; \
169	xorq key_table(CTX, max, 8),	RAB0;
170
171#define dec_outunpack() \
172	xorq key_table(CTX),		RCD0; \
173	rorq $32,			RCD0; \
174	bswapq				RCD0; \
175	movq RCD0,			(RIO); \
176	rolq $32,			RAB0; \
177	bswapq				RAB0; \
178	movq RAB0,			4*2(RIO);
179
180SYM_FUNC_START(__camellia_enc_blk)
181	/* input:
182	 *	%rdi: ctx, CTX
183	 *	%rsi: dst
184	 *	%rdx: src
185	 *	%rcx: bool xor
186	 */
187	movq %r12, RR12;
188
189	movq %rcx, RXOR;
190	movq %rsi, RDST;
191	movq %rdx, RIO;
192
193	enc_inpack();
194
195	enc_rounds(0);
196	enc_fls(8);
197	enc_rounds(8);
198	enc_fls(16);
199	enc_rounds(16);
200	movl $24, RT1d; /* max */
201
202	cmpb $16, key_length(CTX);
203	je .L__enc_done;
204
205	enc_fls(24);
206	enc_rounds(24);
207	movl $32, RT1d; /* max */
208
209.L__enc_done:
210	testb RXORbl, RXORbl;
211	movq RDST, RIO;
212
213	jnz .L__enc_xor;
214
215	enc_outunpack(mov, RT1);
216
217	movq RR12, %r12;
218	RET;
219
220.L__enc_xor:
221	enc_outunpack(xor, RT1);
222
223	movq RR12, %r12;
224	RET;
225SYM_FUNC_END(__camellia_enc_blk)
226
227SYM_FUNC_START(camellia_dec_blk)
228	/* input:
229	 *	%rdi: ctx, CTX
230	 *	%rsi: dst
231	 *	%rdx: src
232	 */
233	cmpl $16, key_length(CTX);
234	movl $32, RT2d;
235	movl $24, RXORd;
236	cmovel RXORd, RT2d; /* max */
237
238	movq %r12, RR12;
239	movq %rsi, RDST;
240	movq %rdx, RIO;
241
242	dec_inpack(RT2);
243
244	cmpb $24, RT2bl;
245	je .L__dec_rounds16;
246
247	dec_rounds(24);
248	dec_fls(24);
249
250.L__dec_rounds16:
251	dec_rounds(16);
252	dec_fls(16);
253	dec_rounds(8);
254	dec_fls(8);
255	dec_rounds(0);
256
257	movq RDST, RIO;
258
259	dec_outunpack();
260
261	movq RR12, %r12;
262	RET;
263SYM_FUNC_END(camellia_dec_blk)
264
265/**********************************************************************
266  2-way camellia
267 **********************************************************************/
268#define roundsm2(ab, subkey, cd) \
269	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
270	xorq RT2,					cd ## 1; \
271	\
272	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
273	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
274	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
275	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
276	\
277		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
278		xorq RT2,					cd ## 0; \
279		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
280		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
281		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
282
283#define fls2(l, r, kl, kr) \
284	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
285	andl l ## 0d,					RT0d; \
286	roll $1,					RT0d; \
287	shlq $32,					RT0; \
288	xorq RT0,					l ## 0; \
289	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
290	orq r ## 0,					RT1; \
291	shrq $32,					RT1; \
292	xorq RT1,					r ## 0; \
293	\
294		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
295		andl l ## 1d,					RT2d; \
296		roll $1,					RT2d; \
297		shlq $32,					RT2; \
298		xorq RT2,					l ## 1; \
299		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
300		orq r ## 1,					RT0; \
301		shrq $32,					RT0; \
302		xorq RT0,					r ## 1; \
303	\
304	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
305	orq l ## 0,					RT1; \
306	shrq $32,					RT1; \
307	xorq RT1,					l ## 0; \
308	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
309	andl r ## 0d,					RT2d; \
310	roll $1,					RT2d; \
311	shlq $32,					RT2; \
312	xorq RT2,					r ## 0; \
313	\
314		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
315		orq l ## 1,					RT0; \
316		shrq $32,					RT0; \
317		xorq RT0,					l ## 1; \
318		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
319		andl r ## 1d,					RT1d; \
320		roll $1,					RT1d; \
321		shlq $32,					RT1; \
322		xorq RT1,					r ## 1;
323
324#define enc_rounds2(i) \
325	roundsm2(RAB, i + 2, RCD); \
326	roundsm2(RCD, i + 3, RAB); \
327	roundsm2(RAB, i + 4, RCD); \
328	roundsm2(RCD, i + 5, RAB); \
329	roundsm2(RAB, i + 6, RCD); \
330	roundsm2(RCD, i + 7, RAB);
331
332#define enc_fls2(i) \
333	fls2(RAB, RCD, i + 0, i + 1);
334
335#define enc_inpack2() \
336	movq (RIO),			RAB0; \
337	bswapq				RAB0; \
338	rorq $32,			RAB0; \
339	movq 4*2(RIO),			RCD0; \
340	bswapq				RCD0; \
341	rolq $32,			RCD0; \
342	xorq key_table(CTX),		RAB0; \
343	\
344		movq 8*2(RIO),			RAB1; \
345		bswapq				RAB1; \
346		rorq $32,			RAB1; \
347		movq 12*2(RIO),			RCD1; \
348		bswapq				RCD1; \
349		rolq $32,			RCD1; \
350		xorq key_table(CTX),		RAB1;
351
352#define enc_outunpack2(op, max) \
353	xorq key_table(CTX, max, 8),	RCD0; \
354	rolq $32,			RCD0; \
355	bswapq				RCD0; \
356	op ## q RCD0,			(RIO); \
357	rorq $32,			RAB0; \
358	bswapq				RAB0; \
359	op ## q RAB0,			4*2(RIO); \
360	\
361		xorq key_table(CTX, max, 8),	RCD1; \
362		rolq $32,			RCD1; \
363		bswapq				RCD1; \
364		op ## q RCD1,			8*2(RIO); \
365		rorq $32,			RAB1; \
366		bswapq				RAB1; \
367		op ## q RAB1,			12*2(RIO);
368
369#define dec_rounds2(i) \
370	roundsm2(RAB, i + 7, RCD); \
371	roundsm2(RCD, i + 6, RAB); \
372	roundsm2(RAB, i + 5, RCD); \
373	roundsm2(RCD, i + 4, RAB); \
374	roundsm2(RAB, i + 3, RCD); \
375	roundsm2(RCD, i + 2, RAB);
376
377#define dec_fls2(i) \
378	fls2(RAB, RCD, i + 1, i + 0);
379
380#define dec_inpack2(max) \
381	movq (RIO),			RAB0; \
382	bswapq				RAB0; \
383	rorq $32,			RAB0; \
384	movq 4*2(RIO),			RCD0; \
385	bswapq				RCD0; \
386	rolq $32,			RCD0; \
387	xorq key_table(CTX, max, 8),	RAB0; \
388	\
389		movq 8*2(RIO),			RAB1; \
390		bswapq				RAB1; \
391		rorq $32,			RAB1; \
392		movq 12*2(RIO),			RCD1; \
393		bswapq				RCD1; \
394		rolq $32,			RCD1; \
395		xorq key_table(CTX, max, 8),	RAB1;
396
397#define dec_outunpack2() \
398	xorq key_table(CTX),		RCD0; \
399	rolq $32,			RCD0; \
400	bswapq				RCD0; \
401	movq RCD0,			(RIO); \
402	rorq $32,			RAB0; \
403	bswapq				RAB0; \
404	movq RAB0,			4*2(RIO); \
405	\
406		xorq key_table(CTX),		RCD1; \
407		rolq $32,			RCD1; \
408		bswapq				RCD1; \
409		movq RCD1,			8*2(RIO); \
410		rorq $32,			RAB1; \
411		bswapq				RAB1; \
412		movq RAB1,			12*2(RIO);
413
414SYM_FUNC_START(__camellia_enc_blk_2way)
415	/* input:
416	 *	%rdi: ctx, CTX
417	 *	%rsi: dst
418	 *	%rdx: src
419	 *	%rcx: bool xor
420	 */
421	pushq %rbx;
422
423	movq %r12, RR12;
424	movq %rcx, RXOR;
425	movq %rsi, RDST;
426	movq %rdx, RIO;
427
428	enc_inpack2();
429
430	enc_rounds2(0);
431	enc_fls2(8);
432	enc_rounds2(8);
433	enc_fls2(16);
434	enc_rounds2(16);
435	movl $24, RT2d; /* max */
436
437	cmpb $16, key_length(CTX);
438	je .L__enc2_done;
439
440	enc_fls2(24);
441	enc_rounds2(24);
442	movl $32, RT2d; /* max */
443
444.L__enc2_done:
445	test RXORbl, RXORbl;
446	movq RDST, RIO;
447	jnz .L__enc2_xor;
448
449	enc_outunpack2(mov, RT2);
450
451	movq RR12, %r12;
452	popq %rbx;
453	RET;
454
455.L__enc2_xor:
456	enc_outunpack2(xor, RT2);
457
458	movq RR12, %r12;
459	popq %rbx;
460	RET;
461SYM_FUNC_END(__camellia_enc_blk_2way)
462
463SYM_FUNC_START(camellia_dec_blk_2way)
464	/* input:
465	 *	%rdi: ctx, CTX
466	 *	%rsi: dst
467	 *	%rdx: src
468	 */
469	cmpl $16, key_length(CTX);
470	movl $32, RT2d;
471	movl $24, RXORd;
472	cmovel RXORd, RT2d; /* max */
473
474	movq %rbx, RXOR;
475	movq %r12, RR12;
476	movq %rsi, RDST;
477	movq %rdx, RIO;
478
479	dec_inpack2(RT2);
480
481	cmpb $24, RT2bl;
482	je .L__dec2_rounds16;
483
484	dec_rounds2(24);
485	dec_fls2(24);
486
487.L__dec2_rounds16:
488	dec_rounds2(16);
489	dec_fls2(16);
490	dec_rounds2(8);
491	dec_fls2(8);
492	dec_rounds2(0);
493
494	movq RDST, RIO;
495
496	dec_outunpack2();
497
498	movq RR12, %r12;
499	movq RXOR, %rbx;
500	RET;
501SYM_FUNC_END(camellia_dec_blk_2way)
502