xref: /openbmc/linux/arch/x86/crypto/camellia-x86_64-asm_64.S (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Camellia Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
7 
8 #include <linux/linkage.h>
9 
10 .file "camellia-x86_64-asm_64.S"
11 .text
12 
/*
 * Lookup tables used by the round macros below. They are defined outside
 * this file (hence .extern). xor2ror16 indexes them with a single byte and
 * a scale factor of 8, i.e. every entry is read as one 64-bit word.
 */
13 .extern camellia_sp10011110;
14 .extern camellia_sp22000222;
15 .extern camellia_sp03303033;
16 .extern camellia_sp00444404;
17 .extern camellia_sp02220222;
18 .extern camellia_sp30333033;
19 .extern camellia_sp44044404;
20 .extern camellia_sp11101110;
21 
/* Short local aliases for the external table symbols above. */
22 #define sp10011110 camellia_sp10011110
23 #define sp22000222 camellia_sp22000222
24 #define sp03303033 camellia_sp03303033
25 #define sp00444404 camellia_sp00444404
26 #define sp02220222 camellia_sp02220222
27 #define sp30333033 camellia_sp30333033
28 #define sp44044404 camellia_sp44044404
29 #define sp11101110 camellia_sp11101110
30 
/*
 * Per its name, the size of the subkey table in bytes. Because key_table
 * starts at offset 0 of the context, this value is also used as the byte
 * offset of key_length.
 */
31 #define CAMELLIA_TABLE_BYTE_LEN 272
32 
/*
 * Byte offsets into the context pointed to by CTX. key_table is read as an
 * array of 64-bit entries; key_length is compared against 16 to choose
 * between the short and the long round schedule.
 */
33 /* struct camellia_ctx: */
34 #define key_table 0
35 #define key_length CAMELLIA_TABLE_BYTE_LEN
36 
37 /* register macros */
/*
 * CTX is the first argument (context pointer). RIO is the pointer that the
 * inpack/outunpack macros read from and write to. RIO shares %rsi with the
 * scratch register RT0, so every round overwrites it; the functions keep
 * the destination pointer in RDST and reload RIO from it before storing.
 */
38 #define CTX %rdi
39 #define RIO %rsi
40 #define RIOd %esi
41 
/*
 * Cipher state. RAB0/RCD0 are the two 64-bit halves of block 0, RAB1/RCD1
 * the halves of block 1 (used by the 2-way code only). These are the four
 * registers that have both a low-byte and a high-byte name, which
 * xor2ror16 relies on. %rbx is callee-saved, so the 2-way functions
 * preserve it around their bodies.
 */
42 #define RAB0 %rax
43 #define RCD0 %rcx
44 #define RAB1 %rbx
45 #define RCD1 %rdx
46 
/* 32-bit views of the state registers (used by fls/fls2). */
47 #define RAB0d %eax
48 #define RCD0d %ecx
49 #define RAB1d %ebx
50 #define RCD1d %edx
51 
/* Bits 0-7 of each state register: first table index in xor2ror16. */
52 #define RAB0bl %al
53 #define RCD0bl %cl
54 #define RAB1bl %bl
55 #define RCD1bl %dl
56 
/* Bits 8-15 of each state register: second table index in xor2ror16. */
57 #define RAB0bh %ah
58 #define RCD0bh %ch
59 #define RAB1bh %bh
60 #define RCD1bh %dh
61 
/*
 * Scratch registers. RT0 is the same register as RIO (see above). RT1 is
 * %r12, which is callee-saved; each function parks the caller's value in
 * RR12 and restores it before returning.
 */
62 #define RT0 %rsi
63 #define RT1 %r12
64 #define RT2 %r8
65 
66 #define RT0d %esi
67 #define RT1d %r12d
68 #define RT2d %r8d
69 
70 #define RT2bl %r8b
71 
/*
 * RXOR: copy of the `xor` argument in the encrypt functions; the decrypt
 *       functions have no such argument and reuse it as a temporary.
 * RR12: holds the caller's %r12 while RT1 is in use.
 * RDST: holds the destination pointer while RIO/RT0 is used as scratch.
 */
72 #define RXOR %r9
73 #define RR12 %r10
74 #define RDST %r11
75 
76 #define RXORd %r9d
77 #define RXORbl %r9b
78 
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Two table lookups driven by the low two bytes of `ab`, then a rotate:
 *
 *	dst ^= T0[ab & 0xff];			(index taken from ab ## bl)
 *	dst ^= T1[(ab >> 8) & 0xff];		(index taken from ab ## bh)
 *	ab   = ror64(ab, 16);
 *
 * T0, T1:	table symbols; entries are read as 64-bit words (scale 8).
 * tmp1, tmp2:	64-bit scratch register names that also have a `d` (32-bit)
 *		form; each serves once as table base and once as the
 *		zero-extended index.
 * ab:		state register name that has `bl` and `bh` byte forms.
 * dst:		64-bit register that accumulates the XOR.
 *
 * The table address is formed with a RIP-relative leaq first because a
 * RIP-relative operand cannot be combined with an index register. The `bh`
 * byte is read before the rorq, so both indices come from the un-rotated
 * value of `ab`. Clobbers tmp1, tmp2 and the flags.
 */
79 #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
80 	leaq T0(%rip), 			tmp1; \
81 	movzbl ab ## bl,		tmp2 ## d; \
82 	xorq (tmp1, tmp2, 8),		dst; \
83 	leaq T1(%rip), 			tmp2; \
84 	movzbl ab ## bh,		tmp1 ## d; \
85 	rorq $16,			ab; \
86 	xorq (tmp2, tmp1, 8),		dst;
87 
88 /**********************************************************************
89   1-way camellia
90  **********************************************************************/
/*
 * roundsm(ab, subkey, cd): one round on block 0.
 *
 *	cd0 ^= key_table[subkey] ^ (XOR of eight table lookups,
 *				    one per byte of ab0)
 *
 * ab, cd:	RAB or RCD; the macro appends `0` to pick the block-0 register.
 * subkey:	index of a 64-bit key_table entry (byte offset subkey * 8).
 *
 * The subkey is loaded into RT2, which then doubles as a second
 * accumulator: the four xor2ror16 steps alternate between cd0 and RT2, and
 * the final xorq merges the two. Each xor2ror16 rotates ab0 by 16 bits, so
 * after four of them ab0 holds its original value again.
 * Clobbers RT0 (the same register as RIO), RT1, RT2 and the flags.
 */
91 #define roundsm(ab, subkey, cd) \
92 	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
93 	\
94 	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
95 	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
96 	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
97 	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
98 	\
99 	xorq RT2,					cd ## 0;
100 
/*
 * fls(l, r, kl, kr): key-dependent mixing of the two halves of block 0,
 * applied between groups of six rounds.
 * NOTE(review): presumably the Camellia FL / FL^-1 layer - confirm against
 * the cipher specification.
 *
 * l, r:	RAB or RCD; the macro appends `0` to pick the block-0 register.
 * kl, kr:	indices of 64-bit key_table entries.
 *
 * With hi()/lo() the upper/lower 32 bits of a 64-bit value and K[n] the
 * n-th 64-bit key_table entry, the four steps are, in order:
 *
 *	hi(l0) ^= rol32(lo(K[kl]) & lo(l0), 1);
 *	lo(r0) ^= hi(K[kr]) | hi(r0);
 *	lo(l0) ^= hi(K[kl]) | hi(l0);		(sees hi(l0) from step 1)
 *	hi(r0) ^= rol32(lo(K[kr]) & lo(r0), 1);	(sees lo(r0) from step 2)
 *
 * The 32-bit movl loads zero the upper half of RT0, so the following shlq
 * leaves only the rotated value in bits 32-63.
 * Clobbers RT0 (the same register as RIO), RT1, RT2 and the flags.
 */
101 #define fls(l, r, kl, kr) \
102 	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
103 	andl l ## 0d,					RT0d; \
104 	roll $1,					RT0d; \
105 	shlq $32,					RT0; \
106 	xorq RT0,					l ## 0; \
107 	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
108 	orq r ## 0,					RT1; \
109 	shrq $32,					RT1; \
110 	xorq RT1,					r ## 0; \
111 	\
112 	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
113 	orq l ## 0,					RT2; \
114 	shrq $32,					RT2; \
115 	xorq RT2,					l ## 0; \
116 	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
117 	andl r ## 0d,					RT0d; \
118 	roll $1,					RT0d; \
119 	shlq $32,					RT0; \
120 	xorq RT0,					r ## 0;
121 
/*
 * enc_rounds(i): six consecutive rounds on block 0 using subkeys i+2 .. i+7
 * in ascending order. The roles of RAB and RCD swap on every round: the
 * register that was just modified becomes the input of the next round.
 */
122 #define enc_rounds(i) \
123 	roundsm(RAB, i + 2, RCD); \
124 	roundsm(RCD, i + 3, RAB); \
125 	roundsm(RAB, i + 4, RCD); \
126 	roundsm(RCD, i + 5, RAB); \
127 	roundsm(RAB, i + 6, RCD); \
128 	roundsm(RCD, i + 7, RAB);
129 
/*
 * enc_fls(i): fls step for encryption; key_table entry i is used for the
 * RAB half (kl) and entry i+1 for the RCD half (kr).
 */
130 #define enc_fls(i) \
131 	fls(RAB, RCD, i + 0, i + 1);
132 
/*
 * enc_inpack(): load one 16-byte block from RIO into RAB0/RCD0.
 *
 * Each 8-byte half is byte-swapped and then rotated by 32 bits (which
 * exchanges its two 32-bit words; rol and ror by 32 are equivalent here).
 * The first half is then XORed with key_table entry 0.
 */
133 #define enc_inpack() \
134 	movq (RIO),			RAB0; \
135 	bswapq				RAB0; \
136 	rolq $32,			RAB0; \
137 	movq 4*2(RIO),			RCD0; \
138 	bswapq				RCD0; \
139 	rorq $32,			RCD0; \
140 	xorq key_table(CTX),		RAB0;
141 
/*
 * enc_outunpack(op, max): write block 0 to the 16 bytes at RIO.
 *
 * op:	`mov` to store the result or `xor` to XOR it into the data already
 *	at RIO (the macro appends the `q` suffix).
 * max:	register holding the index of the 64-bit key_table entry that is
 *	XORed into RCD0 before output.
 *
 * The halves leave in swapped order relative to enc_inpack: RCD0 goes to
 * offset 0 and RAB0 to offset 8. Each is rotated by 32 bits and
 * byte-swapped first, undoing the input transformation.
 */
142 #define enc_outunpack(op, max) \
143 	xorq key_table(CTX, max, 8),	RCD0; \
144 	rorq $32,			RCD0; \
145 	bswapq				RCD0; \
146 	op ## q RCD0,			(RIO); \
147 	rolq $32,			RAB0; \
148 	bswapq				RAB0; \
149 	op ## q RAB0,			4*2(RIO);
150 
/*
 * dec_rounds(i): six consecutive rounds on block 0 using subkeys i+7 .. i+2
 * in descending order, i.e. the subkey order of enc_rounds(i) reversed.
 */
151 #define dec_rounds(i) \
152 	roundsm(RAB, i + 7, RCD); \
153 	roundsm(RCD, i + 6, RAB); \
154 	roundsm(RAB, i + 5, RCD); \
155 	roundsm(RCD, i + 4, RAB); \
156 	roundsm(RAB, i + 3, RCD); \
157 	roundsm(RCD, i + 2, RAB);
158 
/*
 * dec_fls(i): fls step for decryption; compared with enc_fls the two key
 * indices are exchanged (entry i+1 for the RAB half, entry i for RCD).
 */
159 #define dec_fls(i) \
160 	fls(RAB, RCD, i + 1, i + 0);
161 
/*
 * dec_inpack(max): load one 16-byte block from RIO into RAB0/RCD0 for
 * decryption. Same byte-swap and 32-bit rotate as enc_inpack, but the first
 * half is XORed with key_table entry `max` (a register holding the index)
 * instead of entry 0.
 */
162 #define dec_inpack(max) \
163 	movq (RIO),			RAB0; \
164 	bswapq				RAB0; \
165 	rolq $32,			RAB0; \
166 	movq 4*2(RIO),			RCD0; \
167 	bswapq				RCD0; \
168 	rorq $32,			RCD0; \
169 	xorq key_table(CTX, max, 8),	RAB0;
170 
/*
 * dec_outunpack(): store block 0 to the 16 bytes at RIO after decryption.
 * RCD0 is XORed with key_table entry 0, then both halves are rotated by
 * 32 bits, byte-swapped and written in swapped order (RCD0 at offset 0,
 * RAB0 at offset 8). Unlike enc_outunpack there is no xor variant.
 */
171 #define dec_outunpack() \
172 	xorq key_table(CTX),		RCD0; \
173 	rorq $32,			RCD0; \
174 	bswapq				RCD0; \
175 	movq RCD0,			(RIO); \
176 	rolq $32,			RAB0; \
177 	bswapq				RAB0; \
178 	movq RAB0,			4*2(RIO);
179 
/*
 * __camellia_enc_blk - encrypt one 16-byte block
 *
 * Reads 16 bytes from src, runs 18 rounds when ctx->key_length == 16 and
 * 24 rounds otherwise, then either stores the result at dst (low byte of
 * `xor` zero) or XORs it into the 16 bytes already at dst (non-zero).
 *
 * %r12 (RT1) is callee-saved and is parked in %r10 (RR12) for the duration;
 * nothing is called from here, so %r10 is not disturbed. No stack is used.
 */
180 SYM_FUNC_START(__camellia_enc_blk)
181 	/* input:
182 	 *	%rdi: ctx, CTX
183 	 *	%rsi: dst
184 	 *	%rdx: src
185 	 *	%rcx: bool xor
186 	 */
187 	movq %r12, RR12;
188 
	/*
	 * %rcx and %rsi are about to be reused (RCD0 and RIO/RT0), so move
	 * the xor flag and dst out of the way, then point RIO at the input.
	 */
189 	movq %rcx, RXOR;
190 	movq %rsi, RDST;
191 	movq %rdx, RIO;
192 
193 	enc_inpack();
194 
195 	enc_rounds(0);
196 	enc_fls(8);
197 	enc_rounds(8);
198 	enc_fls(16);
199 	enc_rounds(16);
	/* max = index of the key_table entry XORed into the output. */
200 	movl $24, RT1d; /* max */
201 
	/*
	 * key_length is compared as a 32-bit value, the same way
	 * camellia_dec_blk reads it. Equal to 16: the 18 rounds above are
	 * all there is.
	 */
202 	cmpl $16, key_length(CTX);
203 	je .L__enc_done;
204 
	/* Longer keys: one more fls step and six more rounds. */
205 	enc_fls(24);
206 	enc_rounds(24);
207 	movl $32, RT1d; /* max */
208 
209 .L__enc_done:
	/* movq leaves the flags alone, so jnz still acts on the testb. */
210 	testb RXORbl, RXORbl;
211 	movq RDST, RIO;
212 
213 	jnz .L__enc_xor;
214 
	/* xor == 0: overwrite dst with the result. */
215 	enc_outunpack(mov, RT1);
216 
217 	movq RR12, %r12;
218 	RET;
219 
220 .L__enc_xor:
	/* xor != 0: XOR the result into dst. */
221 	enc_outunpack(xor, RT1);
222 
223 	movq RR12, %r12;
224 	RET;
225 SYM_FUNC_END(__camellia_enc_blk)
226 
/*
 * camellia_dec_blk - decrypt one 16-byte block
 *
 * Reads 16 bytes from src and stores the result at dst. The round groups
 * of the encrypt path are run in reverse order: 18 rounds when
 * ctx->key_length == 16, 24 rounds otherwise.
 *
 * %r12 (RT1) is callee-saved and is parked in %r10 (RR12) for the duration;
 * nothing is called from here. No stack is used.
 */
227 SYM_FUNC_START(camellia_dec_blk)
228 	/* input:
229 	 *	%rdi: ctx, CTX
230 	 *	%rsi: dst
231 	 *	%rdx: src
232 	 */
	/*
	 * max = (key_length == 16) ? 24 : 32. The two movl instructions do
	 * not change the flags, so cmovel still acts on the cmpl result.
	 * RXORd is only a temporary here; this function has no xor argument.
	 */
233 	cmpl $16, key_length(CTX);
234 	movl $32, RT2d;
235 	movl $24, RXORd;
236 	cmovel RXORd, RT2d; /* max */
237 
	/* RIO/RT0 share %rsi: keep dst in RDST and read the input via RIO. */
238 	movq %r12, RR12;
239 	movq %rsi, RDST;
240 	movq %rdx, RIO;
241 
242 	dec_inpack(RT2);
243 
	/*
	 * RT2 still holds max (dec_inpack does not write it); the rounds
	 * below overwrite it, so the decision is taken now. max == 24 skips
	 * the extra six rounds used for longer keys.
	 */
244 	cmpb $24, RT2bl;
245 	je .L__dec_rounds16;
246 
247 	dec_rounds(24);
248 	dec_fls(24);
249 
250 .L__dec_rounds16:
251 	dec_rounds(16);
252 	dec_fls(16);
253 	dec_rounds(8);
254 	dec_fls(8);
255 	dec_rounds(0);
256 
	/* The rounds clobbered RIO (== RT0); reload it with dst. */
257 	movq RDST, RIO;
258 
259 	dec_outunpack();
260 
261 	movq RR12, %r12;
262 	RET;
263 SYM_FUNC_END(camellia_dec_blk)
264 
265 /**********************************************************************
266   2-way camellia
267  **********************************************************************/
/*
 * roundsm2(ab, subkey, cd): one round on two blocks at once.
 *
 * For each block n in {0, 1}:
 *
 *	cd<n> ^= key_table[subkey] ^ (XOR of eight table lookups,
 *				      one per byte of ab<n>)
 *
 * ab, cd:	RAB or RCD; the macro appends `0` / `1` to pick the block.
 * subkey:	index of a 64-bit key_table entry (byte offset subkey * 8).
 *
 * The subkey is loaded into RT2 and XORed into cd1 right away. Block 0
 * then proceeds as in roundsm, with RT2 as a second accumulator that is
 * merged into cd0 by the xorq placed after the first block-1 step. Block 1
 * accumulates all four xor2ror16 steps directly into cd1. The block-1
 * steps use only RT0/RT1 as scratch, so RT2 is still intact at that xorq.
 * ab0 and ab1 each rotate by 4 * 16 bits and end up unchanged.
 * Clobbers RT0 (the same register as RIO), RT1, RT2 and the flags.
 */
268 #define roundsm2(ab, subkey, cd) \
269 	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
270 	xorq RT2,					cd ## 1; \
271 	\
272 	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
273 	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
274 	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
275 	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
276 	\
277 		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
278 		xorq RT2,					cd ## 0; \
279 		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
280 		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
281 		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
282 
/*
 * fls2(l, r, kl, kr): the fls step applied to two blocks at once.
 * NOTE(review): presumably the Camellia FL / FL^-1 layer - confirm against
 * the cipher specification.
 *
 * l, r:	RAB or RCD; the macro appends `0` / `1` to pick the block.
 * kl, kr:	indices of 64-bit key_table entries.
 *
 * With hi()/lo() the upper/lower 32 bits of a 64-bit value and K[n] the
 * n-th 64-bit key_table entry, each block n in {0, 1} goes through:
 *
 *	hi(l<n>) ^= rol32(lo(K[kl]) & lo(l<n>), 1);
 *	lo(r<n>) ^= hi(K[kr]) | hi(r<n>);
 *	lo(l<n>) ^= hi(K[kl]) | hi(l<n>);	(sees the updated hi(l<n>))
 *	hi(r<n>) ^= rol32(lo(K[kr]) & lo(r<n>), 1); (sees the updated lo(r<n>))
 *
 * The code is laid out as four groups: steps 1-2 for block 0, steps 1-2
 * for block 1 (indented), steps 3-4 for block 0, steps 3-4 for block 1
 * (indented). The scratch registers RT0/RT1/RT2 are handed round from one
 * group to the next.
 * Clobbers RT0 (the same register as RIO), RT1, RT2 and the flags.
 */
283 #define fls2(l, r, kl, kr) \
284 	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
285 	andl l ## 0d,					RT0d; \
286 	roll $1,					RT0d; \
287 	shlq $32,					RT0; \
288 	xorq RT0,					l ## 0; \
289 	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
290 	orq r ## 0,					RT1; \
291 	shrq $32,					RT1; \
292 	xorq RT1,					r ## 0; \
293 	\
294 		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
295 		andl l ## 1d,					RT2d; \
296 		roll $1,					RT2d; \
297 		shlq $32,					RT2; \
298 		xorq RT2,					l ## 1; \
299 		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
300 		orq r ## 1,					RT0; \
301 		shrq $32,					RT0; \
302 		xorq RT0,					r ## 1; \
303 	\
304 	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
305 	orq l ## 0,					RT1; \
306 	shrq $32,					RT1; \
307 	xorq RT1,					l ## 0; \
308 	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
309 	andl r ## 0d,					RT2d; \
310 	roll $1,					RT2d; \
311 	shlq $32,					RT2; \
312 	xorq RT2,					r ## 0; \
313 	\
314 		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
315 		orq l ## 1,					RT0; \
316 		shrq $32,					RT0; \
317 		xorq RT0,					l ## 1; \
318 		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
319 		andl r ## 1d,					RT1d; \
320 		roll $1,					RT1d; \
321 		shlq $32,					RT1; \
322 		xorq RT1,					r ## 1;
323 
/*
 * enc_rounds2(i): six consecutive 2-way rounds using subkeys i+2 .. i+7 in
 * ascending order, with RAB and RCD swapping roles on every round.
 */
324 #define enc_rounds2(i) \
325 	roundsm2(RAB, i + 2, RCD); \
326 	roundsm2(RCD, i + 3, RAB); \
327 	roundsm2(RAB, i + 4, RCD); \
328 	roundsm2(RCD, i + 5, RAB); \
329 	roundsm2(RAB, i + 6, RCD); \
330 	roundsm2(RCD, i + 7, RAB);
331 
/*
 * enc_fls2(i): 2-way fls step for encryption; key_table entry i is used
 * for the RAB halves (kl) and entry i+1 for the RCD halves (kr).
 */
332 #define enc_fls2(i) \
333 	fls2(RAB, RCD, i + 0, i + 1);
334 
/*
 * enc_inpack2(): load two consecutive 16-byte blocks from RIO.
 *
 * Block 0 comes from offsets 0 and 8 (RAB0/RCD0), block 1 from offsets 16
 * and 24 (RAB1/RCD1). Each 8-byte half is byte-swapped and rotated by
 * 32 bits (exchanging its two 32-bit words), and the first half of each
 * block is XORed with key_table entry 0.
 */
335 #define enc_inpack2() \
336 	movq (RIO),			RAB0; \
337 	bswapq				RAB0; \
338 	rorq $32,			RAB0; \
339 	movq 4*2(RIO),			RCD0; \
340 	bswapq				RCD0; \
341 	rolq $32,			RCD0; \
342 	xorq key_table(CTX),		RAB0; \
343 	\
344 		movq 8*2(RIO),			RAB1; \
345 		bswapq				RAB1; \
346 		rorq $32,			RAB1; \
347 		movq 12*2(RIO),			RCD1; \
348 		bswapq				RCD1; \
349 		rolq $32,			RCD1; \
350 		xorq key_table(CTX),		RAB1;
351 
/*
 * enc_outunpack2(op, max): write both blocks to the 32 bytes at RIO.
 *
 * op:	`mov` to store the results or `xor` to XOR them into the data
 *	already at RIO (the macro appends the `q` suffix).
 * max:	register holding the index of the 64-bit key_table entry that is
 *	XORed into RCD0 and RCD1 before output.
 *
 * Within each block the halves leave in swapped order: RCD<n> first, then
 * RAB<n>. Block 0 occupies offsets 0/8 and block 1 offsets 16/24. Each half
 * is rotated by 32 bits and byte-swapped before it is written.
 */
352 #define enc_outunpack2(op, max) \
353 	xorq key_table(CTX, max, 8),	RCD0; \
354 	rolq $32,			RCD0; \
355 	bswapq				RCD0; \
356 	op ## q RCD0,			(RIO); \
357 	rorq $32,			RAB0; \
358 	bswapq				RAB0; \
359 	op ## q RAB0,			4*2(RIO); \
360 	\
361 		xorq key_table(CTX, max, 8),	RCD1; \
362 		rolq $32,			RCD1; \
363 		bswapq				RCD1; \
364 		op ## q RCD1,			8*2(RIO); \
365 		rorq $32,			RAB1; \
366 		bswapq				RAB1; \
367 		op ## q RAB1,			12*2(RIO);
368 
/*
 * dec_rounds2(i): six consecutive 2-way rounds using subkeys i+7 .. i+2 in
 * descending order, i.e. the subkey order of enc_rounds2(i) reversed.
 */
369 #define dec_rounds2(i) \
370 	roundsm2(RAB, i + 7, RCD); \
371 	roundsm2(RCD, i + 6, RAB); \
372 	roundsm2(RAB, i + 5, RCD); \
373 	roundsm2(RCD, i + 4, RAB); \
374 	roundsm2(RAB, i + 3, RCD); \
375 	roundsm2(RCD, i + 2, RAB);
376 
/*
 * dec_fls2(i): 2-way fls step for decryption; compared with enc_fls2 the
 * two key indices are exchanged (entry i+1 for RAB, entry i for RCD).
 */
377 #define dec_fls2(i) \
378 	fls2(RAB, RCD, i + 1, i + 0);
379 
/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from RIO for
 * decryption. Same layout and byte-swap/rotate as enc_inpack2, but the
 * first half of each block is XORed with key_table entry `max` (a register
 * holding the index) instead of entry 0.
 */
380 #define dec_inpack2(max) \
381 	movq (RIO),			RAB0; \
382 	bswapq				RAB0; \
383 	rorq $32,			RAB0; \
384 	movq 4*2(RIO),			RCD0; \
385 	bswapq				RCD0; \
386 	rolq $32,			RCD0; \
387 	xorq key_table(CTX, max, 8),	RAB0; \
388 	\
389 		movq 8*2(RIO),			RAB1; \
390 		bswapq				RAB1; \
391 		rorq $32,			RAB1; \
392 		movq 12*2(RIO),			RCD1; \
393 		bswapq				RCD1; \
394 		rolq $32,			RCD1; \
395 		xorq key_table(CTX, max, 8),	RAB1;
396 
/*
 * dec_outunpack2(): store both blocks to the 32 bytes at RIO after
 * decryption. RCD0 and RCD1 are XORed with key_table entry 0; every half is
 * rotated by 32 bits and byte-swapped, and within each block RCD<n> is
 * written before RAB<n>. Unlike enc_outunpack2 there is no xor variant.
 */
397 #define dec_outunpack2() \
398 	xorq key_table(CTX),		RCD0; \
399 	rolq $32,			RCD0; \
400 	bswapq				RCD0; \
401 	movq RCD0,			(RIO); \
402 	rorq $32,			RAB0; \
403 	bswapq				RAB0; \
404 	movq RAB0,			4*2(RIO); \
405 	\
406 		xorq key_table(CTX),		RCD1; \
407 		rolq $32,			RCD1; \
408 		bswapq				RCD1; \
409 		movq RCD1,			8*2(RIO); \
410 		rorq $32,			RAB1; \
411 		bswapq				RAB1; \
412 		movq RAB1,			12*2(RIO);
413 
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks
 *
 * Reads 32 bytes from src, runs 18 rounds when ctx->key_length == 16 and
 * 24 rounds otherwise on both blocks in parallel, then either stores the
 * results at dst (low byte of `xor` zero) or XORs them into the 32 bytes
 * already at dst (non-zero).
 *
 * Callee-saved registers used: %rbx (RAB1) is pushed on the stack, since
 * %r9 is occupied by the xor flag; %r12 (RT1) is parked in %r10 (RR12).
 * Nothing is called from here.
 */
414 SYM_FUNC_START(__camellia_enc_blk_2way)
415 	/* input:
416 	 *	%rdi: ctx, CTX
417 	 *	%rsi: dst
418 	 *	%rdx: src
419 	 *	%rcx: bool xor
420 	 */
421 	pushq %rbx;
422 
	/*
	 * %rcx, %rdx and %rsi are about to be reused (RCD0, RCD1, RIO/RT0):
	 * move the xor flag and dst out of the way and point RIO at the
	 * input before RCD1 (%rdx) is loaded.
	 */
423 	movq %r12, RR12;
424 	movq %rcx, RXOR;
425 	movq %rsi, RDST;
426 	movq %rdx, RIO;
427 
428 	enc_inpack2();
429 
430 	enc_rounds2(0);
431 	enc_fls2(8);
432 	enc_rounds2(8);
433 	enc_fls2(16);
434 	enc_rounds2(16);
	/* max = index of the key_table entry XORed into the output. */
435 	movl $24, RT2d; /* max */
436 
	/*
	 * key_length is compared as a 32-bit value, the same way
	 * camellia_dec_blk_2way reads it. Equal to 16: the 18 rounds above
	 * are all there is.
	 */
437 	cmpl $16, key_length(CTX);
438 	je .L__enc2_done;
439 
	/* Longer keys: one more fls step and six more rounds. */
440 	enc_fls2(24);
441 	enc_rounds2(24);
442 	movl $32, RT2d; /* max */
443 
444 .L__enc2_done:
	/* movq leaves the flags alone, so jnz still acts on the testb. */
445 	testb RXORbl, RXORbl;
446 	movq RDST, RIO;
447 	jnz .L__enc2_xor;
448 
	/* xor == 0: overwrite dst with the results. */
449 	enc_outunpack2(mov, RT2);
450 
451 	movq RR12, %r12;
452 	popq %rbx;
453 	RET;
454 
455 .L__enc2_xor:
	/* xor != 0: XOR the results into dst. */
456 	enc_outunpack2(xor, RT2);
457 
458 	movq RR12, %r12;
459 	popq %rbx;
460 	RET;
461 SYM_FUNC_END(__camellia_enc_blk_2way)
462 
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks
 *
 * Reads 32 bytes from src and stores the results at dst. The round groups
 * of the encrypt path are run in reverse order on both blocks in parallel:
 * 18 rounds when ctx->key_length == 16, 24 rounds otherwise.
 *
 * Callee-saved registers used: %rbx (RAB1) is parked in %r9 (RXOR, free
 * here because there is no xor argument) and %r12 (RT1) in %r10 (RR12).
 * Nothing is called from here and no stack is used.
 */
463 SYM_FUNC_START(camellia_dec_blk_2way)
464 	/* input:
465 	 *	%rdi: ctx, CTX
466 	 *	%rsi: dst
467 	 *	%rdx: src
468 	 */
	/*
	 * max = (key_length == 16) ? 24 : 32. The two movl instructions do
	 * not change the flags, so cmovel still acts on the cmpl result.
	 * RXORd is a temporary for the constant 24 at this point.
	 */
469 	cmpl $16, key_length(CTX);
470 	movl $32, RT2d;
471 	movl $24, RXORd;
472 	cmovel RXORd, RT2d; /* max */
473 
	/*
	 * RXOR is free again after the cmov and now keeps the caller's
	 * %rbx. RIO must be loaded from %rdx before dec_inpack2 overwrites
	 * %rdx with RCD1.
	 */
474 	movq %rbx, RXOR;
475 	movq %r12, RR12;
476 	movq %rsi, RDST;
477 	movq %rdx, RIO;
478 
479 	dec_inpack2(RT2);
480 
	/*
	 * RT2 still holds max (dec_inpack2 does not write it); the rounds
	 * below overwrite it, so the decision is taken now. max == 24 skips
	 * the extra six rounds used for longer keys.
	 */
481 	cmpb $24, RT2bl;
482 	je .L__dec2_rounds16;
483 
484 	dec_rounds2(24);
485 	dec_fls2(24);
486 
487 .L__dec2_rounds16:
488 	dec_rounds2(16);
489 	dec_fls2(16);
490 	dec_rounds2(8);
491 	dec_fls2(8);
492 	dec_rounds2(0);
493 
	/* The rounds clobbered RIO (== RT0); reload it with dst. */
494 	movq RDST, RIO;
495 
496 	dec_outunpack2();
497 
498 	movq RR12, %r12;
499 	movq RXOR, %rbx;
500 	RET;
501 SYM_FUNC_END(camellia_dec_blk_2way)
502