xref: /openbmc/linux/arch/x86/crypto/blowfish-x86_64-asm_64.S (revision 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e)
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * Blowfish Cipher Algorithm (x86_64)
4  *
5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6  */
7 
8 #include <linux/linkage.h>
9 
10 .file "blowfish-x86_64-asm.S"
11 .text
12 
13 /* byte offsets into the crypto context: p (16 + 2 four-byte entries), then s0..s3 (256 four-byte entries each) */
14 #define p	0
15 #define s0	(p + (16 + 2) * 4)
16 #define s1	(s0 + 256 * 4)
17 #define s2	(s1 + 256 * 4)
18 #define s3	(s2 + 256 * 4)
19 
20 /* register macros */
21 #define CTX %r12	/* base register for the p/s0..s3 context offsets */
22 #define RIO %rsi	/* block pointer for the read/write/xor macros; same register as RT1 */
23 
24 #define RX0 %rax	/* RX0..RX3: 64-bit registers holding one block each */
25 #define RX1 %rbx
26 #define RX2 %rcx
27 #define RX3 %rdx
28 
29 #define RX0d %eax	/* 32-bit names of RX0..RX3 */
30 #define RX1d %ebx
31 #define RX2d %ecx
32 #define RX3d %edx
33 
34 #define RX0bl %al	/* bits 0..7 of RX0..RX3 */
35 #define RX1bl %bl
36 #define RX2bl %cl
37 #define RX3bl %dl
38 
39 #define RX0bh %ah	/* bits 8..15 of RX0..RX3 */
40 #define RX1bh %bh
41 #define RX2bh %ch
42 #define RX3bh %dh
43 
44 #define RT0 %rdi	/* RT0..RT3: temporaries used by F()/F4() and xor_block4() */
45 #define RT1 %rsi	/* same register as RIO: writing RT1 destroys the block pointer */
46 #define RT2 %r8
47 #define RT3 %r9
48 
49 #define RT0d %edi	/* 32-bit names of RT0..RT3 */
50 #define RT1d %esi
51 #define RT2d %r8d
52 #define RT3d %r9d
53 
54 #define RKEY %r10	/* 8 bytes of round key loaded by preload_roundkey_enc/dec */
55 
56 /***********************************************************************
57  * 1-way blowfish
58  ***********************************************************************/
59 #define F() /* swap the 32-bit halves of RX0 and XOR f(old low half) into the new low half; clobbers RT0-RT2 (RT1 is also RIO) */ \
60 	rorq $16,		RX0;	/* move bits 16..31 down into RX0bl/RX0bh */ \
61 	movzbl RX0bh,		RT0d;	/* a = original bits 24..31 */ \
62 	movzbl RX0bl,		RT1d;	/* b = original bits 16..23 */ \
63 	rolq $16,		RX0;	/* undo the rotate */ \
64 	movl s0(CTX,RT0,4),	RT0d;	/* RT0d = s0[a] */ \
65 	addl s1(CTX,RT1,4),	RT0d;	/* RT0d += s1[b] */ \
66 	movzbl RX0bh,		RT1d;	/* c = bits 8..15 */ \
67 	movzbl RX0bl,		RT2d;	/* d = bits 0..7 */ \
68 	rolq $32,		RX0;	/* swap the two 32-bit halves */ \
69 	xorl s2(CTX,RT1,4),	RT0d;	/* RT0d ^= s2[c] */ \
70 	addl s3(CTX,RT2,4),	RT0d;	/* RT0d += s3[d] */ \
71 	xorq RT0,		RX0;	/* 32-bit writes zeroed the top of RT0, so only the low half changes */
72 
73 #define add_roundkey_enc(n) /* XOR the 8 bytes at p + 4*n (entries n and n+1 of p) into RX0 */ \
74 	xorq p+4*(n)(CTX), 	RX0;
75 
76 #define round_enc(n) /* XOR entries n and n+1 of p into RX0, then apply F() twice */ \
77 	add_roundkey_enc(n); \
78 	\
79 	F(); \
80 	F();
81 
82 #define add_roundkey_dec(n) /* XOR entry n of p into the low half of RX0 and entry n-1 into the high half; clobbers RT0 */ \
83 	movq p+4*((n)-1)(CTX),	RT0;	/* low half = entry n-1, high half = entry n; n is parenthesized as in preload_roundkey_dec */ \
84 	rorq $32,		RT0;	/* swap halves: low = entry n, high = entry n-1 */ \
85 	xorq RT0,		RX0;
86 
87 #define round_dec(n) /* XOR entries n and n-1 of p into RX0 (see add_roundkey_dec), then apply F() twice */ \
88 	add_roundkey_dec(n); \
89 	\
90 	F(); \
91 	F();	/* last line of the macro: no trailing backslash, so the body does not depend on the next line being blank */
92 
93 #define read_block() /* load 8 bytes at (RIO) into RX0: bytes 0..3 become the low half and bytes 4..7 the high half, each read most-significant byte first */ \
94 	movq (RIO), 		RX0; \
95 	rorq $32, 		RX0;	/* exchange the two 4-byte words before the byte reversal */ \
96 	bswapq 			RX0;
97 
98 #define write_block() /* store RX0 to the 8 bytes at (RIO), most-significant byte first (high half lands in bytes 0..3); RX0 is left byte-reversed */ \
99 	bswapq 			RX0; \
100 	movq RX0, 		(RIO);
101 
102 SYM_FUNC_START(blowfish_enc_blk)
103 	/* Encrypt the 8-byte block at src and store the result at dst. Input:
104 	 *	%rdi: ctx
105 	 *	%rsi: dst
106 	 *	%rdx: src
107 	 */
108 	movq %rsi, %r10;	/* keep dst: %rsi is reused as RIO and as RT1 inside F() */
109 
110 	movq %rdx, RIO;		/* RIO = src for read_block() */
111 	movq %r12, %r11;	/* park the caller's %r12 in %r11 instead of on the stack */
112 	movq %rdi, CTX;
113 
114 	read_block();
115 
116 	round_enc(0);
117 	round_enc(2);
118 	round_enc(4);
119 	round_enc(6);
120 	round_enc(8);
121 	round_enc(10);
122 	round_enc(12);
123 	round_enc(14);
124 	add_roundkey_enc(16);	/* final key addition with entries 16 and 17 of p */
125 
126 	movq %r10, RIO;		/* RIO = dst for write_block() */
127 	movq %r11, %r12;	/* CTX is no longer needed: give %r12 back to the caller */
128 
129 	write_block();
130 	RET;
131 SYM_FUNC_END(blowfish_enc_blk)
132 
133 SYM_FUNC_START(blowfish_dec_blk)
134 	/* Decrypt the 8-byte block at src and store the result at dst. Input:
135 	 *	%rdi: ctx
136 	 *	%rsi: dst
137 	 *	%rdx: src
138 	 */
139 	movq %rsi, %r10;	/* keep dst: %rsi is reused as RIO and as RT1 inside F() */
140 
141 	movq %rdx, RIO;		/* RIO = src for read_block() */
142 	movq %r12, %r11;	/* park the caller's %r12 in %r11 instead of on the stack */
143 	movq %rdi, CTX;
144 
145 	read_block();
146 
147 	round_dec(17);
148 	round_dec(15);
149 	round_dec(13);
150 	round_dec(11);
151 	round_dec(9);
152 	round_dec(7);
153 	round_dec(5);
154 	round_dec(3);
155 	add_roundkey_dec(1);	/* final key addition with entries 1 and 0 of p */
156 
157 	movq %r11, %r12;	/* CTX is no longer needed: give %r12 back to the caller */
158 	movq %r10, RIO;		/* RIO = dst for write_block() */
159 
160 	write_block();
161 
162 	RET;
163 SYM_FUNC_END(blowfish_dec_blk)
164 
165 /**********************************************************************
166   4-way blowfish, four blocks parallel
167  **********************************************************************/
168 
169 /* F() for 4-way. Slower when used alone/1-way, but faster when used
170  * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
171  * Swaps the 32-bit halves of x and XORs f(old low half) into the new low half; clobbers RT0-RT3 (RT1 is also RIO). */
172 #define F4(x) \
173 	movzbl x ## bh,		RT1d;	/* c = bits 8..15 */ \
174 	movzbl x ## bl,		RT3d;	/* d = bits 0..7 */ \
175 	rorq $16,		x; \
176 	movzbl x ## bh,		RT0d;	/* a = original bits 24..31 */ \
177 	movzbl x ## bl,		RT2d;	/* b = original bits 16..23 */ \
178 	rorq $16,		x;	/* rotated by 32 in total: the halves are now swapped */ \
179 	movl s0(CTX,RT0,4),	RT0d;	/* RT0d = s0[a] */ \
180 	addl s1(CTX,RT2,4),	RT0d;	/* RT0d += s1[b] */ \
181 	xorl s2(CTX,RT1,4),	RT0d;	/* RT0d ^= s2[c] */ \
182 	addl s3(CTX,RT3,4),	RT0d;	/* RT0d += s3[d] */ \
183 	xorq RT0,		x;	/* top half of RT0 is zero, so only the low half of x changes */
184 
185 #define add_preloaded_roundkey4() /* XOR the round key held in RKEY into all four blocks RX0..RX3 */ \
186 	xorq RKEY,		RX0; \
187 	xorq RKEY,		RX1; \
188 	xorq RKEY,		RX2; \
189 	xorq RKEY,		RX3;
190 
191 #define preload_roundkey_enc(n) /* RKEY = the 8 bytes at p + 4*n (entry n in the low half, entry n+1 in the high half) */ \
192 	movq p+4*(n)(CTX),	RKEY;
193 
194 #define add_roundkey_enc4(n) /* apply the key already held in RKEY to RX0..RX3, then load the key for entries n+2 and n+3 of p */ \
195 	add_preloaded_roundkey4(); \
196 	preload_roundkey_enc((n) + 2);	/* n is parenthesized so an expression argument keeps its value */
197 
198 #define round_enc4(n) /* key addition on RX0..RX3 (also preloads the next key), then two F4() passes over all four blocks */ \
199 	add_roundkey_enc4(n); \
200 	\
201 	F4(RX0); \
202 	F4(RX1); \
203 	F4(RX2); \
204 	F4(RX3); \
205 	\
206 	F4(RX0); \
207 	F4(RX1); \
208 	F4(RX2); \
209 	F4(RX3);
210 
211 #define preload_roundkey_dec(n) /* RKEY = entry n of p in the low half, entry n-1 in the high half */ \
212 	movq p+4*((n)-1)(CTX),	RKEY;	/* loads entry n-1 low, entry n high */ \
213 	rorq $32,		RKEY;	/* swap the halves */
214 
215 #define add_roundkey_dec4(n) /* apply the key already held in RKEY to RX0..RX3, then load the key for entries n-2 and n-3 of p */ \
216 	add_preloaded_roundkey4(); \
217 	preload_roundkey_dec((n) - 2);	/* n is parenthesized so an expression argument keeps its value */
218 
219 #define round_dec4(n) /* key addition on RX0..RX3 (also preloads the next key), then two F4() passes over all four blocks */ \
220 	add_roundkey_dec4(n); \
221 	\
222 	F4(RX0); \
223 	F4(RX1); \
224 	F4(RX2); \
225 	F4(RX3); \
226 	\
227 	F4(RX0); \
228 	F4(RX1); \
229 	F4(RX2); \
230 	F4(RX3);
231 
232 #define read_block4() /* load the four consecutive 8-byte blocks at (RIO) into RX0..RX3, each with the same word exchange and byte reversal as read_block() */ \
233 	movq (RIO),		RX0; \
234 	rorq $32,		RX0; \
235 	bswapq 			RX0; \
236 	\
237 	movq 8(RIO),		RX1; \
238 	rorq $32,		RX1; \
239 	bswapq 			RX1; \
240 	\
241 	movq 16(RIO),		RX2; \
242 	rorq $32,		RX2; \
243 	bswapq 			RX2; \
244 	\
245 	movq 24(RIO),		RX3; \
246 	rorq $32,		RX3; \
247 	bswapq 			RX3;
248 
249 #define write_block4() /* byte-reverse RX0..RX3 and store them to the four consecutive 8-byte slots at (RIO) */ \
250 	bswapq 			RX0; \
251 	movq RX0,		(RIO); \
252 	\
253 	bswapq 			RX1; \
254 	movq RX1,		8(RIO); \
255 	\
256 	bswapq 			RX2; \
257 	movq RX2,		16(RIO); \
258 	\
259 	bswapq 			RX3; \
260 	movq RX3,		24(RIO);
261 
262 #define xor_block4() /* XOR the byte-reversed 8-byte words at (RIO), 8(RIO), 16(RIO) into RX1, RX2, RX3; RX0 is untouched; clobbers RT0, RT2, RT3 */ \
263 	movq (RIO),		RT0; \
264 	bswapq			RT0;	/* same byte order the blocks have before write_block4() reverses them */ \
265 	xorq RT0,		RX1; \
266 	\
267 	movq 8(RIO),		RT2; \
268 	bswapq			RT2; \
269 	xorq RT2,		RX2; \
270 	\
271 	movq 16(RIO),		RT3; \
272 	bswapq			RT3; \
273 	xorq RT3,		RX3;
274 
275 SYM_FUNC_START(blowfish_enc_blk_4way)
276 	/* Encrypt the four consecutive 8-byte blocks at src and store them at dst. Input:
277 	 *	%rdi: ctx
278 	 *	%rsi: dst
279 	 *	%rdx: src
280 	 */
281 	pushq %r12;		/* %r12 becomes CTX */
282 	pushq %rbx;		/* %rbx becomes RX1 */
283 
284 	movq %rsi, %r11;	/* keep dst: %rsi is reused as RIO and as RT1 inside F4() */
285 	movq %rdx, RIO;		/* RIO = src for read_block4() */
286 	movq %rdi, CTX;
287 
288 	preload_roundkey_enc(0);
289 
290 	read_block4();
291 
292 	round_enc4(0);
293 	round_enc4(2);
294 	round_enc4(4);
295 	round_enc4(6);
296 	round_enc4(8);
297 	round_enc4(10);
298 	round_enc4(12);
299 	round_enc4(14);
300 	add_preloaded_roundkey4();	/* final key, preloaded by round_enc4(14) */
301 
302 	movq %r11, RIO;		/* RIO = dst for write_block4() */
303 	write_block4();
304 
305 	popq %rbx;
306 	popq %r12;
307 	RET;
308 SYM_FUNC_END(blowfish_enc_blk_4way)
309 
310 SYM_FUNC_START(__blowfish_dec_blk_4way)
311 	/* Decrypt the four consecutive 8-byte blocks at src into dst; when cbc is set, output blocks 1..3 are additionally XORed with src blocks 0..2. Input:
312 	 *	%rdi: ctx
313 	 *	%rsi: dst
314 	 *	%rdx: src
315 	 *	%rcx: cbc (bool)
316 	 */
317 	pushq %r12;		/* %r12 becomes CTX */
318 	pushq %rbx;		/* %rbx becomes RX1 */
319 	pushq %rcx;		/* keep cbc: %rcx becomes RX2 */
320 	pushq %rdx;		/* keep src: %rdx becomes RX3 and RIO is clobbered by F4() */
321 
322 	movq %rdi, CTX;
323 	movq %rsi, %r11;	/* keep dst: %rsi is reused as RIO and as RT1 inside F4() */
324 	movq %rdx, RIO;
325 
326 	preload_roundkey_dec(17);
327 	read_block4();
328 
329 	round_dec4(17);
330 	round_dec4(15);
331 	round_dec4(13);
332 	round_dec4(11);
333 	round_dec4(9);
334 	round_dec4(7);
335 	round_dec4(5);
336 	round_dec4(3);
337 	add_preloaded_roundkey4();	/* final key, preloaded by round_dec4(3) */
338 
339 	popq RIO;		/* RIO = src again, for xor_block4() */
340 	popq %r12;		/* cbc flag; CTX is no longer needed */
341 	testb %r12b, %r12b;	/* test only the low byte: the upper bits of a bool argument register are unspecified */
342 	jz .L_no_cbc_xor;
343 
344 	xor_block4();
345 
346 .L_no_cbc_xor:
347 	movq %r11, RIO;		/* RIO = dst for write_block4() */
348 	write_block4();
349 
350 	popq %rbx;
351 	popq %r12;
352 
353 	RET;
354 SYM_FUNC_END(__blowfish_dec_blk_4way)
355