/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of w[i]+K pre-calculated values in a 64-byte (16 x 4-byte)
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
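
/*
 * For illustration: WK(t) maps round t onto one of the 16 four-byte slots,
 * so rounds 16 apart share a slot (WK(0) and WK(16) are both 0(%rsp), WK(35)
 * is 12(%rsp)).  A slot is overwritten only by the pre-calculation running
 * W_PRECALC_AHEAD rounds ahead, after the round owning it has consumed it.
 */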

/*
 * This macro emits the body of a SHA-1 transform function that processes
 * the input in 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	SYM_TYPED_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
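	# (8 quadwords = the 64-byte WK buffer, which holds message-derived data)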
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	RET

	SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if this was the last block, use a
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid overrunning the buffer

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# BUFFER_PTR == K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm
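
/*
 * Note: SWAP_REG_NAMES exchanges two assembler-time names, not register
 * contents.  In F1 below, for instance, "mov \c, T1" duplicates c and the
 * swap then lets the F-value be built up in the register that originally
 * held c, while the fresh copy keeps serving as c under its old name, so
 * values end up where later rounds expect them without extra register moves.
 */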

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 * (<<= and >>= in the pseudo code below denote rotation, not shifting)
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
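
/*
 * Since RR retires two rounds and finishes with the roles rotated as noted
 * above, callers advance the register argument list by two positions per
 * invocation, e.g.:
 *	RR F1,A,B,C,D,E,0
 *	RR F1,D,E,A,B,C,2
 */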

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)	    # pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm
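
/*
 * W_PRECALC_RESET and W_PRECALC_ROTATE implement the eight-register sliding
 * window purely by assembler-time renaming: after each group of four words
 * the symbol W is rebound to the register holding the oldest group
 * (W_minus_32), so no data ever moves between XMM registers.
 */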

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa  W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - keep the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, to be loaded
 *   later by the scalar ALU add instruction
 *
 * the w[i] -> w[i-3] dependency forces some "heavy-lifting" vectorization
 * for rounds 16-31; rounds 32-79 vectorize more cleanly
 */
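
/*
 * Within one group w[i..i+3], lane 3 needs w[i] (as w[(i+3)-3]), which is
 * being computed in the same group.  The code below therefore builds lane 3
 * without that term and patches it afterwards: pslldq $12 isolates the
 * pre-rotation value of lane 0, which is then folded into lane 3 rotated
 * left by 2, i.e. rol(w[i], 1) on top of the already rotated partial lane;
 * this is valid because rotation distributes over xor.
 */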
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification uses:  w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent form:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] -> w[i-3]
 * dependency is broken
 */
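
/*
 * Sketch of why the two forms agree (rotation distributes over xor):
 * expanding w[i-8], w[i-14] and w[i-16] with the original recurrence gives
 *   w[i-8] ^ w[i-14] ^ w[i-16] =
 *	(w[i-11] ^ w[i-16] ^ w[i-17] ^ w[i-19] ^ w[i-28] ^ w[i-32]) rol 1
 * (the w[i-22], w[i-24] and w[i-30] terms cancel in pairs); substituting
 * this and w[i-3] = (w[i-6] ^ w[i-11] ^ w[i-17] ^ w[i-19]) rol 1 into the
 * original recurrence leaves exactly (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32])
 * rol 2, valid for i >= 32.
 */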
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

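/* pshufb / vpshufb control mask that reverses the bytes within each 32-bit
 * word, converting the big-endian message words to host byte order */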
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *					const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3
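
/*
 * A minimal sketch of a caller, for illustration only (SHA1_H0..SHA1_H4 and
 * SHA1_BLOCK_SIZE come from <crypto/sha1.h>, kernel_fpu_begin/end from
 * <asm/fpu/api.h>; the in-kernel glue code additionally handles buffering
 * of partial blocks).  The transform must run with the FPU/SIMD state
 * claimed:
 *
 *	struct sha1_state st = { .state = { SHA1_H0, SHA1_H1, SHA1_H2,
 *					    SHA1_H3, SHA1_H4 } };
 *	kernel_fpu_begin();
 *	sha1_transform_ssse3(&st, data, nbytes / SHA1_BLOCK_SIZE);
 *	kernel_fpu_end();
 */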

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *				       const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx