xref: /openbmc/linux/arch/x86/crypto/sha1_ssse3_asm.S (revision 7f2e85840871f199057e65232ebde846192ed989)
1/*
2 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
3 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
4 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
5 * boost.
6 *
7 * This work was inspired by the vectorized implementation of Dean Gaudet.
8 * Additional information on it can be found at:
9 *    http://www.arctic.org/~dean/crypto/sha1.html
10 *
11 * It was improved upon with more efficient vectorization of the message
12 * scheduling. This implementation has also been optimized for all current and
13 * several future generations of Intel CPUs.
14 *
15 * See this article for more information about the implementation details:
16 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
17 *
18 * Copyright (C) 2010, Intel Corp.
19 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
20 *            Ronen Zohar <ronen.zohar@intel.com>
21 *
22 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
23 *   Author: Mathias Krause <minipli@googlemail.com>
24 *
25 * This program is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published by
27 * the Free Software Foundation; either version 2 of the License, or
28 * (at your option) any later version.
29 */
30
31#include <linux/linkage.h>
32
/*
 * Function arguments as they arrive from the caller. CTX points at the five
 * 32-bit hash words, BUF at the input data and CNT is the number of 64-byte
 * blocks (it is multiplied by 64 in SHA1_VECTOR_ASM).
 */
33#define CTX	%rdi	// arg1
34#define BUF	%rsi	// arg2
35#define CNT	%rdx	// arg3
36
/*
 * Initial homes of the five SHA-1 working variables. REG_B, REG_C and REG_E
 * are the low halves of BUF, CTX and CNT, which is why SHA1_VECTOR_ASM copies
 * the arguments to HASH_PTR, BUFFER_PTR and BUFFER_END before the working
 * variables are loaded. REG_D lives in %r12, which SHA1_VECTOR_ASM saves.
 */
37#define REG_A	%ecx
38#define REG_B	%esi
39#define REG_C	%edi
40#define REG_D	%r12d
41#define REG_E	%edx
42
/* scalar scratch registers; %rbx is saved and restored by SHA1_VECTOR_ASM */
43#define REG_T1	%eax
44#define REG_T2	%ebx
45
/*
 * Pointers set up once in SHA1_VECTOR_ASM: round constant table, hash state,
 * current input block and end of the input.
 */
46#define K_BASE		%r8
47#define HASH_PTR	%r9
48#define BUFFER_PTR	%r10
49#define BUFFER_END	%r11
50
/* vector temporaries used by the message schedule pre-calculation */
51#define W_TMP1	%xmm0
52#define W_TMP2	%xmm9
53
/*
 * Eight XMM registers that W_PRECALC_RESET binds to the rolling window names
 * W, W_minus_04 ... W_minus_28; each holds four 32-bit schedule words.
 */
54#define W0	%xmm1
55#define W4	%xmm2
56#define W8	%xmm3
57#define W12	%xmm4
58#define W16	%xmm5
59#define W20	%xmm6
60#define W24	%xmm7
61#define W28	%xmm8
62
/* byte-swap shuffle control, loaded from BSWAP_SHUFB_CTL in SHA1_VECTOR_ASM */
63#define XMM_SHUFB_BSWAP	%xmm10
64
65/* we keep a window of 16 pre-calculated w[i]+K values (16 dwords, 64 bytes)
 * in a circular buffer at the bottom of the stack workspace; WK(t) addresses
 * the slot for round t, i.e. dword number (t mod 16) */
66#define WK(t)	(((t) & 15) * 4)(%rsp)
/* number of rounds the schedule pre-calculation runs ahead of the rounds */
67#define W_PRECALC_AHEAD	16
68
69/*
70 * This macro emits a complete SHA-1 transform function: it processes CNT
 * consecutive 64-byte blocks starting at BUF and updates the five 32-bit
 * hash words at CTX in place.
71 * param: function's name
 *
 * In:      CTX (arg1) = hash state, BUF (arg2) = input, CNT (arg3) = number
 *          of 64-byte blocks.
 * Saved:   %rbx, %r12 and %rbp are pushed on entry and popped on exit.
 * Stack:   a 64-byte, 16-byte aligned workspace (the WK() circular buffer)
 *          is carved out below the saved registers and zeroed before return.
 * Depends: xmm_mov and the W_PRECALC_00_15/16_31/32_79 dispatch macros must
 *          be defined for the wanted instruction set before this macro is
 *          expanded.
 * NOTE(review): the main body always hashes at least one block before it
 * tests for the end of the input, so CNT is presumably expected to be >= 1.
72 */
73.macro SHA1_VECTOR_ASM  name
74	ENTRY(\name)
75
76	push	%rbx
77	push	%r12
78	push	%rbp
79	mov	%rsp, %rbp		# remember the unaligned stack pointer
80
81	sub	$64, %rsp		# allocate workspace
82	and	$~15, %rsp		# align stack
83
	/* the argument registers double as working variables, so move the
	 * pointers out of the way first */
84	mov	CTX, HASH_PTR
85	mov	BUF, BUFFER_PTR
86
87	shl	$6, CNT			# multiply by 64
88	add	BUF, CNT
89	mov	CNT, BUFFER_END		# first byte past the last block
90
91	lea	K_XMM_AR(%rip), K_BASE
92	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
93
94	SHA1_PIPELINED_MAIN_BODY
95
96	# cleanup workspace
97	mov	$8, %ecx		# 8 quadwords = the 64-byte workspace
98	mov	%rsp, %rdi
99	xor	%eax, %eax		# zeroes all of %rax; no REX.W prefix needed
100	rep stosq
101
102	mov	%rbp, %rsp		# deallocate workspace
103	pop	%rbp
104	pop	%r12
105	pop	%rbx
106	ret
107
108	ENDPROC(\name)
109.endm
110
111/*
112 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 * and loops (label 1) until every block between BUFFER_PTR and BUFFER_END
 * has been processed.
 *
 * The message schedule runs W_PRECALC_AHEAD (16) rounds ahead of the scalar
 * rounds: rounds 0-15 of the first block are pre-calculated before the loop,
 * and each RR invocation issues the pre-calculation for round+16 and
 * round+17. From round 64 on this is the schedule of the NEXT block, which
 * is why BUFFER_PTR is advanced in the middle of the round sequence.
113 */
114.macro SHA1_PIPELINED_MAIN_BODY
115	INIT_REGALLOC
116
	/* load the current hash state into the five working variables */
117	mov	  (HASH_PTR), A
118	mov	 4(HASH_PTR), B
119	mov	 8(HASH_PTR), C
120	mov	12(HASH_PTR), D
121	mov	16(HASH_PTR), E
122
	/* prime the WK() buffer with w[i]+K for rounds 0-15 of the first block */
123  .set i, 0
124  .rept W_PRECALC_AHEAD
125	W_PRECALC i
126    .set i, (i+1)
127  .endr
128
/* NOTE(review): on x86 ELF targets GNU as takes the .align argument as a
 * byte count, so this requests 4-byte alignment of the loop head; confirm
 * whether 16-byte alignment was intended. */
129.align 4
1301:
	/* rounds 0-19: round function F1 */
131	RR F1,A,B,C,D,E,0
132	RR F1,D,E,A,B,C,2
133	RR F1,B,C,D,E,A,4
134	RR F1,E,A,B,C,D,6
135	RR F1,C,D,E,A,B,8
136
137	RR F1,A,B,C,D,E,10
138	RR F1,D,E,A,B,C,12
139	RR F1,B,C,D,E,A,14
140	RR F1,E,A,B,C,D,16
141	RR F1,C,D,E,A,B,18
142
	/* rounds 20-39: round function F2 */
143	RR F2,A,B,C,D,E,20
144	RR F2,D,E,A,B,C,22
145	RR F2,B,C,D,E,A,24
146	RR F2,E,A,B,C,D,26
147	RR F2,C,D,E,A,B,28
148
149	RR F2,A,B,C,D,E,30
150	RR F2,D,E,A,B,C,32
151	RR F2,B,C,D,E,A,34
152	RR F2,E,A,B,C,D,36
153	RR F2,C,D,E,A,B,38
154
	/* rounds 40-59: round function F3 */
155	RR F3,A,B,C,D,E,40
156	RR F3,D,E,A,B,C,42
157	RR F3,B,C,D,E,A,44
158	RR F3,E,A,B,C,D,46
159	RR F3,C,D,E,A,B,48
160
161	RR F3,A,B,C,D,E,50
162	RR F3,D,E,A,B,C,52
163	RR F3,B,C,D,E,A,54
164	RR F3,E,A,B,C,D,56
165	RR F3,C,D,E,A,B,58
166
	/*
	 * The pre-calculation issued from round 64 onwards loads the next
	 * block through BUFFER_PTR, so the pointer has to be final before
	 * then. After the last block it is redirected to the constant table,
	 * which gives the loads a valid source and doubles as the loop exit
	 * marker tested at the bottom.
	 */
167	add	$64, BUFFER_PTR		# move to the next 64-byte block
168	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
169	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
170
	/* rounds 60-79: round function F4 */
171	RR F4,A,B,C,D,E,60
172	RR F4,D,E,A,B,C,62
173	RR F4,B,C,D,E,A,64
174	RR F4,E,A,B,C,D,66
175	RR F4,C,D,E,A,B,68
176
177	RR F4,A,B,C,D,E,70
178	RR F4,D,E,A,B,C,72
179	RR F4,B,C,D,E,A,74
180	RR F4,E,A,B,C,D,76
181	RR F4,C,D,E,A,B,78
182
	/* add the previous hash state to the working variables and store the
	 * sums back; the sums stay in the registers for the next block */
183	UPDATE_HASH   (HASH_PTR), A
184	UPDATE_HASH  4(HASH_PTR), B
185	UPDATE_HASH  8(HASH_PTR), C
186	UPDATE_HASH 12(HASH_PTR), D
187	UPDATE_HASH 16(HASH_PTR), E
188
	/* the code at label 1 was assembled with the INIT_REGALLOC register
	 * bindings, so the values must be back in those registers before
	 * looping */
189	RESTORE_RENAMED_REGS
190	cmp	K_BASE, BUFFER_PTR	# K_BASE means, we reached the end
191	jne	1b
192.endm
193
/*
 * Bind the symbolic working-variable names A..E and the scratch names T1/T2
 * to their initial physical registers (REG_A..REG_E, REG_T1, REG_T2). These
 * are assembly-time symbols only; no instructions are emitted. The bindings
 * are changed later by SWAP_REG_NAMES.
 */
193.macro INIT_REGALLOC
194  .set A, REG_A
195  .set B, REG_B
196  .set C, REG_C
197  .set D, REG_D
198  .set E, REG_E
199  .set T1, REG_T1
200  .set T2, REG_T2
201.endm
203
/*
 * Copy the values currently named B, D, A and E back into the physical
 * registers that INIT_REGALLOC assigns to those names. The round macros
 * rename registers at assembly time, so after 80 rounds the names may refer
 * to different registers than at the loop head; this is the run-time fix-up
 * before jumping back. No move is emitted for C (see the comment below).
 * NOTE(review): presumably the sequence is chosen so that no source register
 * is overwritten before it has been read - confirm before reordering.
 */
203.macro RESTORE_RENAMED_REGS
204	# order is important (REG_C is where it should be)
205	mov	B, REG_B
206	mov	D, REG_D
207	mov	A, REG_A
208	mov	E, REG_E
209.endm
211
/*
 * Exchange the register bindings of two symbolic names at assembly time,
 * using _T as the temporary symbol. Emits no instructions.
 * params: a, b - the two symbol names to exchange
 */
211.macro SWAP_REG_NAMES  a, b
212  .set _T, \a
213  .set \a, \b
214  .set \b, _T
215.endm
217
/*
 * Round function for rounds 0-19: T1 = ((c ^ d) & b) ^ d, which selects the
 * bits of c where b is set and the bits of d where b is clear.
 * c is first copied into the scratch register and the names c and T1 are
 * exchanged, so the result is built in the register that used to hold c
 * while the name c keeps referring to an intact copy.
 */
218.macro F1  b, c, d
219	mov	\c, T1
220	SWAP_REG_NAMES \c, T1
221	xor	\d, T1			# T1 = c ^ d
222	and	\b, T1			# T1 = (c ^ d) & b
223	xor	\d, T1			# T1 = ((c ^ d) & b) ^ d
224.endm
225
/*
 * Round function for rounds 20-39 (and, through F4, 60-79):
 * T1 = b ^ c ^ d.
 * d is copied into the scratch register and the names d and T1 are
 * exchanged, so the result is built in the register that used to hold d
 * while the name d keeps referring to an intact copy.
 */
226.macro F2  b, c, d
227	mov	\d, T1
228	SWAP_REG_NAMES \d, T1
229	xor	\c, T1			# T1 = d ^ c
230	xor	\b, T1			# T1 = d ^ c ^ b
231.endm
232
/*
 * Round function for rounds 40-59: T1 = ((b | c) & d) | (b & c), i.e. the
 * bitwise majority of b, c and d. Uses T2 as a second scratch register.
 * As in F1, c is copied and the names c and T1 are exchanged so that c
 * survives while the result is built in its former register.
 */
233.macro F3  b, c ,d
234	mov	\c, T1
235	SWAP_REG_NAMES \c, T1
236	mov	\b, T2
237	or	\b, T1			# T1 = c | b
238	and	\c, T2			# T2 = b & c
239	and	\d, T1			# T1 = (c | b) & d
240	or	T2, T1			# T1 = ((c | b) & d) | (b & c)
241.endm
242
/* Round function for rounds 60-79; identical to F2 (T1 = b ^ c ^ d). */
243.macro F4  b, c, d
244	F2 \b, \c, \d
245.endm
246
/*
 * Add a hash word from memory into a working variable and store the sum
 * back, leaving the new hash word both in memory and in the register.
 * params: hash - memory operand of the hash word
 *         val  - register holding the working variable
 */
247.macro UPDATE_HASH  hash, val
248	add	\hash, \val
249	mov	\val, \hash
250.endm
251
252/*
253 * RR does two rounds of SHA-1 back to back with W[] pre-calc
254 *   t1 = F(b, c, d);   e += w(i)
255 *   e += t1;           b <<= 30;   d  += w(i+1);
256 *   t1 = F(a, b, c);
257 *   d += t1;           a <<= 5;
258 *   e += a;
259 *   t1 = e;            a >>= 7;
260 *   t1 <<= 5;
261 *   d += t1;
 *
 * All shifts in the sketch above are 32-bit rotates (rol/ror in the code),
 * and w(i) stands for the pre-added w[i]+K value read from the WK() buffer.
 *
 * params: F     - round function macro (F1..F4); leaves its result in T1
 *         a..e  - symbolic names of the five working variables
 *         round - number of the first of the two rounds (even)
 *
 * Each round function call is followed by one W_PRECALC step for a round
 * W_PRECALC_AHEAD ahead, interleaving the vector schedule computation with
 * the scalar round arithmetic.
262 */
263.macro RR  F, a, b, c, d, e, round
264	add	WK(\round), \e
265	\F   \b, \c, \d		# t1 = F(b, c, d);
266	W_PRECALC (\round + W_PRECALC_AHEAD)
267	rol	$30, \b
268	add	T1, \e
269	add	WK(\round + 1), \d
270
271	\F   \a, \b, \c
272	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
273	rol	$5, \a
274	add	\a, \e
275	add	T1, \d
276	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)
277
	/* d += rol(e, 5): rotate a copy; the names are exchanged so that the
	 * name e keeps referring to the unrotated value */
278	mov	\e, T1
279	SWAP_REG_NAMES \e, T1
280
281	rol	$5, T1
282	add	T1, \d
283
284	# write:  \a, \b
285	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
286.endm
287
/*
 * Emit one step of the message schedule pre-calculation for round r.
 *
 * Four consecutive values of r (r & 3 = 0..3) together compute one vector
 * of four w[] words, add the round constant and store the sums to WK().
 * The step is selected inside the W_PRECALC_00_15/16_31/32_79 macros, which
 * read the assembly-time symbol i set here.
 *
 * K_XMM is set to the byte offset of the round constant row in K_XMM_AR for
 * rounds below 80. Values of r from 80 to 80 + W_PRECALC_AHEAD - 1 wrap
 * around to rounds 0-15 of the next block; at wrapped round 0 the register
 * window names are reset.
 * param: r - round number, 0 .. 80 + W_PRECALC_AHEAD - 1
 */
288.macro W_PRECALC  r
289  .set i, \r
290
291  .if (i < 20)
292    .set K_XMM, 0
293  .elseif (i < 40)
294    .set K_XMM, 16
295  .elseif (i < 60)
296    .set K_XMM, 32
297  .elseif (i < 80)
298    .set K_XMM, 48
299  .endif
300
301  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
302    .set i, ((\r) % 80)	    # pre-compute for the next iteration
303    .if (i == 0)
304	W_PRECALC_RESET
305    .endif
306	W_PRECALC_00_15
307  .elseif (i<32)
308	W_PRECALC_16_31
309  .elseif (i < 80)   // rounds 32-79
310	W_PRECALC_32_79
311  .endif
312.endm
313
/*
 * Bind the rolling window names to their initial XMM registers. W is the
 * vector being produced and W_minus_NN the vector produced NN rounds
 * earlier. W_minus_32 is bound to the same register as W. Assembly-time
 * only; emits no instructions.
 */
314.macro W_PRECALC_RESET
315  .set W,          W0
316  .set W_minus_04, W4
317  .set W_minus_08, W8
318  .set W_minus_12, W12
319  .set W_minus_16, W16
320  .set W_minus_20, W20
321  .set W_minus_24, W24
322  .set W_minus_28, W28
323  .set W_minus_32, W
324.endm
325
/*
 * Advance the rolling window by one vector (four rounds): every name takes
 * over the register of its younger neighbour, and W is rebound to the
 * register that held the oldest vector (previously W_minus_28), which is
 * therefore also what W_minus_32 names afterwards. The statements are
 * evaluated top to bottom, so their order matters. Assembly-time only;
 * emits no instructions.
 */
326.macro W_PRECALC_ROTATE
327  .set W_minus_32, W_minus_28
328  .set W_minus_28, W_minus_24
329  .set W_minus_24, W_minus_20
330  .set W_minus_20, W_minus_16
331  .set W_minus_16, W_minus_12
332  .set W_minus_12, W_minus_08
333  .set W_minus_08, W_minus_04
334  .set W_minus_04, W
335  .set W,          W_minus_32
336.endm
337
/*
 * Expanding this macro defines the SSSE3 flavour of the message schedule:
 * the three dispatch macros called by W_PRECALC and the three worker macros
 * they forward to. Every worker emits one of four steps, selected by the
 * assembly-time symbol i (i & 3); the fourth step stores w+K to WK() and
 * rotates the register window.
 */
338.macro W_PRECALC_SSSE3
339
/* dispatch macros: route W_PRECALC to the SSSE3 workers */
340.macro W_PRECALC_00_15
341	W_PRECALC_00_15_SSSE3
342.endm
343.macro W_PRECALC_16_31
344	W_PRECALC_16_31_SSSE3
345.endm
346.macro W_PRECALC_32_79
347	W_PRECALC_32_79_SSSE3
348.endm
349
350/* message scheduling pre-compute for rounds 0-15
 *
 * w[0..15] are the input words themselves: load 16 bytes, reverse the byte
 * order inside each dword, keep the result in W, add the round constant and
 * store the sum to the WK() buffer
 */
351.macro W_PRECALC_00_15_SSSE3
352  .if ((i & 3) == 0)
353	movdqu	(i*4)(BUFFER_PTR), W_TMP1	# unaligned load of 4 input words
354  .elseif ((i & 3) == 1)
355	pshufb	XMM_SHUFB_BSWAP, W_TMP1		# byte-swap each dword
356	movdqa	W_TMP1, W
357  .elseif ((i & 3) == 2)
358	paddd	(K_BASE), W_TMP1		# add first row of K_XMM_AR
359  .elseif ((i & 3) == 3)
360	movdqa  W_TMP1, WK(i&~3)
361	W_PRECALC_ROTATE
362  .endif
363.endm
364
365/* message scheduling pre-compute for rounds 16-31
366 *
367 * - calculating last 32 w[i] values in 8 XMM registers
368 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
369 *   instruction
370 *
371 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
372 * dependency, but improves for 32-79
 *
 * The top lane of the new vector needs w[i], the lowest lane of the same
 * vector, as its w[i-3] term. psrldq shifts a zero into that position, the
 * other three terms are combined and rotated left by 1, and the missing
 * term is patched in afterwards: it equals the un-rotated lowest lane moved
 * to the top lane (pslldq $12) and rotated left by 2.
373 */
374.macro W_PRECALC_16_31_SSSE3
375  # blended scheduling of vector and scalar instruction streams, one 4-wide
376  # vector iteration / 4 scalar rounds
377  .if ((i & 3) == 0)
378	movdqa	W_minus_12, W
379	palignr	$8, W_minus_16, W	# w[i-14]
380	movdqa	W_minus_04, W_TMP1
381	psrldq	$4, W_TMP1		# w[i-3]
382	pxor	W_minus_08, W
383  .elseif ((i & 3) == 1)
384	pxor	W_minus_16, W_TMP1
385	pxor	W_TMP1, W		# xor of all four terms, not yet rotated
386	movdqa	W, W_TMP2
387	movdqa	W, W_TMP1
388	pslldq	$12, W_TMP2		# lowest lane moved to the top lane
389  .elseif ((i & 3) == 2)
390	psrld	$31, W
391	pslld	$1, W_TMP1
392	por	W, W_TMP1		# rotate left by 1
393	movdqa	W_TMP2, W
394	psrld	$30, W_TMP2
395	pslld	$2, W			# W, W_TMP2: two halves of rotate by 2
396  .elseif ((i & 3) == 3)
397	pxor	W, W_TMP1
398	pxor	W_TMP2, W_TMP1		# patch the top lane
399	movdqa	W_TMP1, W
400	paddd	K_XMM(K_BASE), W_TMP1
401	movdqa	W_TMP1, WK(i&~3)
402	W_PRECALC_ROTATE
403  .endif
404.endm
405
406/* message scheduling pre-compute for rounds 32-79
407 *
408 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
409 * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
410 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
411 */
412.macro W_PRECALC_32_79_SSSE3
413  .if ((i & 3) == 0)
414	movdqa	W_minus_04, W_TMP1
415	pxor	W_minus_28, W		# W is W_minus_32 before xor
416	palignr	$8, W_minus_08, W_TMP1	# w[i-6]
417  .elseif ((i & 3) == 1)
418	pxor	W_minus_16, W
419	pxor	W_TMP1, W
420	movdqa	W, W_TMP1
421  .elseif ((i & 3) == 2)
422	psrld	$30, W
423	pslld	$2, W_TMP1
424	por	W, W_TMP1		# rotate left by 2
425  .elseif ((i & 3) == 3)
426	movdqa	W_TMP1, W
427	paddd	K_XMM(K_BASE), W_TMP1
428	movdqa	W_TMP1, WK(i&~3)
429	W_PRECALC_ROTATE
430  .endif
431.endm
432
433.endm		// W_PRECALC_SSSE3
434
435
/* the four SHA-1 round constants, one per group of 20 rounds */
435#define K1	0x5a827999
436#define K2	0x6ed9eba1
437#define K3	0x8f1bbcdc
438#define K4	0xca62c1d6
439
440.section .rodata
441.align 16
442
/*
 * Round constants replicated four times each so that one paddd adds the
 * constant to four schedule words. W_PRECALC selects the 16-byte row through
 * the K_XMM offset (0, 16, 32 or 48). The table is 64 bytes long and is also
 * used as the dummy input source after the last block.
 */
443K_XMM_AR:
444	.long K1, K1, K1, K1
445	.long K2, K2, K2, K2
446	.long K3, K3, K3, K3
447	.long K4, K4, K4, K4
448
/* pshufb control mask that reverses the byte order inside each dword */
449BSWAP_SHUFB_CTL:
450	.long 0x00010203
451	.long 0x04050607
452	.long 0x08090a0b
453	.long 0x0c0d0e0f
455
456
457.section .text
458
/* define the SSSE3 message schedule macros used by W_PRECALC */
459W_PRECALC_SSSE3
/* unaligned 128-bit move used by SHA1_VECTOR_ASM to load the shuffle mask */
460.macro xmm_mov a, b
461	movdqu	\a,\b
462.endm
463
464/* SSSE3 optimized implementation:
465 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
466 *                                       unsigned int rounds);
 *
 * The assembly reads exactly three arguments (CTX, BUF, CNT); "rounds" is
 * the number of 64-byte blocks to process. No workspace pointer is taken:
 * the workspace is allocated on the stack by SHA1_VECTOR_ASM.
 * NOTE(review): confirm the parameter types against the C declaration.
467 */
468SHA1_VECTOR_ASM     sha1_transform_ssse3
469
470#ifdef CONFIG_AS_AVX
471
/*
 * Expanding this macro replaces the three W_PRECALC dispatch macros with
 * versions that forward to AVX workers, and defines those workers. The
 * workers compute the same schedule as the SSSE3 ones but use the
 * three-operand VEX forms, which removes most register copies. As in the
 * SSSE3 flavour, each worker emits one of four steps selected by i & 3.
 */
472.macro W_PRECALC_AVX
473
/* drop the previous dispatch macros and route W_PRECALC to the AVX workers */
474.purgem W_PRECALC_00_15
475.macro  W_PRECALC_00_15
476    W_PRECALC_00_15_AVX
477.endm
478.purgem W_PRECALC_16_31
479.macro  W_PRECALC_16_31
480    W_PRECALC_16_31_AVX
481.endm
482.purgem W_PRECALC_32_79
483.macro  W_PRECALC_32_79
484    W_PRECALC_32_79_AVX
485.endm
486
/* rounds 0-15: load four input words, byte-swap them into W, add the round
 * constant and store the sum to WK() */
487.macro W_PRECALC_00_15_AVX
488  .if ((i & 3) == 0)
489	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
490  .elseif ((i & 3) == 1)
491	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
492  .elseif ((i & 3) == 2)
493	vpaddd	(K_BASE), W, W_TMP1
494  .elseif ((i & 3) == 3)
495	vmovdqa	W_TMP1, WK(i&~3)
496	W_PRECALC_ROTATE
497  .endif
498.endm
499
/* rounds 16-31: xor the four terms with the top lane of the w[i-3] term
 * zero-filled, rotate left by 1, then patch the top lane with the
 * un-rotated lowest lane rotated left by 2 */
500.macro W_PRECALC_16_31_AVX
501  .if ((i & 3) == 0)
502	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
503	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
504	vpxor	W_minus_08, W, W
505	vpxor	W_minus_16, W_TMP1, W_TMP1
506  .elseif ((i & 3) == 1)
507	vpxor	W_TMP1, W, W
508	vpslldq	$12, W, W_TMP2		# lowest lane moved to the top lane
509	vpslld	$1, W, W_TMP1
510  .elseif ((i & 3) == 2)
511	vpsrld	$31, W, W
512	vpor	W, W_TMP1, W_TMP1	# rotate left by 1
513	vpslld	$2, W_TMP2, W
514	vpsrld	$30, W_TMP2, W_TMP2	# W, W_TMP2: two halves of rotate by 2
515  .elseif ((i & 3) == 3)
516	vpxor	W, W_TMP1, W_TMP1
517	vpxor	W_TMP2, W_TMP1, W	# patch the top lane
518	vpaddd	K_XMM(K_BASE), W, W_TMP1
519	vmovdqu	W_TMP1, WK(i&~3)
520	W_PRECALC_ROTATE
521  .endif
522.endm
523
/* rounds 32-79: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2,
 * computed four words at a time */
524.macro W_PRECALC_32_79_AVX
525  .if ((i & 3) == 0)
526	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
527	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
528  .elseif ((i & 3) == 1)
529	vpxor	W_minus_16, W_TMP1, W_TMP1
530	vpxor	W_TMP1, W, W
531  .elseif ((i & 3) == 2)
532	vpslld	$2, W, W_TMP1
533	vpsrld	$30, W, W
534	vpor	W, W_TMP1, W		# rotate left by 2
535  .elseif ((i & 3) == 3)
536	vpaddd	K_XMM(K_BASE), W, W_TMP1
537	vmovdqu	W_TMP1, WK(i&~3)
538	W_PRECALC_ROTATE
539  .endif
540.endm
541
542.endm    // W_PRECALC_AVX
543
/* switch the W_PRECALC dispatch macros over to the AVX workers */
544W_PRECALC_AVX
/* replace xmm_mov with the VEX-encoded unaligned move */
545.purgem xmm_mov
546.macro xmm_mov a, b
547	vmovdqu	\a,\b
548.endm
549
550
551/* AVX optimized implementation:
552 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
553 *                                     unsigned int rounds);
 *
 * The assembly reads exactly three arguments (CTX, BUF, CNT); "rounds" is
 * the number of 64-byte blocks to process. No workspace pointer is taken:
 * the workspace is allocated on the stack by SHA1_VECTOR_ASM.
 * NOTE(review): confirm the parameter types against the C declaration.
554 */
555SHA1_VECTOR_ASM     sha1_transform_avx
556
557#endif
558