xref: /openbmc/linux/arch/x86/crypto/blake2s-core.S (revision 7b73a9c8e26ce5769c41d4b787767c10fe7269db)
1/* SPDX-License-Identifier: GPL-2.0 OR MIT */
2/*
3 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 */
6
7#include <linux/linkage.h>
8
/*
 * Read-only constant tables shared by the compression routines in this file.
 * Each table sits in its own mergeable ("aM") section; the number that ends
 * every .section line is that section's entity size in bytes.
 */

/*
 * IV: eight 32-bit words (32 bytes).  .octa is stored little-endian, so the
 * words in memory order are 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 * 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19.  The code loads the first
 * 16 bytes as IV(%rip) and the second 16 bytes as IV+0x10 / IV+16.
 */
9.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
10.align 32
11IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
12	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/*
 * ROT16: pshufb control mask.  Within every 32-bit lane the source bytes are
 * picked in the order 2,3,0,1, i.e. each dword is rotated by 16 bits.
 */
13.section .rodata.cst16.ROT16, "aM", @progbits, 16
14.align 16
15ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/*
 * ROR328: pshufb control mask.  Within every 32-bit lane the source bytes are
 * picked in the order 1,2,3,0, i.e. each dword is rotated right by 8 bits.
 */
16.section .rodata.cst16.ROR328, "aM", @progbits, 16
17.align 16
18ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * SIGMA: 10 rows of 16 byte-sized message-word indices (160 = 0xa0 bytes),
 * one row per round.  blake2s_compress_ssse3 reads each index with movzbl and
 * uses it, scaled by 4, to fetch a 32-bit word from the message block.  The
 * row is consumed four bytes at a time: bytes 0-3, 4-7, 8-11 and 12-15 each
 * supply the four words added in one vectorised mixing step.
 */
19.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
20.align 64
21SIGMA:
22.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
23.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
24.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
25.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
26.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
27.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
28.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
29.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
30.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
31.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
32#ifdef CONFIG_AS_AVX512
/*
 * SIGMA2: 10 rows of 16 dword indices (640 bytes, 0x40 per row), one row per
 * round, used by blake2s_compress_avx512 as vpermi2d index vectors.  That
 * routine writes the permuted message back into its message registers after
 * every round, so each row is applied to the word order left by the previous
 * round rather than to the original block; only the first row equals the
 * first row of SIGMA.
 */
33.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
34.align 64
35SIGMA2:
36.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
37.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
38.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
39.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
40.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
41.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
42.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
43.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
44.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
45.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
46#endif /* CONFIG_AS_AVX512 */
47
48.text
49#ifdef CONFIG_AS_SSSE3
/*
 * blake2s_compress_ssse3 - BLAKE2s compression of a run of 64-byte message
 * blocks, using pshufb (SSSE3) for the 16-bit and 8-bit rotations.
 *
 * Register arguments, as the code below uses them:
 *   %rdi  state; 0x30 bytes are loaded on entry and stored back on exit
 *   %rsi  message; advanced by 0x40 after every block
 *   %rdx  number of blocks; zero returns at once and leaves the state alone
 *   %rcx  amount added to the 64-bit value at state+0x20 before each block
 *
 * NOTE(review): this presumably corresponds to a C prototype of the form
 * (struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc)
 * with h[8] at offset 0 and t[2], f[2] at offset 0x20 - confirm against the
 * C glue code and the struct definition.
 *
 * Register roles inside the function:
 *   %xmm0-%xmm3    rows a, b, c, d of the 4x4 working state
 *   %xmm4-%xmm7    message words gathered for the current step
 *   %xmm8          scratch for the shift-based rotations
 *   %xmm10,%xmm11  copy of rows a, b taken at the start of the block
 *   %xmm12,%xmm13  ROT16 / ROR328 shuffle masks
 *   %xmm14         the 16 bytes from state+0x20 (counter half is updated)
 *   %xmm15         %rcx zero-extended to 128 bits
 *   %rcx           cursor into SIGMA, %r8 end of SIGMA, %rax index scratch
 *
 * Modifies %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15 and flags.
 * Returns nothing.
 */
50SYM_FUNC_START(blake2s_compress_ssse3)
	/* Nothing to do for zero blocks: skip the loads and the store-back. */
51	testq		%rdx,%rdx
52	je		.Lendofloop
53	movdqu		(%rdi),%xmm0
54	movdqu		0x10(%rdi),%xmm1
55	movdqa		ROT16(%rip),%xmm12
56	movdqa		ROR328(%rip),%xmm13
57	movdqu		0x20(%rdi),%xmm14
	/* movq clears the upper 64 bits, so the paddq below only changes the low qword. */
58	movq		%rcx,%xmm15
	/* %r8 = one past the last SIGMA row (10 rows * 16 bytes = 0xa0). */
59	leaq		SIGMA+0xa0(%rip),%r8
	/* Jump over the alignment padding in front of the loop head. */
60	jmp		.Lbeginofloop
61	.align		32
	/* Per-block loop: one iteration per 64-byte block at (%rsi). */
62.Lbeginofloop:
	/* Keep the incoming a, b rows for the final XOR into the result. */
63	movdqa		%xmm0,%xmm10
64	movdqa		%xmm1,%xmm11
	/* Advance the 64-bit counter held in the low half of %xmm14. */
65	paddq		%xmm15,%xmm14
	/* c = IV[0..3], d = (counter and following 8 bytes) ^ IV[4..7]. */
66	movdqa		IV(%rip),%xmm2
67	movdqa		%xmm14,%xmm3
68	pxor		IV+0x10(%rip),%xmm3
	/* %rcx is free again (its value lives in %xmm15): reuse it as SIGMA cursor. */
69	leaq		SIGMA(%rip),%rcx
	/* Round loop: one SIGMA row (16 index bytes) per iteration, 10 iterations. */
70.Lroundloop:
	/* Gather message words SIGMA[0..3] into %xmm4. */
71	movzbl		(%rcx),%eax
72	movd		(%rsi,%rax,4),%xmm4
73	movzbl		0x1(%rcx),%eax
74	movd		(%rsi,%rax,4),%xmm5
75	movzbl		0x2(%rcx),%eax
76	movd		(%rsi,%rax,4),%xmm6
77	movzbl		0x3(%rcx),%eax
78	movd		(%rsi,%rax,4),%xmm7
79	punpckldq	%xmm5,%xmm4
80	punpckldq	%xmm7,%xmm6
81	punpcklqdq	%xmm6,%xmm4
	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12). */
82	paddd		%xmm4,%xmm0
83	paddd		%xmm1,%xmm0
84	pxor		%xmm0,%xmm3
85	pshufb		%xmm12,%xmm3
86	paddd		%xmm3,%xmm2
87	pxor		%xmm2,%xmm1
88	movdqa		%xmm1,%xmm8
89	psrld		$0xc,%xmm1
90	pslld		$0x14,%xmm8
91	por		%xmm8,%xmm1
	/* Gather message words SIGMA[4..7] into %xmm5. */
92	movzbl		0x4(%rcx),%eax
93	movd		(%rsi,%rax,4),%xmm5
94	movzbl		0x5(%rcx),%eax
95	movd		(%rsi,%rax,4),%xmm6
96	movzbl		0x6(%rcx),%eax
97	movd		(%rsi,%rax,4),%xmm7
98	movzbl		0x7(%rcx),%eax
99	movd		(%rsi,%rax,4),%xmm4
100	punpckldq	%xmm6,%xmm5
101	punpckldq	%xmm4,%xmm7
102	punpcklqdq	%xmm7,%xmm5
	/* a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7). */
103	paddd		%xmm5,%xmm0
104	paddd		%xmm1,%xmm0
105	pxor		%xmm0,%xmm3
106	pshufb		%xmm13,%xmm3
107	paddd		%xmm3,%xmm2
108	pxor		%xmm2,%xmm1
109	movdqa		%xmm1,%xmm8
110	psrld		$0x7,%xmm1
111	pslld		$0x19,%xmm8
112	por		%xmm8,%xmm1
	/*
	 * Rotate the 32-bit lanes of rows a, d and c (row b stays put) so the
	 * next two steps mix the diagonals of the state instead of the columns.
	 */
113	pshufd		$0x93,%xmm0,%xmm0
114	pshufd		$0x4e,%xmm3,%xmm3
115	pshufd		$0x39,%xmm2,%xmm2
	/* Gather message words SIGMA[8..11] into %xmm6. */
116	movzbl		0x8(%rcx),%eax
117	movd		(%rsi,%rax,4),%xmm6
118	movzbl		0x9(%rcx),%eax
119	movd		(%rsi,%rax,4),%xmm7
120	movzbl		0xa(%rcx),%eax
121	movd		(%rsi,%rax,4),%xmm4
122	movzbl		0xb(%rcx),%eax
123	movd		(%rsi,%rax,4),%xmm5
124	punpckldq	%xmm7,%xmm6
125	punpckldq	%xmm5,%xmm4
126	punpcklqdq	%xmm4,%xmm6
	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12). */
127	paddd		%xmm6,%xmm0
128	paddd		%xmm1,%xmm0
129	pxor		%xmm0,%xmm3
130	pshufb		%xmm12,%xmm3
131	paddd		%xmm3,%xmm2
132	pxor		%xmm2,%xmm1
133	movdqa		%xmm1,%xmm8
134	psrld		$0xc,%xmm1
135	pslld		$0x14,%xmm8
136	por		%xmm8,%xmm1
	/* Gather message words SIGMA[12..15] into %xmm7. */
137	movzbl		0xc(%rcx),%eax
138	movd		(%rsi,%rax,4),%xmm7
139	movzbl		0xd(%rcx),%eax
140	movd		(%rsi,%rax,4),%xmm4
141	movzbl		0xe(%rcx),%eax
142	movd		(%rsi,%rax,4),%xmm5
143	movzbl		0xf(%rcx),%eax
144	movd		(%rsi,%rax,4),%xmm6
145	punpckldq	%xmm4,%xmm7
146	punpckldq	%xmm6,%xmm5
147	punpcklqdq	%xmm5,%xmm7
	/* a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7). */
148	paddd		%xmm7,%xmm0
149	paddd		%xmm1,%xmm0
150	pxor		%xmm0,%xmm3
151	pshufb		%xmm13,%xmm3
152	paddd		%xmm3,%xmm2
153	pxor		%xmm2,%xmm1
154	movdqa		%xmm1,%xmm8
155	psrld		$0x7,%xmm1
156	pslld		$0x19,%xmm8
157	por		%xmm8,%xmm1
	/* Inverse lane rotations: put rows a, d and c back in column order. */
158	pshufd		$0x39,%xmm0,%xmm0
159	pshufd		$0x4e,%xmm3,%xmm3
160	pshufd		$0x93,%xmm2,%xmm2
	/* Next SIGMA row; stop once the cursor reaches the end of the table. */
161	addq		$0x10,%rcx
162	cmpq		%r8,%rcx
163	jnz		.Lroundloop
	/* New a = a ^ c ^ saved a, new b = b ^ d ^ saved b. */
164	pxor		%xmm2,%xmm0
165	pxor		%xmm3,%xmm1
166	pxor		%xmm10,%xmm0
167	pxor		%xmm11,%xmm1
	/* Step to the next 64-byte block and count it off. */
168	addq		$0x40,%rsi
169	decq		%rdx
170	jnz		.Lbeginofloop
	/* Write the two result rows and the updated counter row back to the state. */
171	movdqu		%xmm0,(%rdi)
172	movdqu		%xmm1,0x10(%rdi)
173	movdqu		%xmm14,0x20(%rdi)
174.Lendofloop:
175	ret
176SYM_FUNC_END(blake2s_compress_ssse3)
177#endif /* CONFIG_AS_SSSE3 */
178
179#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - BLAKE2s compression of a run of 64-byte message
 * blocks, using vpermi2d for the message schedule and vprord for the
 * rotations (EVEX-encoded on xmm/ymm registers).
 *
 * Register arguments, as the code below uses them:
 *   %rdi  state; 0x30 bytes are loaded on entry and stored back on exit
 *   %rsi  message; 0x40 bytes are loaded per block and the pointer advanced
 *   %rdx  number of blocks; zero returns at once and leaves the state alone
 *   %rcx  amount added to the 64-bit value at state+0x20 before each block
 *
 * NOTE(review): this presumably corresponds to a C prototype of the form
 * (struct blake2s_state *state, const u8 *block, size_t nblocks, u32 inc)
 * with h[8] at offset 0 and t[2], f[2] at offset 0x20 - confirm against the
 * C glue code and the struct definition.
 *
 * Register roles inside the function:
 *   %xmm0-%xmm3    rows a, b, c, d of the 4x4 working state
 *   %xmm4          the 16 bytes from state+0x20 (counter half is updated)
 *   %xmm5          %rcx zero-extended to 128 bits
 *   %ymm6,%ymm7    the 16 message words, re-permuted in place every round
 *   %ymm8,%ymm9    SIGMA2 index vectors, overwritten with the permuted words
 *   %xmm10,%xmm11  copy of rows a, b taken at the start of the block
 *   %xmm14,%xmm15  IV[0..3], IV[4..7]
 *   %rax           cursor into SIGMA2, %cl round counter
 *
 * Modifies %rax, %rcx, %rdx, %rsi, %xmm0-%xmm11 (upper halves of
 * %ymm6-%ymm9 included), %xmm14, %xmm15 and flags.  Returns nothing.
 */
180SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Zero blocks: return before anything is loaded, as
	 * blake2s_compress_ssse3 does.  Without this check the main loop
	 * would read a block from (%rsi) and decq would wrap %rdx to 2^64-1.
	 * No vector register has been written yet, so vzeroupper is not needed
	 * on this path.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_end
181	vmovdqu		(%rdi),%xmm0
182	vmovdqu		0x10(%rdi),%xmm1
183	vmovdqu		0x20(%rdi),%xmm4
	/* vmovq clears the upper 64 bits, so the vpaddq below only changes the low qword. */
184	vmovq		%rcx,%xmm5
185	vmovdqa		IV(%rip),%xmm14
186	vmovdqa		IV+16(%rip),%xmm15
	/* Jump over the alignment padding in front of the loop head. */
187	jmp		.Lblake2s_compress_avx512_mainloop
188.align 32
	/* Per-block loop: one iteration per 64-byte block at (%rsi). */
189.Lblake2s_compress_avx512_mainloop:
	/* Keep the incoming a, b rows for the final XOR into the result. */
190	vmovdqa		%xmm0,%xmm10
191	vmovdqa		%xmm1,%xmm11
	/* Advance the 64-bit counter held in the low half of %xmm4. */
192	vpaddq		%xmm5,%xmm4,%xmm4
	/* c = IV[0..3], d = (counter and following 8 bytes) ^ IV[4..7]. */
193	vmovdqa		%xmm14,%xmm2
194	vpxor		%xmm15,%xmm4,%xmm3
	/* Load the whole block: words 0-7 in %ymm6, words 8-15 in %ymm7. */
195	vmovdqu		(%rsi),%ymm6
196	vmovdqu		0x20(%rsi),%ymm7
197	addq		$0x40,%rsi
198	leaq		SIGMA2(%rip),%rax
	/* %rcx is free again (its value lives in %xmm5): %cl counts the 10 rounds. */
199	movb		$0xa,%cl
	/* Round loop: one SIGMA2 row (16 dword indices, 0x40 bytes) per iteration. */
200.Lblake2s_compress_avx512_roundloop:
201	addq		$0x40,%rax
202	vmovdqa		-0x40(%rax),%ymm8
203	vmovdqa		-0x20(%rax),%ymm9
	/*
	 * vpermi2d replaces each index in %ymm8 / %ymm9 with the dword it
	 * selects from the 16-entry table %ymm6:%ymm7.
	 */
204	vpermi2d	%ymm7,%ymm6,%ymm8
205	vpermi2d	%ymm7,%ymm6,%ymm9
	/* Keep the permuted order: the next SIGMA2 row is relative to it. */
206	vmovdqa		%ymm8,%ymm6
207	vmovdqa		%ymm9,%ymm7
	/* Low half of %ymm8: a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12). */
208	vpaddd		%xmm8,%xmm0,%xmm0
209	vpaddd		%xmm1,%xmm0,%xmm0
210	vpxor		%xmm0,%xmm3,%xmm3
211	vprord		$0x10,%xmm3,%xmm3
212	vpaddd		%xmm3,%xmm2,%xmm2
213	vpxor		%xmm2,%xmm1,%xmm1
214	vprord		$0xc,%xmm1,%xmm1
	/* High half of %ymm8: a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7). */
215	vextracti128	$0x1,%ymm8,%xmm8
216	vpaddd		%xmm8,%xmm0,%xmm0
217	vpaddd		%xmm1,%xmm0,%xmm0
218	vpxor		%xmm0,%xmm3,%xmm3
219	vprord		$0x8,%xmm3,%xmm3
220	vpaddd		%xmm3,%xmm2,%xmm2
221	vpxor		%xmm2,%xmm1,%xmm1
222	vprord		$0x7,%xmm1,%xmm1
	/*
	 * Rotate the 32-bit lanes of rows a, d and c (row b stays put) so the
	 * next two steps mix the diagonals of the state instead of the columns.
	 */
223	vpshufd		$0x93,%xmm0,%xmm0
224	vpshufd		$0x4e,%xmm3,%xmm3
225	vpshufd		$0x39,%xmm2,%xmm2
	/* Low half of %ymm9: a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12). */
226	vpaddd		%xmm9,%xmm0,%xmm0
227	vpaddd		%xmm1,%xmm0,%xmm0
228	vpxor		%xmm0,%xmm3,%xmm3
229	vprord		$0x10,%xmm3,%xmm3
230	vpaddd		%xmm3,%xmm2,%xmm2
231	vpxor		%xmm2,%xmm1,%xmm1
232	vprord		$0xc,%xmm1,%xmm1
	/* High half of %ymm9: a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7). */
233	vextracti128	$0x1,%ymm9,%xmm9
234	vpaddd		%xmm9,%xmm0,%xmm0
235	vpaddd		%xmm1,%xmm0,%xmm0
236	vpxor		%xmm0,%xmm3,%xmm3
237	vprord		$0x8,%xmm3,%xmm3
238	vpaddd		%xmm3,%xmm2,%xmm2
239	vpxor		%xmm2,%xmm1,%xmm1
240	vprord		$0x7,%xmm1,%xmm1
	/* Inverse lane rotations: put rows a, d and c back in column order. */
241	vpshufd		$0x39,%xmm0,%xmm0
242	vpshufd		$0x4e,%xmm3,%xmm3
243	vpshufd		$0x93,%xmm2,%xmm2
244	decb		%cl
245	jne		.Lblake2s_compress_avx512_roundloop
	/* New a = a ^ saved a ^ c, new b = b ^ saved b ^ d. */
246	vpxor		%xmm10,%xmm0,%xmm0
247	vpxor		%xmm11,%xmm1,%xmm1
248	vpxor		%xmm2,%xmm0,%xmm0
249	vpxor		%xmm3,%xmm1,%xmm1
	/* Count the block off; %rsi was already advanced after the load. */
250	decq		%rdx
251	jne		.Lblake2s_compress_avx512_mainloop
	/* Write the two result rows and the updated counter row back to the state. */
252	vmovdqu		%xmm0,(%rdi)
253	vmovdqu		%xmm1,0x10(%rdi)
254	vmovdqu		%xmm4,0x20(%rdi)
	/* Upper ymm halves were written above; clear them before returning. */
255	vzeroupper
.Lblake2s_compress_avx512_end:
256	retq
257SYM_FUNC_END(blake2s_compress_avx512)
258#endif /* CONFIG_AS_AVX512 */
259