/* xref: /openbmc/linux/arch/arm/crypto/ghash-ce-core.S (revision 5b4cb650) */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14	SHASH		.req	q0
15	T1		.req	q1
16	XL		.req	q2
17	XM		.req	q3
18	XH		.req	q4
19	IN1		.req	q4
20
21	SHASH_L		.req	d0
22	SHASH_H		.req	d1
23	T1_L		.req	d2
24	T1_H		.req	d3
25	XL_L		.req	d4
26	XL_H		.req	d5
27	XM_L		.req	d6
28	XM_H		.req	d7
29	XH_L		.req	d8
30
31	t0l		.req	d10
32	t0h		.req	d11
33	t1l		.req	d12
34	t1h		.req	d13
35	t2l		.req	d14
36	t2h		.req	d15
37	t3l		.req	d16
38	t3h		.req	d17
39	t4l		.req	d18
40	t4h		.req	d19
41
42	t0q		.req	q5
43	t1q		.req	q6
44	t2q		.req	q7
45	t3q		.req	q8
46	t4q		.req	q9
47	T2		.req	q9
48
49	s1l		.req	d20
50	s1h		.req	d21
51	s2l		.req	d22
52	s2h		.req	d23
53	s3l		.req	d24
54	s3h		.req	d25
55	s4l		.req	d26
56	s4h		.req	d27
57
58	MASK		.req	d28
59	SHASH2_p8	.req	d28
60
61	k16		.req	d29
62	k32		.req	d30
63	k48		.req	d31
64	SHASH2_p64	.req	d31
65
66	HH		.req	q10
67	HH3		.req	q11
68	HH4		.req	q12
69	HH34		.req	q13
70
71	HH_L		.req	d20
72	HH_H		.req	d21
73	HH3_L		.req	d22
74	HH3_H		.req	d23
75	HH4_L		.req	d24
76	HH4_H		.req	d25
77	HH34_L		.req	d26
78	HH34_H		.req	d27
79	SHASH2_H	.req	d29
80
81	XL2		.req	q5
82	XM2		.req	q6
83	XH2		.req	q7
84	T3		.req	q8
85
86	XL2_L		.req	d10
87	XL2_H		.req	d11
88	XM2_L		.req	d12
89	XM2_H		.req	d13
90	T3_L		.req	d16
91	T3_H		.req	d17
92
93	.text
94	.fpu		crypto-neon-fp-armv8
95
96	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
97	vmull.p64	\rd, \rn, \rm
98	.endm
99
100	/*
101	 * This implementation of 64x64 -> 128 bit polynomial multiplication
102	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
103	 * "Fast Software Polynomial Multiplication on ARM Processors Using
104	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
105	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
106	 *
107	 * It has been slightly tweaked for in-order performance, and to allow
108	 * 'rq' to overlap with 'ad' or 'bd'.
109	 */
110	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
111	vext.8		t0l, \ad, \ad, #1	@ A1
112	.ifc		\b1, t4l
113	vext.8		t4l, \bd, \bd, #1	@ B1
114	.endif
115	vmull.p8	t0q, t0l, \bd		@ F = A1*B
116	vext.8		t1l, \ad, \ad, #2	@ A2
117	vmull.p8	t4q, \ad, \b1		@ E = A*B1
118	.ifc		\b2, t3l
119	vext.8		t3l, \bd, \bd, #2	@ B2
120	.endif
121	vmull.p8	t1q, t1l, \bd		@ H = A2*B
122	vext.8		t2l, \ad, \ad, #3	@ A3
123	vmull.p8	t3q, \ad, \b2		@ G = A*B2
124	veor		t0q, t0q, t4q		@ L = E + F
125	.ifc		\b3, t4l
126	vext.8		t4l, \bd, \bd, #3	@ B3
127	.endif
128	vmull.p8	t2q, t2l, \bd		@ J = A3*B
129	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
130	veor		t1q, t1q, t3q		@ M = G + H
131	.ifc		\b4, t3l
132	vext.8		t3l, \bd, \bd, #4	@ B4
133	.endif
134	vmull.p8	t4q, \ad, \b3		@ I = A*B3
135	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
136	vmull.p8	t3q, \ad, \b4		@ K = A*B4
137	vand		t0h, t0h, k48
138	vand		t1h, t1h, k32
139	veor		t2q, t2q, t4q		@ N = I + J
140	veor		t0l, t0l, t0h
141	veor		t1l, t1l, t1h
142	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
143	vand		t2h, t2h, k16
144	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
145	vmov.i64	t3h, #0
146	vext.8		t0q, t0q, t0q, #15
147	veor		t2l, t2l, t2h
148	vext.8		t1q, t1q, t1q, #14
149	vmull.p8	\rq, \ad, \bd		@ D = A*B
150	vext.8		t2q, t2q, t2q, #13
151	vext.8		t3q, t3q, t3q, #12
152	veor		t0q, t0q, t1q
153	veor		t2q, t2q, t3q
154	veor		\rq, \rq, t0q
155	veor		\rq, \rq, t2q
156	.endm
157
158	//
159	// PMULL (64x64->128) based reduction for CPUs that can do
160	// it in a single instruction.
161	//
162	.macro		__pmull_reduce_p64
163	vmull.p64	T1, XL_L, MASK
164
165	veor		XH_L, XH_L, XM_H
166	vext.8		T1, T1, T1, #8
167	veor		XL_H, XL_H, XM_L
168	veor		T1, T1, XL
169
170	vmull.p64	XL, T1_H, MASK
171	.endm
172
173	//
174	// Alternative reduction for CPUs that lack support for the
175	// 64x64->128 PMULL instruction
176	//
177	.macro		__pmull_reduce_p8
178	veor		XL_H, XL_H, XM_L
179	veor		XH_L, XH_L, XM_H
180
181	vshl.i64	T1, XL, #57
182	vshl.i64	T2, XL, #62
183	veor		T1, T1, T2
184	vshl.i64	T2, XL, #63
185	veor		T1, T1, T2
186	veor		XL_H, XL_H, T1_L
187	veor		XH_L, XH_L, T1_H
188
189	vshr.u64	T1, XL, #1
190	veor		XH, XH, XL
191	veor		XL, XL, T1
192	vshr.u64	T1, T1, #6
193	vshr.u64	XL, XL, #1
194	.endm
195
196	.macro		ghash_update, pn
197	vld1.64		{XL}, [r1]
198
199	/* do the head block first, if supplied */
200	ldr		ip, [sp]
201	teq		ip, #0
202	beq		0f
203	vld1.64		{T1}, [ip]
204	teq		r0, #0
205	b		3f
206
2070:	.ifc		\pn, p64
208	tst		r0, #3			// skip until #blocks is a
209	bne		2f			// round multiple of 4
210
211	vld1.8		{XL2-XM2}, [r2]!
2121:	vld1.8		{T3-T2}, [r2]!
213	vrev64.8	XL2, XL2
214	vrev64.8	XM2, XM2
215
216	subs		r0, r0, #4
217
218	vext.8		T1, XL2, XL2, #8
219	veor		XL2_H, XL2_H, XL_L
220	veor		XL, XL, T1
221
222	vrev64.8	T3, T3
223	vrev64.8	T1, T2
224
225	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
226	veor		XL2_H, XL2_H, XL_H
227	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
228	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
229
230	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
231	veor		XM2_L, XM2_L, XM2_H
232	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
233	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
234
235	veor		XH, XH, XH2
236	veor		XL, XL, XL2
237	veor		XM, XM, XM2
238
239	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
240	veor		T3_L, T3_L, T3_H
241	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
242	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
243
244	veor		XH, XH, XH2
245	veor		XL, XL, XL2
246	veor		XM, XM, XM2
247
248	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
249	veor		T1_L, T1_L, T1_H
250	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
251	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
252
253	veor		XH, XH, XH2
254	veor		XL, XL, XL2
255	veor		XM, XM, XM2
256
257	beq		4f
258
259	vld1.8		{XL2-XM2}, [r2]!
260
261	veor		T1, XL, XH
262	veor		XM, XM, T1
263
264	__pmull_reduce_p64
265
266	veor		T1, T1, XH
267	veor		XL, XL, T1
268
269	b		1b
270	.endif
271
2722:	vld1.64		{T1}, [r2]!
273	subs		r0, r0, #1
274
2753:	/* multiply XL by SHASH in GF(2^128) */
276#ifndef CONFIG_CPU_BIG_ENDIAN
277	vrev64.8	T1, T1
278#endif
279	vext.8		IN1, T1, T1, #8
280	veor		T1_L, T1_L, XL_H
281	veor		XL, XL, IN1
282
283	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
284	veor		T1, T1, XL
285	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
286	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
287
2884:	veor		T1, XL, XH
289	veor		XM, XM, T1
290
291	__pmull_reduce_\pn
292
293	veor		T1, T1, XH
294	veor		XL, XL, T1
295
296	bne		0b
297
298	vst1.64		{XL}, [r1]
299	bx		lr
300	.endm
301
302	/*
303	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
304	 *			   struct ghash_key const *k, const char *head)
305	 */
306ENTRY(pmull_ghash_update_p64)
307	vld1.64		{SHASH}, [r3]!
308	vld1.64		{HH}, [r3]!
309	vld1.64		{HH3-HH4}, [r3]
310
311	veor		SHASH2_p64, SHASH_L, SHASH_H
312	veor		SHASH2_H, HH_L, HH_H
313	veor		HH34_L, HH3_L, HH3_H
314	veor		HH34_H, HH4_L, HH4_H
315
316	vmov.i8		MASK, #0xe1
317	vshl.u64	MASK, MASK, #57
318
319	ghash_update	p64
320ENDPROC(pmull_ghash_update_p64)
321
322ENTRY(pmull_ghash_update_p8)
323	vld1.64		{SHASH}, [r3]
324	veor		SHASH2_p8, SHASH_L, SHASH_H
325
326	vext.8		s1l, SHASH_L, SHASH_L, #1
327	vext.8		s2l, SHASH_L, SHASH_L, #2
328	vext.8		s3l, SHASH_L, SHASH_L, #3
329	vext.8		s4l, SHASH_L, SHASH_L, #4
330	vext.8		s1h, SHASH_H, SHASH_H, #1
331	vext.8		s2h, SHASH_H, SHASH_H, #2
332	vext.8		s3h, SHASH_H, SHASH_H, #3
333	vext.8		s4h, SHASH_H, SHASH_H, #4
334
335	vmov.i64	k16, #0xffff
336	vmov.i64	k32, #0xffffffff
337	vmov.i64	k48, #0xffffffffffff
338
339	ghash_update	p8
340ENDPROC(pmull_ghash_update_p8)
341