/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	.text
	.fpu		crypto-neon-fp-armv8

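	/*
	 * 64x64 -> 128 bit carryless multiply using the single vmull.p64
	 * instruction provided by the ARMv8 Crypto Extensions.  The b1..b4
	 * arguments are unused here; they only exist so that this macro has
	 * the same signature as __pmull_p8 below.
	 */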
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
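	/*
	 * Rough outline of the scheme: vmull.p8 only multiplies corresponding
	 * byte lanes (8x8 -> 16), so a single D = A*B yields just the aligned
	 * partial products.  The missing cross products are generated by
	 * repeating the multiplication with byte-rotated copies of the
	 * operands (A1..A3, B1..B4), folding pairs of results together with
	 * veor, masking off the lanes that wrapped around (k16/k32/k48), and
	 * shifting each term into position with vext before accumulating
	 * everything onto D.
	 */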
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
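	// The three 64x64 multiplies leave a 256-bit value spread over
	// XH:XM:XL.  The middle part is folded into the adjacent halves of
	// XL and XH, and the result is then reduced to 128 bits modulo the
	// GHASH polynomial x^128 + x^7 + x^2 + x + 1 using two carryless
	// multiplications by MASK, which the p64 entry point sets to
	// 0xe1 << 57 (0xc200000000000000).
	//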
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
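	// Same reduction as above, but for cores without vmull.p64: the
	// carryless multiplications by the reduction constant are replaced
	// with an equivalent sequence of 64-bit shifts and XORs (left shifts
	// by 57, 62 and 63, then right shifts amounting to 1, 2 and 7).
	//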
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

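	/*
	 * Main GHASH loop, shared by both entry points: r0 = number of
	 * blocks, r1 -> 128-bit digest, r2 -> source data, with an optional
	 * head block pointer passed on the stack (processed first when
	 * non-NULL).  Each 16-byte block B updates the digest as
	 *
	 *   X := (X ^ B) * H
	 *
	 * in GF(2^128), where the multiplication is done Karatsuba style
	 * with three 64x64 carryless multiplies,
	 *
	 *   a1*b1, a0*b0 and (a1^a0)*(b1^b0),
	 *
	 * performed by __pmull_\pn and followed by __pmull_reduce_\pn to
	 * bring the result back down to 128 bits.
	 */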
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]
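	/* precompute b1 + b0 for the Karatsuba (a1+a0)*(b1+b0) product */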
	veor		SHASH2_p64, SHASH_L, SHASH_H

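	/* MASK = 0xe1 << 57 = 0xc200000000000000, used by __pmull_reduce_p64 */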
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

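	/*
	 * Fallback version for cores that implement NEON but not the Crypto
	 * Extensions: same algorithm, with the 64x64 multiplies built out of
	 * vmull.p8 and the shift based reduction.
	 */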
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

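	/*
	 * The key is fixed, so the byte-rotated copies that __pmull_p8 needs
	 * for its B operand can be computed once here; passing them in as
	 * b1..b4 makes the macro skip the corresponding vext steps in the
	 * inner loop.
	 */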
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

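	/*
	 * Byte masks applied by __pmull_p8; they clear the lanes of the
	 * rotated partial products that would otherwise wrap around.
	 */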
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)