/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

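	//
	// The __pmull_p64/__pmull2_p64 helpers emit the single-instruction
	// 64x64->128 bit polynomial multiply (PMULL/PMULL2 on the .1d/.2d
	// lanes).  The __pmull*_p8 variants further down build the same
	// 128-bit product out of 8x8->16 bit PMULL operations, for cores
	// that only implement the byte-sized form.
	//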
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

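	//
	// Combine the partial products set up by __pmull_p8/__pmull2_p8:
	// D = A*B, the products of A rotated by 1-3 bytes with B (F, H, J),
	// and the products of A with B rotated by 1-4 bytes (E, G, I, K).
	// The uzp/zip/ext sequence masks off the bits that spill into the
	// neighbouring byte position and shifts each term into place, so the
	// result is roughly
	//
	//	\rq = D ^ (L << 8) ^ (M << 16) ^ (N << 24) ^ (K << 32)
	//
	// i.e. one 64x64->128 bit carryless multiply built from 8-bit ones.
	//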
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

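	//
	// Precompute the reduction constant for the p64 code path: each
	// 64-bit lane of MASK becomes 0xe1e1e1e1e1e1e1e1 << 57, i.e.
	// 0xc200000000000000, which __pmull_reduce_p64 multiplies by twice
	// to fold the 256-bit product back modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1.
	//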
	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

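	//
	// Precompute everything the p8 code path keeps live across the loop:
	// k00_16/k32_48 mask off the parts of each partial product that
	// spill into the neighbouring byte position, perm1-perm3 (and T1)
	// are tbl masks that rotate each 64-bit half of a register by 1-4
	// bytes, and sh1-sh4/ss1-ss4 hold SHASH and SHASH2 pre-rotated by
	// 1-4 bytes so the rotations are not redone for every block.
	//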
	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
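	// On entry, XL and XH hold the low and high halves of the 256-bit
	// product and XM holds the Karatsuba middle term (with T1 still to
	// be folded in).  Two PMULLs by MASK fold the upper half back down,
	// 64 bits at a time; the caller finishes by XORing XH into the
	// rotated result.
	//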
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
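	// Same folding as above, but carried out with shifts and XORs
	// instead of PMULL-by-MASK.  The left shift counts correspond to the
	// terms of the polynomial: 64 - 57/62/63 = 7/2/1, i.e. x^7, x^2
	// and x.
	//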
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

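	//
	// Process 'blocks' 16-byte blocks: fold each block into XL and
	// multiply by SHASH in GF(2^128), using Karatsuba to get away with
	// three 64x64 multiplies instead of four:
	//
	//	a * b = (a1 * b1) << 128
	//	      ^ ((a1 ^ a0) * (b1 ^ b0) ^ a1 * b1 ^ a0 * b0) << 64
	//	      ^ (a0 * b0)
	//
	// where a1/a0 and b1/b0 are the high/low 64-bit halves and addition
	// is XOR.  SHASH2 holds the XOR of the two halves of SHASH,
	// precomputed for the middle product.  \pn selects the p64 or p8
	// multiply flavour.
	//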
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	b		1f

0:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
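	/*
	 * Arguments arrive per the AAPCS64 calling convention:
	 *   w0: number of blocks,    x1: dg[] (running GHASH state, XL)
	 *   x2: source data,         x3: key (SHASH is loaded from it)
	 *   x4: optional head block, processed first when non-NULL
	 */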
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

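	//
	// The AES round keys are kept in v17-v31 for the duration of the GCM
	// operation, laid out so that all key sizes share the same tail:
	// AES-128 uses v21-v31, AES-192 additionally v19-v20, and AES-256
	// additionally v17-v18.
	//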
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

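	//
	// Combined CTR encryption/decryption and GHASH update: the AES
	// rounds that generate the keystream are interleaved with the
	// PMULL-based GHASH multiply so that both make progress in parallel.
	// The 64-bit lower counter is kept byte-reversed in x8 and inserted
	// into CTR.d[1] on every iteration.  For encryption the keystream
	// runs one block ahead of the data, which is why the pending
	// keystream block is passed in and out via ks (x7).
	//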
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ld1		{KS.16b}, [x7]
	.endif

0:	ld1		{CTR.8b}, [x5]			// load upper counter
	ld1		{INP.16b}, [x3], #16
	rev		x9, x8
	add		x8, x8, #1
	sub		w0, w0, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x2], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w6, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x2], #16
	.endif

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS.16b}, [x7]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	CTR, v17
	enc_round	CTR, v18
3:	enc_round	CTR, v19
	enc_round	CTR, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
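	/*
	 * w0: blocks, x1: dg[], x2: dst, x3: src, x4: key, x5: ctr,
	 * w6: rounds, x7: ks (pending keystream block, updated on return)
	 */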
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
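	/*
	 * Same register mapping as pmull_gcm_encrypt, minus the ks argument:
	 * the keystream for each block is generated and consumed in place.
	 */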
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
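	/*
	 * x0: dst, x1: src, x2: rk (NULL if the round keys in v17-v31 are
	 * already loaded), w3: rounds
	 */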
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)