/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

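	//
	// Fallback 64x64->128 bit polynomial multiply for CPUs that only
	// implement the 8x8->16 bit form of PMULL. In essence, the operands
	// are multiplied a byte at a time: t3/t5/t7 hold byte-rotated copies
	// of the first operand (A1..A3), while sh1..sh4/ss1..ss4 hold
	// precomputed byte rotations of SHASH/SHASH2 (B1..B4, set up in
	// __pmull_pre_p8), so that __pmull_p8_tail can assemble the full
	// product from 8-bit partial products that are shifted into place
	// and XORed together.
	//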
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

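	//
	// Prepare MASK with the constant used by __pmull_reduce_p64 to
	// reduce the double-width product modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1.
	//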
	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

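	//
	// Main GHASH loop: each 16-byte block is folded into the digest as
	//
	//	XL := (XL ^ block) * SHASH mod (x^128 + x^7 + x^2 + x + 1)
	//
	// using three \pn multiplies (Karatsuba) followed by the matching
	// __pmull_reduce_\pn. An optional head block passed in x4 is
	// processed before the main input.
	//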
	.macro		__pmull_ghash, pn
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

0:	ld1		{SHASH.2d}, [x22]
	ld1		{XL.2d}, [x20]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x23, 1f
	ld1		{T1.2d}, [x23]
	mov		x23, xzr
	b		2f

1:	ld1		{T1.2d}, [x21], #16
	sub		w19, w19, #1

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	frame_pop
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

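	// Register aliases for the GCM code below. These, and the AES round
	// keys in v17-v31, overlap registers used only by the 8-bit fallback
	// above (v8-v10 and the perm/sh/ss tables in v17-v27), which is fine
	// since the GCM code uses the 64-bit PMULL path exclusively.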
	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

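	// Load the AES round keys into v17-v31: AES-128 only needs v21-v31,
	// AES-192 also loads v19-v20, and AES-256 loads v17-v18 as well.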
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

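	//
	// Combined AES-CTR/GHASH for GCM. Per 16-byte block, roughly:
	//
	//	KS  = AES(CTR); CTR++
	//	out = in ^ KS
	//	XL  = (XL ^ ciphertext) * SHASH mod P
	//
	// with the AES rounds interleaved into the PMULL sequence to hide
	// latency. On the encrypt path the key stream runs one block ahead:
	// the first block is read from the ks[] argument and the next,
	// not yet consumed one is written back to it on return.
	//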
	.macro		pmull_gcm_do_crypt, enc
	frame_push	10

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x5
	mov		x25, x6
	mov		x26, x7
	.if		\enc == 1
	ldr		x27, [sp, #96]			// first stacked arg
	.endif

	ldr		x28, [x24, #8]			// load lower counter
CPU_LE(	rev		x28, x28	)

0:	mov		x0, x25
	load_round_keys	w26, x0
	ld1		{SHASH.2d}, [x23]
	ld1		{XL.2d}, [x20]

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ld1		{KS.16b}, [x27]
	.endif

1:	ld1		{CTR.8b}, [x24]			// load upper counter
	ld1		{INP.16b}, [x22], #16
	rev		x9, x28
	add		x28, x28, #1
	sub		w19, w19, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x21], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w26, #12
	b.ge		4f				// AES-192/256?

2:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x21], #16
	.endif

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	.if		\enc == 1
	st1		{KS.16b}, [x27]
	.endif
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	.if		\enc == 1
	st1		{KS.16b}, [x27]
	.endif

CPU_LE(	rev		x28, x28	)
	str		x28, [x24, #8]			// store lower counter

	frame_pop
	ret

4:	b.eq		5f				// AES-192?
	enc_round	CTR, v17
	enc_round	CTR, v18
5:	enc_round	CTR, v19
	enc_round	CTR, v20
	b		2b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)