/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

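	//
	// The _p64 helpers map directly onto the 64x64->128 bit polynomial
	// multiply instructions (PMULL/PMULL2) provided by the ARMv8
	// Crypto Extensions.
	//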
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

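	//
	// The _p8 helpers synthesize the same 64x64->128 bit carryless
	// multiply out of 8x8->16 bit PMULL operations, for cores that only
	// implement the baseline NEON polynomial multiply: the input is
	// combined with byte-rotated copies of itself and of the
	// (precomputed) rotated key, and the partial products are shifted
	// into place with EXT and folded together with EOR.
	//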
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

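	//
	// GHASH works in GF(2^128) modulo the irreducible polynomial
	// x^128 + x^7 + x^2 + x + 1. The p64 code keeps a constant derived
	// from that polynomial in MASK (0xe1 shifted up by 57 bits in each
	// 64-bit lane) for use by the reduction below.
	//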
	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
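	// The double-width product (low half in XL, high half in XH, with
	// the Karatsuba middle term in XM) is folded back into a single
	// 128-bit value by two carryless multiplications with MASK.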
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
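	// The same folding is carried out with 64-bit shifts and XORs,
	// which is slower but only requires baseline NEON arithmetic.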
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

0:	ld1		{SHASH.2d}, [x22]
	ld1		{XL.2d}, [x20]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x23, 1f
	ld1		{T1.2d}, [x23]
	mov		x23, xzr
	b		2f

1:	ld1		{T1.2d}, [x21], #16
	sub		w19, w19, #1

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

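	/* Karatsuba: three 64x64 multiplies yield the 128x128 bit product */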
	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w19, 3f

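	/* on a pending reschedule: save the digest, yield the NEON unit,
	   then restart at 0b to reload the key and digest */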
	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	frame_pop
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
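	/*
	 * Per AAPCS64 the arguments arrive in x0 (blocks), x1 (dg),
	 * x2 (src), x3 (k) and x4 (head); dg[] holds the running GHASH
	 * state and is updated in place.
	 */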
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v8
	KS1		.req	v9
	INP0		.req	v10
	INP1		.req	v11

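	//
	// Load the AES round keys into v17-v31: v21-v31 hold the eleven
	// round keys common to all key sizes, v19-v20 the two extra keys
	// for AES-192 and v17-v18 the further two for AES-256.
	//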
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

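	//
	// Handle two blocks of input per iteration: the AES rounds that
	// produce the two counter-mode keystream blocks (KS0/KS1) are
	// interleaved with the GHASH multiply and reduction of the two
	// data blocks, so that AES and PMULL instructions can execute in
	// parallel.
	//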
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	load_round_keys	w7, x6

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP0.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v22

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v22

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v23

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v23

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS0, v24

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	KS1, v24

	eor		XL.16b, XM.16b, T2.16b

	enc_round	KS0, v25

	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS1, v25

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	enc_round	KS0, v26

	eor		XL.16b, XL.16b, T2.16b
	rev64		T1.16b, INP1.16b

	enc_round	KS1, v26

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS0, v27

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS1, v27

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS0, v28

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	KS1, v28

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS0, v29

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b

	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b

	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[], u32 const rk[],
	 *			  int rounds, u8 ks[])
	 */
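	/*
	 * Per AAPCS64: w0 = blocks, x1 = dg, x2 = dst, x3 = src, x4 = k,
	 * x5 = ctr, x6 = rk, w7 = rounds; ks is passed on the stack, and
	 * the two keystream blocks are loaded from and stored back to it.
	 */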
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[], u32 const rk[],
	 *			  int rounds)
	 */
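	/*
	 * Same argument layout as pmull_gcm_encrypt, minus the trailing
	 * ks[]: the keystream is consumed directly and never written back.
	 */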
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u32 const rk[],
	 *				int rounds)
	 */
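	/*
	 * x0 = dst, x1 = src, x2 = rk, w3 = rounds. As the cbz below
	 * suggests, a NULL rk skips reloading the round keys so that a
	 * caller can reuse the keys already loaded into v17-v31.
	 */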
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)