/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

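	// The aliases below reuse v8-v19. This is safe: the 4-way PMULL-p64
	// code that uses them is only assembled for the p64 variant, so the
	// k*/t*/perm* temporaries above are free to be repurposed.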
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

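	// On cores that implement the 64x64->128 PMULL instruction, a single
	// carryless multiply does the job. These wrappers let the shared
	// GHASH code below be instantiated for either the p64 or the p8
	// fallback implementation.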
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

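	// Fallback multiply for cores that only implement the 8-bit wide
	// PMULL: rotate the data operand by 1..3 bytes (A1..A3), multiply
	// the copies against the pre-rotated key operands (B1..B4, see
	// __pmull_pre_p8), and let __pmull_p8_tail reassemble the partial
	// products into the full 64x64 result.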
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

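	// Helper for the p8 fallback: the eight pmull instructions below
	// compute the partial products, the uzp/zip pairs regroup their
	// halves, the k00_16/k32_48 masks clear the bytes that must not
	// carry into the result, and the ext instructions shift each
	// aggregate into place (<< 8 .. << 32, see the comments below)
	// before everything is xor-ed into \rq.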
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

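	// Load the higher powers of H (HH = H^2 .. HH4 = H^4, stored at
	// offset 16 into the key structure) and precompute the xor-folded
	// halves used for the Karatsuba middle products, as well as the
	// reduction constant MASK = 0xe1 << 57.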
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

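	// Precompute everything the p8 fallback needs: the folded SHASH2,
	// the k00_16/k32_48 masks, the tbl permutation vectors and the
	// byte-rotated copies of SHASH/SHASH2, so that only the data
	// operand needs to be rotated per block.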
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
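	// Together with the two eor instructions issued after each use of
	// this macro, this folds the 256-bit product accumulated in
	// XH:XM:XL back into a single 128-bit value in XL, modulo the GHASH
	// polynomial represented by MASK = 0xe1 << 57.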
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
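	// (the same reduction, applying the polynomial
	// x^128 + x^7 + x^2 + x + 1 with shifts and eors instead of a
	// multiply by MASK)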
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

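	//
	// GHASH update routine: w0 = #blocks, x1 = dg[] (the running digest
	// Xi), x2 = src, x3 = key, x4 = optional head block (see the
	// pmull_ghash_update() prototype below).
	//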
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

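	// 4-way aggregation: fold the digest into the first block and
	// compute (Xi + b0)*H^4 + b1*H^3 + b2*H^2 + b3*H in GF(2^128)
	// (where + is xor), so that a single reduction covers four blocks
	// of input.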
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

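	// handle the remaining blocks (and the optional head block) one at
	// a time: 2: loads the next block, 3: performs a single Karatsuba
	// multiply of XL by SHASH (a1*b1, a0*b0 and (a1+a0)*(b1+b0)), and
	// 4: reduces the 256-bit result back to 128 bits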
2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

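	// Variant for cores that lack the 64x64->128 PMULL instruction and
	// have to rely on the 8-bit polynomial multiply fallback.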
ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

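	// Load the expanded AES key schedule into v17-v31: 14-round
	// (256-bit) keys start at v17, 12-round (192-bit) keys at v19 and
	// 10-round (128-bit) keys at v21, matching what enc_block and the
	// GCM code below expect.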
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

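	// enc_round performs one AES round (AESE + AESMC); enc_block
	// encrypts \state with the key schedule loaded by load_round_keys,
	// applying the final round as AESE + eor with the last round key
	// (i.e. without MixColumns).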
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

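	// GCM en/decryption, two blocks of input per iteration. The AES
	// rounds that produce the key stream are interleaved with the GHASH
	// multiplication of the ciphertext. Register usage follows the
	// prototypes below: w0 = #blocks, x1 = dg, x2 = dst, x3 = src,
	// x4 = key, x5 = ctr, x6 = rk (the round keys are only reloaded
	// when x6 is non-NULL), w7 = rounds; for the encrypt path, the ks[]
	// pointer is passed on the stack.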
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

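	// the loop below issues the rounds for AES-128; for AES-192/256 the
	// additional initial rounds are handled out of line at 2:/3:, which
	// branch back to 1: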
1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
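	// Encrypts a single block with the key schedule in v17-v31; the
	// round keys are only (re)loaded when a non-NULL rk pointer is
	// passed in x2.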
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)