/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

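	// The aliases below reuse v8-v19 and therefore overlap with the
	// p8-only temporaries above; this is safe because the two sets are
	// live only on mutually exclusive code paths (p8 fallback vs. p64).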
	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

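	// Fallback 64x64->128 bit carryless multiply for CPUs that only
	// implement the 8x8 bit form of PMULL: the product is assembled
	// from 8x8 bit multiplies of byte-rotated copies of the operands
	// (A1..A3 below, B1..B4 precomputed in __pmull_pre_p8), which are
	// realigned and combined in __pmull_p8_tail.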
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

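	// Set up the p64 code path: load the higher powers of H (H^2..H^4,
	// which the C glue code stores at offset 16 of the key structure)
	// and fold the halves of each power into SHASH2/HH34 for the
	// Karatsuba middle products. MASK holds the bit-reflected GHASH
	// reduction polynomial (0xe1 << 57).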
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

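	// Set up the p8 fallback path: build the byte masks k00_16/k32_48
	// and the permutation vectors, then precompute the byte-rotated
	// copies of SHASH and SHASH2 that __pmull_p8_tail multiplies against.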
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
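	// MASK must hold the bit-reflected GHASH reduction polynomial,
	// i.e. 0xe1 shifted left by 57 bits in each 64-bit lane.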
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
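	// The reduction modulo x^128 + x^7 + x^2 + x + 1 is carried out
	// with plain shifts and XORs instead.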
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

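	// GHASH update, parameterised on the multiply variant (p64 or p8).
	// Register usage follows the pmull_ghash_update() prototype below:
	//   w0: block count, x1: digest (XL), x2: source,
	//   x3: ghash_key, x4: optional partial head block
	// On the p64 path, blocks are consumed one at a time until the
	// remaining count is a multiple of four, then four at a time using
	// the precomputed powers H^1..H^4.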
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

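	// 4-way aggregation: with input blocks C0..C3 this computes
	//   X = (X + C0).H^4 + C1.H^3 + C2.H^2 + C3.H
	// so that a single reduction covers all four blocks.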
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

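	// Process one block per iteration: used for all blocks on the p8
	// path, and to bring the remaining block count down to a multiple
	// of four on the p64 path.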
2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

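	// Load the AES round keys into v17-v31. The shorter key sizes skip
	// the leading registers: AES-128 uses v21-v31, AES-192 v19-v31 and
	// AES-256 v17-v31.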
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

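	// One AES round: AESE (AddRoundKey, SubBytes, ShiftRows) followed
	// by AESMC (MixColumns).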
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

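	// Encrypt one block using the round keys loaded by load_round_keys.
	// The final round uses AESE without AESMC, followed by an EOR with
	// the last round key.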
	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

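	// Combined AES-CTR en/decryption and GHASH, two blocks per
	// iteration, with the AES rounds interleaved with the GHASH
	// multiplications. Register usage follows the prototypes below:
	//   w0: block count, x1: GHASH digest, x2: dst, x3: src,
	//   x4: ghash_key (H, followed by H^2), x5: counter block,
	//   x6: AES round keys (NULL if already loaded into v17-v31),
	//   w7: number of AES rounds, [sp]: keystream buffer (encrypt only)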
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

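	// Main loop: consume two blocks of input, generate two blocks of
	// keystream from the incrementing counter, and fold two blocks of
	// ciphertext into the GHASH state.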
0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  u32 const rk[], int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 *
	 * The round keys are reloaded only when a non-NULL rk pointer is
	 * passed; otherwise the keys left behind by a previous call are used.
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)
