1/*
2 * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14	.text
15	.arch	armv8-a+crypto
16
17	/*
18	 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
19	 *			     u32 *macp, u8 const rk[], u32 rounds);
20	 */
ENTRY(ce_aes_ccm_auth_data)
	frame_push	7			/* x19-x25 must survive the NEON yield below */

	/*
	 * Fold 'abytes' bytes of associated data into the CBC-MAC in mac[].
	 * *macp tracks how many bytes of the current 16-byte block have
	 * already been absorbed, so the routine can be called repeatedly
	 * with arbitrary-length fragments.
	 *
	 * Register roles after the moves below:
	 *   x19 = mac buffer, x20 = input, w21 = input bytes remaining,
	 *   x22 = &macp, x23 = round keys, w24 = # of rounds,
	 *   w25 = cached macp value (bytes of current block absorbed)
	 */
	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5

	ldr	w25, [x22]			/* leftover from prev round? */
	ld1	{v0.16b}, [x0]			/* load mac */
	cbz	w25, 1f				/* no partial block pending */
	sub	w25, w25, #16			/* w25 = -(bytes still needed to fill block) */
	eor	v1.16b, v1.16b, v1.16b		/* clear staging register */
0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
	subs	w21, w21, #1
	add	w25, w25, #1
	ins	v1.b[0], w7
	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
	beq	8f				/* out of input? */
	cbnz	w25, 0b				/* block not complete yet */
	eor	v0.16b, v0.16b, v1.16b		/* fold completed block into mac */
1:	ld1	{v3.4s}, [x23]			/* load first round key */
	prfm	pldl1strm, [x20]
	cmp	w24, #12			/* which key size? */
	add	x6, x23, #16
	sub	w7, w24, #2			/* modified # of rounds */
	bmi	2f				/* 10 rounds: AES-128 */
	bne	5f				/* 14 rounds: AES-256, enter loop at 5 */
	mov	v5.16b, v3.16b			/* 12 rounds: AES-192, enter loop at 4 */
	b	4f
2:	mov	v4.16b, v3.16b
	ld1	{v5.4s}, [x6], #16		/* load 2nd round key */
	/*
	 * Unrolled 3-rounds-per-iteration AES loop; the entry point chosen
	 * above makes the total round count come out to 'rounds'.
	 */
3:	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
4:	ld1	{v3.4s}, [x6], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
5:	ld1	{v4.4s}, [x6], #16		/* load next round key */
	subs	w7, w7, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	ld1	{v5.4s}, [x6], #16		/* load next round key */
	bpl	3b
	aese	v0.16b, v4.16b			/* last round: AESE only, no MixColumns */
	subs	w21, w21, #16			/* last data? */
	eor	v0.16b, v0.16b, v5.16b		/* final round */
	bmi	6f				/* < 16 bytes left: handle tail below */
	ld1	{v1.16b}, [x20], #16		/* load next input block */
	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
	beq	6f				/* that was the last full block */

	/* voluntary preemption point: spill mac, yield NEON, reload */
	if_will_cond_yield_neon
	st1	{v0.16b}, [x19]			/* store mac */
	do_cond_yield_neon
	ld1	{v0.16b}, [x19]			/* reload mac */
	endif_yield_neon

	b	1b
6:	st1	{v0.16b}, [x19]			/* store mac */
	beq	10f				/* Z from subs: input ended on block boundary */
	adds	w21, w21, #16			/* w21 = # of trailing bytes (< 16) */
	beq	10f
	mov	w25, w21			/* macp := partial byte count */
	/* absorb the tail bytewise: xor each input byte into the stored mac */
7:	ldrb	w7, [x20], #1
	umov	w6, v0.b[0]
	eor	w6, w6, w7
	strb	w6, [x19], #1
	subs	w21, w21, #1
	beq	10f
	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
	b	7b
	/*
	 * Input ran out while staging a partial block: w25 is the (negative)
	 * count of bytes still missing; finish rotating v1 so the staged
	 * bytes line up with their block offsets, fold into the mac, and
	 * record the new partial count.
	 */
8:	mov	w7, w25
	add	w25, w25, #16
9:	ext	v1.16b, v1.16b, v1.16b, #1
	adds	w7, w7, #1
	bne	9b
	eor	v0.16b, v0.16b, v1.16b
	st1	{v0.16b}, [x19]
10:	str	w25, [x22]			/* save updated macp */

	frame_pop
	ret
ENDPROC(ce_aes_ccm_auth_data)
106
107	/*
108	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
109	 * 			 u32 rounds);
110	 */
ENTRY(ce_aes_ccm_final)
	/*
	 * Produce the final CCM tag: encrypt the CBC-MAC (v0) and the
	 * counter block S0 (v1) through all rounds in parallel, then xor
	 * them; the result overwrites mac[].
	 */
	ld1	{v3.4s}, [x2], #16		/* load first round key */
	ld1	{v0.16b}, [x0]			/* load mac */
	cmp	w3, #12				/* which key size? */
	sub	w3, w3, #2			/* modified # of rounds */
	ld1	{v1.16b}, [x1]			/* load 1st ctriv */
	bmi	0f				/* 10 rounds: AES-128 */
	bne	3f				/* 14 rounds: AES-256, enter loop at 3 */
	mov	v5.16b, v3.16b			/* 12 rounds: AES-192, enter loop at 2 */
	b	2f
0:	mov	v4.16b, v3.16b
	/* 3-rounds-per-iteration loop, mac and ctr interleaved */
1:	ld1	{v5.4s}, [x2], #16		/* load next round key */
	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v4.16b
	aesmc	v1.16b, v1.16b
2:	ld1	{v3.4s}, [x2], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v5.16b
	aesmc	v1.16b, v1.16b
3:	ld1	{v4.4s}, [x2], #16		/* load next round key */
	subs	w3, w3, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v3.16b
	aesmc	v1.16b, v1.16b
	bpl	1b
	aese	v0.16b, v4.16b			/* last round: AESE only, no MixColumns */
	aese	v1.16b, v4.16b
	/* final round key cancels out: both sides would be xored with it */
	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
	st1	{v0.16b}, [x0]			/* store result */
	ret
ENDPROC(ce_aes_ccm_final)
146
	/*
	 * Combined CTR en/decryption and CBC-MAC update, one AES block per
	 * outer-loop pass, mac (v0) and keystream (v1) interleaved through
	 * the round loop.  \enc selects whether the MAC is computed over
	 * the input (encrypt) or the output (decrypt) of the CTR transform.
	 *
	 * Register roles after the moves below:
	 *   x19 = out, x20 = in, w21 = bytes remaining, x22 = round keys,
	 *   w23 = # of rounds, x24 = mac buffer, x25 = ctr block,
	 *   x26 = lower 64 bits of the counter, kept CPU-endian in the reg
	 */
	.macro	aes_ccm_do_crypt,enc
	frame_push	8

	mov	x19, x0
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	mov	x23, x4
	mov	x24, x5
	mov	x25, x6

	ldr	x26, [x25, #8]			/* load lower ctr */
	ld1	{v0.16b}, [x24]			/* load mac */
CPU_LE(	rev	x26, x26		)	/* keep swabbed ctr in reg */
0:	/* outer loop */
	ld1	{v1.8b}, [x25]			/* load upper ctr */
	prfm	pldl1strm, [x20]
	add	x26, x26, #1			/* increment counter for this block */
	rev	x9, x26				/* back to big-endian for the block */
	cmp	w23, #12			/* which key size? */
	sub	w7, w23, #2			/* get modified # of rounds */
	ins	v1.d[1], x9			/* no carry in lower ctr */
	ld1	{v3.4s}, [x22]			/* load first round key */
	add	x10, x22, #16
	bmi	1f				/* 10 rounds: AES-128 */
	bne	4f				/* 14 rounds: AES-256, enter loop at 4 */
	mov	v5.16b, v3.16b			/* 12 rounds: AES-192, enter loop at 3 */
	b	3f
1:	mov	v4.16b, v3.16b
	ld1	{v5.4s}, [x10], #16		/* load 2nd round key */
2:	/* inner loop: 3 rounds, 2x interleaved */
	aese	v0.16b, v4.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v4.16b
	aesmc	v1.16b, v1.16b
3:	ld1	{v3.4s}, [x10], #16		/* load next round key */
	aese	v0.16b, v5.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v5.16b
	aesmc	v1.16b, v1.16b
4:	ld1	{v4.4s}, [x10], #16		/* load next round key */
	subs	w7, w7, #3
	aese	v0.16b, v3.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v3.16b
	aesmc	v1.16b, v1.16b
	ld1	{v5.4s}, [x10], #16		/* load next round key */
	bpl	2b
	/* last round: AESE only; the final xor with v5 is merged below */
	aese	v0.16b, v4.16b
	aese	v1.16b, v4.16b
	subs	w21, w21, #16
	bmi	7f				/* partial block? */
	ld1	{v2.16b}, [x20], #16		/* load next input block */
	.if	\enc == 1
	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
	.else
	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
	.endif
	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
	st1	{v1.16b}, [x19], #16		/* write output block */
	beq	5f				/* that was the last full block */

	/* voluntary preemption point: spill mac, yield NEON, reload */
	if_will_cond_yield_neon
	st1	{v0.16b}, [x24]			/* store mac */
	do_cond_yield_neon
	ld1	{v0.16b}, [x24]			/* reload mac */
	endif_yield_neon

	b	0b
5:
CPU_LE(	rev	x26, x26			)
	st1	{v0.16b}, [x24]			/* store mac */
	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */

6:	frame_pop
	ret

	/* tail: complete the final AES round explicitly, then go bytewise */
7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
	st1	{v0.16b}, [x24]			/* store mac */
	add	w21, w21, #16			/* process partial tail block */
8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
	umov	w6, v1.b[0]			/* get top crypted ctr byte */
	umov	w7, v0.b[0]			/* get top mac byte */
	.if	\enc == 1
	eor	w7, w7, w9			/* mac over plaintext */
	eor	w9, w9, w6			/* then encrypt */
	.else
	eor	w9, w9, w6			/* decrypt first */
	eor	w7, w7, w9			/* mac over plaintext */
	.endif
	strb	w9, [x19], #1			/* store out byte */
	strb	w7, [x24], #1			/* store mac byte */
	subs	w21, w21, #1
	beq	6b
	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
	b	8b
	.endm
248
249	/*
250	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
251	 * 			   u8 const rk[], u32 rounds, u8 mac[],
252	 * 			   u8 ctr[]);
253	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
254	 * 			   u8 const rk[], u32 rounds, u8 mac[],
255	 * 			   u8 ctr[]);
256	 */
ENTRY(ce_aes_ccm_encrypt)
	aes_ccm_do_crypt	1		/* enc=1: MAC the plaintext, then encrypt */
ENDPROC(ce_aes_ccm_encrypt)
260
ENTRY(ce_aes_ccm_decrypt)
	aes_ccm_do_crypt	0		/* enc=0: decrypt, then MAC the plaintext */
ENDPROC(ce_aes_ccm_decrypt)
264