/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * Listing taken from /openbmc/linux/arch/arm/crypto/aes-ce-core.S
 * (revision 113094f7).
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8	@ accept the aese/aesd/aesmc/aesimc mnemonics
	.align		3			@ 2^3 = 8 byte alignment

15	.macro		enc_round, state, key
16	aese.8		\state, \key
17	aesmc.8		\state, \state
18	.endm
19
20	.macro		dec_round, state, key
21	aesd.8		\state, \key
22	aesimc.8	\state, \state
23	.endm
24
25	.macro		enc_dround, key1, key2
26	enc_round	q0, \key1
27	enc_round	q0, \key2
28	.endm
29
30	.macro		dec_dround, key1, key2
31	dec_round	q0, \key1
32	dec_round	q0, \key2
33	.endm
34
35	.macro		enc_fround, key1, key2, key3
36	enc_round	q0, \key1
37	aese.8		q0, \key2
38	veor		q0, q0, \key3
39	.endm
40
41	.macro		dec_fround, key1, key2, key3
42	dec_round	q0, \key1
43	aesd.8		q0, \key2
44	veor		q0, q0, \key3
45	.endm
46
47	.macro		enc_dround_3x, key1, key2
48	enc_round	q0, \key1
49	enc_round	q1, \key1
50	enc_round	q2, \key1
51	enc_round	q0, \key2
52	enc_round	q1, \key2
53	enc_round	q2, \key2
54	.endm
55
56	.macro		dec_dround_3x, key1, key2
57	dec_round	q0, \key1
58	dec_round	q1, \key1
59	dec_round	q2, \key1
60	dec_round	q0, \key2
61	dec_round	q1, \key2
62	dec_round	q2, \key2
63	.endm
64
65	.macro		enc_fround_3x, key1, key2, key3
66	enc_round	q0, \key1
67	enc_round	q1, \key1
68	enc_round	q2, \key1
69	aese.8		q0, \key2
70	aese.8		q1, \key2
71	aese.8		q2, \key2
72	veor		q0, q0, \key3
73	veor		q1, q1, \key3
74	veor		q2, q2, \key3
75	.endm
76
77	.macro		dec_fround_3x, key1, key2, key3
78	dec_round	q0, \key1
79	dec_round	q1, \key1
80	dec_round	q2, \key1
81	aesd.8		q0, \key2
82	aesd.8		q1, \key2
83	aesd.8		q2, \key2
84	veor		q0, q0, \key3
85	veor		q1, q1, \key3
86	veor		q2, q2, \key3
87	.endm
88
89	.macro		do_block, dround, fround
90	cmp		r3, #12			@ which key size?
91	vld1.8		{q10-q11}, [ip]!
92	\dround		q8, q9
93	vld1.8		{q12-q13}, [ip]!
94	\dround		q10, q11
95	vld1.8		{q10-q11}, [ip]!
96	\dround		q12, q13
97	vld1.8		{q12-q13}, [ip]!
98	\dround		q10, q11
99	blo		0f			@ AES-128: 10 rounds
100	vld1.8		{q10-q11}, [ip]!
101	\dround		q12, q13
102	beq		1f			@ AES-192: 12 rounds
103	vld1.8		{q12-q13}, [ip]
104	\dround		q10, q11
1050:	\fround		q12, q13, q14
106	bx		lr
107
1081:	\fround		q10, q11, q14
109	bx		lr
110	.endm
111
112	/*
113	 * Internal, non-AAPCS compliant functions that implement the core AES
114	 * transforms. These should preserve all registers except q0 - q2 and ip
115	 * Arguments:
116	 *   q0        : first in/output block
117	 *   q1        : second in/output block (_3x version only)
118	 *   q2        : third in/output block (_3x version only)
119	 *   q8        : first round key
120	 *   q9        : secound round key
121	 *   q14       : final round key
122	 *   r2        : address of round key array
123	 *   r3        : number of rounds
124	 */
125	.align		6
126aes_encrypt:
127	add		ip, r2, #32		@ 3rd round key
128.Laes_encrypt_tweak:
129	do_block	enc_dround, enc_fround
130ENDPROC(aes_encrypt)
131
132	.align		6
133aes_decrypt:
134	add		ip, r2, #32		@ 3rd round key
135	do_block	dec_dround, dec_fround
136ENDPROC(aes_decrypt)
137
138	.align		6
139aes_encrypt_3x:
140	add		ip, r2, #32		@ 3rd round key
141	do_block	enc_dround_3x, enc_fround_3x
142ENDPROC(aes_encrypt_3x)
143
144	.align		6
145aes_decrypt_3x:
146	add		ip, r2, #32		@ 3rd round key
147	do_block	dec_dround_3x, dec_fround_3x
148ENDPROC(aes_decrypt_3x)
149
150	.macro		prepare_key, rk, rounds
151	add		ip, \rk, \rounds, lsl #4
152	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
153	vld1.8		{q14}, [ip]		@ load last round key
154	.endm
155
156	/*
157	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
158	 *		   int blocks)
159	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
160	 *		   int blocks)
161	 */
162ENTRY(ce_aes_ecb_encrypt)
163	push		{r4, lr}
164	ldr		r4, [sp, #8]
165	prepare_key	r2, r3
166.Lecbencloop3x:
167	subs		r4, r4, #3
168	bmi		.Lecbenc1x
169	vld1.8		{q0-q1}, [r1]!
170	vld1.8		{q2}, [r1]!
171	bl		aes_encrypt_3x
172	vst1.8		{q0-q1}, [r0]!
173	vst1.8		{q2}, [r0]!
174	b		.Lecbencloop3x
175.Lecbenc1x:
176	adds		r4, r4, #3
177	beq		.Lecbencout
178.Lecbencloop:
179	vld1.8		{q0}, [r1]!
180	bl		aes_encrypt
181	vst1.8		{q0}, [r0]!
182	subs		r4, r4, #1
183	bne		.Lecbencloop
184.Lecbencout:
185	pop		{r4, pc}
186ENDPROC(ce_aes_ecb_encrypt)
187
188ENTRY(ce_aes_ecb_decrypt)
189	push		{r4, lr}
190	ldr		r4, [sp, #8]
191	prepare_key	r2, r3
192.Lecbdecloop3x:
193	subs		r4, r4, #3
194	bmi		.Lecbdec1x
195	vld1.8		{q0-q1}, [r1]!
196	vld1.8		{q2}, [r1]!
197	bl		aes_decrypt_3x
198	vst1.8		{q0-q1}, [r0]!
199	vst1.8		{q2}, [r0]!
200	b		.Lecbdecloop3x
201.Lecbdec1x:
202	adds		r4, r4, #3
203	beq		.Lecbdecout
204.Lecbdecloop:
205	vld1.8		{q0}, [r1]!
206	bl		aes_decrypt
207	vst1.8		{q0}, [r0]!
208	subs		r4, r4, #1
209	bne		.Lecbdecloop
210.Lecbdecout:
211	pop		{r4, pc}
212ENDPROC(ce_aes_ecb_decrypt)
213
214	/*
215	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
216	 *		   int blocks, u8 iv[])
217	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
218	 *		   int blocks, u8 iv[])
219	 */
220ENTRY(ce_aes_cbc_encrypt)
221	push		{r4-r6, lr}
222	ldrd		r4, r5, [sp, #16]
223	vld1.8		{q0}, [r5]
224	prepare_key	r2, r3
225.Lcbcencloop:
226	vld1.8		{q1}, [r1]!		@ get next pt block
227	veor		q0, q0, q1		@ ..and xor with iv
228	bl		aes_encrypt
229	vst1.8		{q0}, [r0]!
230	subs		r4, r4, #1
231	bne		.Lcbcencloop
232	vst1.8		{q0}, [r5]
233	pop		{r4-r6, pc}
234ENDPROC(ce_aes_cbc_encrypt)
235
236ENTRY(ce_aes_cbc_decrypt)
237	push		{r4-r6, lr}
238	ldrd		r4, r5, [sp, #16]
239	vld1.8		{q6}, [r5]		@ keep iv in q6
240	prepare_key	r2, r3
241.Lcbcdecloop3x:
242	subs		r4, r4, #3
243	bmi		.Lcbcdec1x
244	vld1.8		{q0-q1}, [r1]!
245	vld1.8		{q2}, [r1]!
246	vmov		q3, q0
247	vmov		q4, q1
248	vmov		q5, q2
249	bl		aes_decrypt_3x
250	veor		q0, q0, q6
251	veor		q1, q1, q3
252	veor		q2, q2, q4
253	vmov		q6, q5
254	vst1.8		{q0-q1}, [r0]!
255	vst1.8		{q2}, [r0]!
256	b		.Lcbcdecloop3x
257.Lcbcdec1x:
258	adds		r4, r4, #3
259	beq		.Lcbcdecout
260	vmov		q15, q14		@ preserve last round key
261.Lcbcdecloop:
262	vld1.8		{q0}, [r1]!		@ get next ct block
263	veor		q14, q15, q6		@ combine prev ct with last key
264	vmov		q6, q0
265	bl		aes_decrypt
266	vst1.8		{q0}, [r0]!
267	subs		r4, r4, #1
268	bne		.Lcbcdecloop
269.Lcbcdecout:
270	vst1.8		{q6}, [r5]		@ keep iv in q6
271	pop		{r4-r6, pc}
272ENDPROC(ce_aes_cbc_decrypt)
273
274	/*
275	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
276	 *		   int blocks, u8 ctr[])
277	 */
278ENTRY(ce_aes_ctr_encrypt)
279	push		{r4-r6, lr}
280	ldrd		r4, r5, [sp, #16]
281	vld1.8		{q6}, [r5]		@ load ctr
282	prepare_key	r2, r3
283	vmov		r6, s27			@ keep swabbed ctr in r6
284	rev		r6, r6
285	cmn		r6, r4			@ 32 bit overflow?
286	bcs		.Lctrloop
287.Lctrloop3x:
288	subs		r4, r4, #3
289	bmi		.Lctr1x
290	add		r6, r6, #1
291	vmov		q0, q6
292	vmov		q1, q6
293	rev		ip, r6
294	add		r6, r6, #1
295	vmov		q2, q6
296	vmov		s7, ip
297	rev		ip, r6
298	add		r6, r6, #1
299	vmov		s11, ip
300	vld1.8		{q3-q4}, [r1]!
301	vld1.8		{q5}, [r1]!
302	bl		aes_encrypt_3x
303	veor		q0, q0, q3
304	veor		q1, q1, q4
305	veor		q2, q2, q5
306	rev		ip, r6
307	vst1.8		{q0-q1}, [r0]!
308	vst1.8		{q2}, [r0]!
309	vmov		s27, ip
310	b		.Lctrloop3x
311.Lctr1x:
312	adds		r4, r4, #3
313	beq		.Lctrout
314.Lctrloop:
315	vmov		q0, q6
316	bl		aes_encrypt
317
318	adds		r6, r6, #1		@ increment BE ctr
319	rev		ip, r6
320	vmov		s27, ip
321	bcs		.Lctrcarry
322
323.Lctrcarrydone:
324	subs		r4, r4, #1
325	bmi		.Lctrtailblock		@ blocks < 0 means tail block
326	vld1.8		{q3}, [r1]!
327	veor		q3, q0, q3
328	vst1.8		{q3}, [r0]!
329	bne		.Lctrloop
330
331.Lctrout:
332	vst1.8		{q6}, [r5]		@ return next CTR value
333	pop		{r4-r6, pc}
334
335.Lctrtailblock:
336	vst1.8		{q0}, [r0, :64]		@ return the key stream
337	b		.Lctrout
338
339.Lctrcarry:
340	.irp		sreg, s26, s25, s24
341	vmov		ip, \sreg		@ load next word of ctr
342	rev		ip, ip			@ ... to handle the carry
343	adds		ip, ip, #1
344	rev		ip, ip
345	vmov		\sreg, ip
346	bcc		.Lctrcarrydone
347	.endr
348	b		.Lctrcarrydone
349ENDPROC(ce_aes_ctr_encrypt)
350
351	/*
352	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
353	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
354	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
355	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
356	 */
357
358	.macro		next_tweak, out, in, const, tmp
359	vshr.s64	\tmp, \in, #63
360	vand		\tmp, \tmp, \const
361	vadd.u64	\out, \in, \in
362	vext.8		\tmp, \tmp, \tmp, #8
363	veor		\out, \out, \tmp
364	.endm
365
366	.align		3
367.Lxts_mul_x:
368	.quad		1, 0x87
369
370ce_aes_xts_init:
371	vldr		d14, .Lxts_mul_x
372	vldr		d15, .Lxts_mul_x + 8
373
374	ldrd		r4, r5, [sp, #16]	@ load args
375	ldr		r6, [sp, #28]
376	vld1.8		{q0}, [r5]		@ load iv
377	teq		r6, #1			@ start of a block?
378	bxne		lr
379
380	@ Encrypt the IV in q0 with the second AES key. This should only
381	@ be done at the start of a block.
382	ldr		r6, [sp, #24]		@ load AES key 2
383	prepare_key	r6, r3
384	add		ip, r6, #32		@ 3rd round key of key 2
385	b		.Laes_encrypt_tweak	@ tail call
386ENDPROC(ce_aes_xts_init)
387
388ENTRY(ce_aes_xts_encrypt)
389	push		{r4-r6, lr}
390
391	bl		ce_aes_xts_init		@ run shared prologue
392	prepare_key	r2, r3
393	vmov		q3, q0
394
395	teq		r6, #0			@ start of a block?
396	bne		.Lxtsenc3x
397
398.Lxtsencloop3x:
399	next_tweak	q3, q3, q7, q6
400.Lxtsenc3x:
401	subs		r4, r4, #3
402	bmi		.Lxtsenc1x
403	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
404	vld1.8		{q2}, [r1]!
405	next_tweak	q4, q3, q7, q6
406	veor		q0, q0, q3
407	next_tweak	q5, q4, q7, q6
408	veor		q1, q1, q4
409	veor		q2, q2, q5
410	bl		aes_encrypt_3x
411	veor		q0, q0, q3
412	veor		q1, q1, q4
413	veor		q2, q2, q5
414	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
415	vst1.8		{q2}, [r0]!
416	vmov		q3, q5
417	teq		r4, #0
418	beq		.Lxtsencout
419	b		.Lxtsencloop3x
420.Lxtsenc1x:
421	adds		r4, r4, #3
422	beq		.Lxtsencout
423.Lxtsencloop:
424	vld1.8		{q0}, [r1]!
425	veor		q0, q0, q3
426	bl		aes_encrypt
427	veor		q0, q0, q3
428	vst1.8		{q0}, [r0]!
429	subs		r4, r4, #1
430	beq		.Lxtsencout
431	next_tweak	q3, q3, q7, q6
432	b		.Lxtsencloop
433.Lxtsencout:
434	vst1.8		{q3}, [r5]
435	pop		{r4-r6, pc}
436ENDPROC(ce_aes_xts_encrypt)
437
438
439ENTRY(ce_aes_xts_decrypt)
440	push		{r4-r6, lr}
441
442	bl		ce_aes_xts_init		@ run shared prologue
443	prepare_key	r2, r3
444	vmov		q3, q0
445
446	teq		r6, #0			@ start of a block?
447	bne		.Lxtsdec3x
448
449.Lxtsdecloop3x:
450	next_tweak	q3, q3, q7, q6
451.Lxtsdec3x:
452	subs		r4, r4, #3
453	bmi		.Lxtsdec1x
454	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
455	vld1.8		{q2}, [r1]!
456	next_tweak	q4, q3, q7, q6
457	veor		q0, q0, q3
458	next_tweak	q5, q4, q7, q6
459	veor		q1, q1, q4
460	veor		q2, q2, q5
461	bl		aes_decrypt_3x
462	veor		q0, q0, q3
463	veor		q1, q1, q4
464	veor		q2, q2, q5
465	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
466	vst1.8		{q2}, [r0]!
467	vmov		q3, q5
468	teq		r4, #0
469	beq		.Lxtsdecout
470	b		.Lxtsdecloop3x
471.Lxtsdec1x:
472	adds		r4, r4, #3
473	beq		.Lxtsdecout
474.Lxtsdecloop:
475	vld1.8		{q0}, [r1]!
476	veor		q0, q0, q3
477	add		ip, r2, #32		@ 3rd round key
478	bl		aes_decrypt
479	veor		q0, q0, q3
480	vst1.8		{q0}, [r0]!
481	subs		r4, r4, #1
482	beq		.Lxtsdecout
483	next_tweak	q3, q3, q7, q6
484	b		.Lxtsdecloop
485.Lxtsdecout:
486	vst1.8		{q3}, [r5]
487	pop		{r4-r6, pc}
488ENDPROC(ce_aes_xts_decrypt)
489
490	/*
491	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
492	 *                             AES sbox substitution on each byte in
493	 *                             'input'
494	 */
495ENTRY(ce_aes_sub)
496	vdup.32		q1, r0
497	veor		q0, q0, q0
498	aese.8		q0, q1
499	vmov		r0, s0
500	bx		lr
501ENDPROC(ce_aes_sub)
502
503	/*
504	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
505	 *                                        operation on round key *src
506	 */
507ENTRY(ce_aes_invert)
508	vld1.8		{q0}, [r1]
509	aesimc.8	q0, q0
510	vst1.8		{q0}, [r0]
511	bx		lr
512ENDPROC(ce_aes_invert)
513