/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * Listing taken from /openbmc/linux/arch/arm/crypto/aes-ce-core.S
 * (revision 20e2fc42).
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8	@ makes the aese/aesd/aesmc/aesimc mnemonics available
	.align		3

/*
 * enc_round - one full AES encryption round on \state with round key \key.
 * aese does AddRoundKey, SubBytes and ShiftRows; aesmc does MixColumns.
 */
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

/*
 * dec_round - one full AES decryption round on \state with round key \key.
 * aesd does AddRoundKey, InvSubBytes and InvShiftRows; aesimc does
 * InvMixColumns.
 */
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

/* enc_dround - two consecutive encryption rounds on q0 (\key1, then \key2). */
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

/* dec_dround - two consecutive decryption rounds on q0 (\key1, then \key2). */
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

/*
 * enc_fround - closing encryption rounds on q0: a full round with \key1,
 * a round without MixColumns using \key2, then the final key addition
 * with \key3.
 */
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

/*
 * dec_fround - closing decryption rounds on q0: a full round with \key1,
 * a round without InvMixColumns using \key2, then the final key addition
 * with \key3.
 */
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

/*
 * enc_dround_4x - two encryption rounds (\key1, then \key2) applied to the
 * four independent blocks held in q0 - q3.
 */
	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

/*
 * dec_dround_4x - two decryption rounds (\key1, then \key2) applied to the
 * four independent blocks held in q0 - q3.
 */
	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

/*
 * enc_fround_4x - closing encryption rounds for the four blocks in q0 - q3:
 * a full round with \key1, a round without MixColumns using \key2, then
 * the final key addition with \key3.
 */
	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

/*
 * dec_fround_4x - closing decryption rounds for the four blocks in q0 - q3:
 * a full round with \key1, a round without InvMixColumns using \key2, then
 * the final key addition with \key3.
 */
	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

/*
 * do_block - body shared by the internal AES routines.
 *
 * \dround is a macro applying two rounds, \fround a macro applying the
 * closing two rounds plus the final key addition.
 *
 * On entry:
 *   q8, q9    : first and second round key
 *   q14       : final round key
 *   ip        : address of the third round key
 *   r3        : number of rounds; below 12 selects the 10-round path,
 *               12 the 12-round path, anything else the 14-round path
 *
 * Clobbers q10 - q13, ip and the condition flags, and returns to the caller
 * with 'bx lr'.  Round keys are fetched in pairs into q10/q11 and q12/q13
 * alternately, one pair ahead of the rounds that consume them.  The flags
 * set by the initial cmp are still intact at the blo/beq further down since
 * the NEON loads and AES instructions in between do not write them.
 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm

/*
 * Internal, non-AAPCS compliant functions that implement the core AES
 * transforms. These preserve all registers except the data registers
 * (q0, or q0 - q3 for the _4x versions), the scratch round key registers
 * q10 - q13, ip and the condition flags.
 * Arguments:
 *   q0        : first in/output block
 *   q1        : second in/output block (_4x version only)
 *   q2        : third in/output block (_4x version only)
 *   q3        : fourth in/output block (_4x version only)
 *   q8        : first round key
 *   q9        : second round key
 *   q14       : final round key
 *   r2        : address of round key array
 *   r3        : number of rounds
 *
 * .Laes_encrypt_tweak is a second entry point into aes_encrypt for a caller
 * that has already pointed ip at the third round key of the key schedule it
 * wants to use (see ce_aes_xts_init).
 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

/*
 * aes_decrypt - decrypt the single block in q0 in place.
 * Expects q8, q9, q14, r2 and r3 set up as for aes_encrypt; the return is
 * the 'bx lr' inside do_block.
 */
	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

/*
 * aes_encrypt_4x - encrypt the four blocks in q0 - q3 in place.
 * Expects q8, q9, q14, r2 and r3 set up as for aes_encrypt; the return is
 * the 'bx lr' inside do_block.
 */
	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

/*
 * aes_decrypt_4x - decrypt the four blocks in q0 - q3 in place.
 * Expects q8, q9, q14, r2 and r3 set up as for aes_encrypt; the return is
 * the 'bx lr' inside do_block.
 */
	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)

/*
 * prepare_key - load the round keys that stay resident in registers.
 *   \rk     : register holding the address of the round key array
 *   \rounds : register holding the number of rounds
 * Sets q8/q9 to the first two round keys and q14 to the round key at
 * \rk + 16 * \rounds.  Clobbers ip.
 */
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm

/*
 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks)
 *
 * Register use in both routines:
 *   r0 : out, r1 : in (each advanced by 16 per block)
 *   r2 : rk,  r3 : rounds
 *   r4 : blocks, loaded from the stack
 *
 * Blocks are handled four at a time while at least four remain, then one
 * at a time.
 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (first stack argument)
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4		@ undo the last subtraction
	beq		.Lecbencout		@ nothing left
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

/*
 * ce_aes_ecb_decrypt - ECB decryption of 'blocks' 16-byte blocks.
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds, first stack argument : blocks
 * Blocks are handled four at a time while at least four remain, then one
 * at a time.
 */
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (first stack argument)
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4		@ undo the last subtraction
	beq		.Lecbdecout		@ nothing left
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

/*
 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 *
 * Register use in both routines:
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds
 *   r4 : blocks, r5 : iv (both loaded from the stack)
 *
 * On return iv[] holds the last ciphertext block, i.e. the chaining value
 * for a following call.
 *
 * The encryption loop tests the block count only after a block has been
 * processed, so it must be entered with blocks >= 1.
 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ last ct block is the next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

/*
 * ce_aes_cbc_decrypt - CBC decryption of 'blocks' 16-byte blocks.
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds
 *   r4 : blocks, r5 : iv (both loaded from the stack)
 *
 * q15 carries the chaining value (iv, then the previous ciphertext block)
 * and is written back to iv[] on exit.
 *
 * In the single-block loop the chaining value is folded into the final
 * round key (q14 = q15 ^ q6), so the closing veor inside aes_decrypt
 * performs the last key addition and the CBC xor in one step.  q6 keeps
 * the unmodified final round key for that purpose.
 */
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x		@ fewer than 4 blocks left
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0			@ keep the ct blocks for chaining
	vmov		q5, q1
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15		@ xor each block with the previous ct
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7			@ last ct block chains into the next round
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4		@ undo the last subtraction
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ return the next iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)


/*
 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
 *			  int rounds, int bytes, u8 const iv[])
 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
 *			  int rounds, int bytes, u8 const iv[])
 *
 * CBC with ciphertext stealing for the tail of a message.  Both routines
 * read one 16-byte block at in[0] and one 16-byte block ending at
 * in[bytes], and write the same two (overlapping) windows of out[].
 * NOTE(review): this implies 16 < bytes <= 32; confirm against the callers.
 *
 * Register use:
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds
 *   r4 : bytes - 16, r5 : iv
 *   q5 : permute vector read at .Lcts_permute_table + r4
 *   q6 : permute vector read at .Lcts_permute_table + 32 - r4
 */

ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = offset of the final window
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]			@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1			@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10		@ q2 = q0 permuted by q5
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12		@ q1 = q3 permuted by q6
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)

/*
 * ce_aes_cbc_cts_decrypt - CBC decryption with ciphertext stealing.
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds
 *   r4 : bytes - 16, r5 : iv (bytes and iv are loaded from the stack)
 *   q5 : permute vector read at .Lcts_permute_table + r4
 *   q6 : permute vector read at .Lcts_permute_table + 32 - r4
 * Reads one 16-byte block at in[0] and one ending at in[bytes], and writes
 * the same two overlapping windows of out[].
 */
ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = bytes, r5 = iv

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16		@ r4 = offset of the final window
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]			@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10		@ q2 = q0 permuted by q5
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12		@ merge q1 into q0 via q6
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3			@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)


/*
 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
 *		      int blocks, u8 ctr[])
 *
 * Register use:
 *   r0 : out, r1 : in, r2 : rk, r3 : rounds
 *   r4 : blocks, r5 : ctr (both loaded from the stack)
 *   r6 : last 32-bit word of the counter block, byte-swapped into CPU order
 *   q7 : current counter block
 *
 * If adding 'blocks' to r6 carries out of 32 bits, the four-way loop is
 * skipped and every block goes through the single-block loop, which
 * propagates the carry into the remaining counter words.
 *
 * A negative block count in the single-block loop selects the tail path:
 * one key stream block is produced and stored to out[] without reading
 * in[].  That store carries a :64 alignment qualifier on r0.
 *
 * On return ctr[] holds the next counter value.
 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x			@ fewer than 4 blocks left
	add		r6, r6, #1
	vmov		q0, q7			@ q0 = ctr
	vmov		q1, q7
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q7
	vmov		s7, ip			@ q1 = ctr + 1
	rev		ip, r6
	add		r6, r6, #1
	vmov		q3, q7
	vmov		s11, ip			@ q2 = ctr + 2
	rev		ip, r6
	add		r6, r6, #1
	vmov		s15, ip			@ q3 = ctr + 3
	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!		@ q7 is taken by the counter
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip			@ q7 = ctr + 4
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4		@ undo the last subtraction
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry		@ carry out of the adds above

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop		@ flags still from the subs above

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)

/*
 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 */

/*
 * next_tweak - derive the following XTS tweak: \out = \in multiplied by x
 * in GF(2^128).
 *   \const : mask vector with 1 in the low and 0x87 in the high 64-bit lane
 *            (composed at the top of ce_aes_xts_init)
 *   \tmp   : scratch q register, clobbered
 * Each 64-bit lane is doubled; the bit shifted out of the low lane is
 * carried into the high lane and the bit shifted out of the high lane is
 * folded back into the low lane as 0x87.
 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ all-ones lane where the top bit is set
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in		@ shift each lane left by one
	vext.8		\tmp, \tmp, \tmp, #8	@ swap the two lanes
	veor		\out, \out, \tmp
	.endm

/*
 * ce_aes_xts_init - prologue shared by ce_aes_xts_encrypt/decrypt.
 *
 * Must be entered with 'bl' right after the caller's 'push {r4-r6, lr}',
 * because the stack arguments are addressed relative to that frame.
 *
 * On return:
 *   q15 : tweak mask vector for next_tweak
 *   r4  : bytes, r5 : iv
 *   q0  : iv[] as loaded when first != 1, otherwise iv[] encrypted with rk2
 *   r6  : 'first' when first != 1, otherwise the address of rk2
 * When first == 1 this also clobbers q8, q9, q14, ip and whatever
 * aes_encrypt clobbers; the return to the caller then happens through the
 * 'bx lr' at the end of aes_encrypt.
 */
ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

/*
 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 *
 * Register use after ce_aes_xts_init:
 *   r0 : out, r1 : in, r2 : rk1, r3 : rounds
 *   r4 : bytes still to process, r5 : iv
 *   q4 : current tweak, q5 - q7 : following tweaks in the four-way loop
 *   q15: tweak mask vector
 *
 * Whole groups of four blocks go through the four-way loop, remaining
 * whole blocks through the single-block loop.  A trailing partial block is
 * handled with ciphertext stealing, which merges it with the last full
 * ciphertext block.  NOTE(review): the .LxtsencctsNx path takes that block
 * from q3, so it relies on a four-way iteration having run before it.
 *
 * On return iv[] holds the tweak used for the last block.  A call with
 * first == 0 advances that tweak once before using it.
 *
 * With bytes == 0 nothing is written to out[].
 */
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x		@ fewer than 64 bytes left
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64		@ undo the last subtraction
	beq		.Lxtsencret		@ bytes == 0: no block to store
	subs		r4, r4, #16
	bmi		.LxtsencctsNx		@ less than one block left
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts		@ flags still from the subs above
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]		@ return the last tweak
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3			@ last ct block of the 4x batch
	sub		r0, r0, #16		@ ... and where it was stored
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0			@ makes the 1x path stop after this block
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)


/*
 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
 *		      int bytes, u8 iv[], u32 const rk2[], int first)
 *
 * Register use after ce_aes_xts_init:
 *   r0 : out, r1 : in, r2 : rk1, r3 : rounds
 *   r4 : bytes still to process, r5 : iv
 *   q4 : current tweak, q5 - q7 : following tweaks in the four-way loop
 *   q15: tweak mask vector
 *
 * When bytes is not a multiple of 16, one block is held back from the
 * regular loops so that the last full block and the trailing partial block
 * are handled together by the ciphertext stealing path.  That path
 * decrypts the last full block with the tweak after the current one (q5)
 * and the merged block with the current one (q4).
 *
 * On return iv[] holds the contents of q4.
 */
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x		@ fewer than 64 bytes left
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64		@ undo the last subtraction
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts		@ flags still from the preceding subs
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]		@ write the tweak back to iv[]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0			@ makes the 1x path stop after this block
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)

/*
 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
 *                             AES sbox substitution on each byte in
 *                             'input'
 *
 * The input word is replicated into all four lanes of q1 and q0 is
 * zeroed, so aese operates on a state consisting of four copies of
 * 'input'.  The result is read back from the first lane.
 * Clobbers q0 and q1.
 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0		@ q0 = 0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

/*
 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
 *                                        operation on round key *src
 *
 * Reads 16 bytes at src (r1), writes 16 bytes at dst (r0).  Clobbers q0.
 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)

/*
 * Index vectors for the vtbl/vtbx instructions in the ciphertext stealing
 * code: 16 bytes of 0xff, the identity permutation 0..15, then 16 more
 * bytes of 0xff.  The CTS code reads 16-byte windows at byte offsets n and
 * 32 - n.  An index of 0xff is out of range for a 16-byte table, so vtbl
 * writes zero for it and vtbx leaves the destination byte unchanged.
 */
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
