/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
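
	/*
	 * Note: AESE/AESD fold AddRoundKey and the (Inv)SubBytes and
	 * (Inv)ShiftRows steps into a single instruction, with MixColumns
	 * (AESMC/AESIMC) issued separately, so each macro above performs
	 * exactly one full AES round.
	 */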

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm
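
	/*
	 * In C terms, do_block dispatches on the round count in r3:
	 *
	 *	if (rounds < 12)	// AES-128: 10 rounds
	 *	else if (rounds == 12)	// AES-192: 12 rounds
	 *	else			// AES-256: 14 rounds
	 *
	 * with the first two round keys preloaded in q8/q9 and the final
	 * round key in q14 (see prepare_key below).
	 */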

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q3,
	 * q10 - q13 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm
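
	/*
	 * The key schedule is laid out as (rounds + 1) consecutive 16-byte
	 * round keys, so the last one lives at rk + rounds * 16; e.g. for
	 * AES-256 (rounds == 14), ip above ends up at rk + 224.
	 */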

	/*
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
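
	/*
	 * Shape of the ECB loops above, as a C-level sketch (illustrative
	 * names, not kernel API):
	 *
	 *	while ((blocks -= 4) >= 0)
	 *		do_4_blocks();
	 *	blocks += 4;
	 *	while (blocks--)
	 *		do_1_block();
	 */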

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
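
	/*
	 * CBC encryption is inherently serial (each ciphertext block is
	 * chained into the next), so unlike the decrypt path below it
	 * cannot be parallelised four blocks at a time.
	 */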

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0
	vmov		q5, q1
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4
	beq		.Lcbcdecout
	vmov		q6, q14			@ preserve last round key
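	/*
	 * The final round of aes_decrypt ends with a veor against q14, so
	 * folding (previous ciphertext XOR last round key) into q14 makes
	 * the core transform emit the CBC-xored plaintext directly, saving
	 * one veor per block.
	 */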
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ return iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)


	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 */

ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]			@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1			@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)

ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]			@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3			@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)


	/*
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q7
	vmov		q1, q7
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q7
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		q3, q7
	vmov		s11, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		s15, ip
	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
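
	/*
	 * The 16-byte counter in q7 is big endian; r6 mirrors its low word
	 * so the common case is a cheap scalar increment, and .Lctrcarry
	 * ripples any carry into the upper words. A minimal C sketch of the
	 * full increment (not kernel code):
	 *
	 *	static void be128_inc(u8 ctr[16])
	 *	{
	 *		int i;
	 *
	 *		for (i = 15; i >= 0 && !++ctr[i]; i--)
	 *			;	// keep going while a byte wraps to 0
	 *	}
	 */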

	/*
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
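
	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128) with
	 * the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 (0x87).
	 * Equivalent C on two little-endian 64-bit words (a sketch, not
	 * kernel code):
	 *
	 *	void next_tweak(u64 t[2])
	 *	{
	 *		u64 carry = (t[1] >> 63) ? 0x87 : 0;
	 *
	 *		t[1] = (t[1] << 1) | (t[0] >> 63);
	 *		t[0] = (t[0] << 1) ^ carry;
	 *	}
	 */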

ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7
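	@ q15 now holds { d30 = 1, d31 = 0x87 }: after the vext lane swap in
	@ next_tweak, 0x87 reduces the low word and 1 carries into the high word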

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3
	sub		r0, r0, #16
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)


ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)
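
	/*
	 * With an all-zero state, AESE reduces to SubBytes (ShiftRows is a
	 * no-op when all four columns are equal), so ce_aes_sub implements
	 * SubWord() for the key expansion. A scalar sketch, assuming an
	 * aes_sbox[256] table (illustrative name):
	 *
	 *	u32 sub_word(u32 w)
	 *	{
	 *		return aes_sbox[w & 0xff] |
	 *		       aes_sbox[(w >> 8) & 0xff] << 8 |
	 *		       aes_sbox[(w >> 16) & 0xff] << 16 |
	 *		       aes_sbox[w >> 24] << 24;
	 *	}
	 */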

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)
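
	/*
	 * ce_aes_invert supports the equivalent inverse cipher: AESD/AESIMC
	 * expect the decryption round keys to be the InvMixColumns transform
	 * of the corresponding encryption round keys.
	 */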

	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
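
	/*
	 * Worked example: for a 20-byte CTS message, r4 == 4 above, so the
	 * index vector loaded at .Lcts_permute_table + 4 is
	 * { 0xff x 12, 0x0, 0x1, 0x2, 0x3 } and the one loaded at
	 * .Lcts_permute_table + 28 is { 0xc, 0xd, 0xe, 0xf, 0xff x 12 }.
	 * vtbl turns out-of-range 0xff indexes into zero bytes, which
	 * implements the partial-block shuffle without data-dependent
	 * branches.
	 */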
701