xref: /openbmc/linux/arch/arm64/crypto/aes-modes.S (revision a2cce7a9)
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
/* The mode routines below are emitted into the .text section. */
13	.text
14	.align		4	/* NOTE(review): GAS takes a power of two here on AArch64, so presumably 16-byte alignment -- confirm */
15
16/*
17 * There are several ways to instantiate this code:
18 * - no interleave, all inline
19 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23 *
24 * Macros imported by this code:
25 * - enc_prepare	- setup NEON registers for encryption
26 * - dec_prepare	- setup NEON registers for decryption
27 * - enc_switch_key	- change to new key after having prepared for encryption
28 * - encrypt_block	- encrypt a single block
29 * - decrypt_block	- decrypt a single block
30 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34 */
35
/*
 * Interleave plumbing.
 *
 * INTERLEAVE defined, INTERLEAVE_INLINE not defined:
 *   the N-way block transform is emitted once as a local subroutine and the
 *   do_*_blockNx macros reach it with 'bl'. Because 'bl' overwrites x30,
 *   FRAME_PUSH / FRAME_POP save and restore x29/x30 around each mode routine.
 *
 * Any other configuration:
 *   the do_*_blockNx macros expand the transform inline and
 *   FRAME_PUSH / FRAME_POP expand to nothing.
 *
 * Every variant works in place on v0-v1 (2x) or v0-v3 (4x) and forwards
 * w3, x2, x6, w7 to the imported *_blockNx macro. In the callers below
 * w3 and x2 carry 'rounds' and the round-key pointer; x6 and w7 are
 * presumably scratch for the macro -- confirm against its definition.
 */
36#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
37#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
38#define FRAME_POP	ldp x29, x30, [sp],#16
39
40#if INTERLEAVE == 2
41
/* Out-of-line 2-way encrypt of v0, v1 (in place). */
42aes_encrypt_block2x:
43	encrypt_block2x	v0, v1, w3, x2, x6, w7
44	ret
45ENDPROC(aes_encrypt_block2x)
46
/* Out-of-line 2-way decrypt of v0, v1 (in place). */
47aes_decrypt_block2x:
48	decrypt_block2x	v0, v1, w3, x2, x6, w7
49	ret
50ENDPROC(aes_decrypt_block2x)
51
52#elif INTERLEAVE == 4
53
/* Out-of-line 4-way encrypt of v0-v3 (in place). */
54aes_encrypt_block4x:
55	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
56	ret
57ENDPROC(aes_encrypt_block4x)
58
/* Out-of-line 4-way decrypt of v0-v3 (in place). */
59aes_decrypt_block4x:
60	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
61	ret
62ENDPROC(aes_decrypt_block4x)
63
64#else
65#error INTERLEAVE should equal 2 or 4
66#endif
67
	/*
	 * Call wrappers. All four are defined regardless of INTERLEAVE; only
	 * the pair matching the helpers emitted above is expanded by the
	 * mode routines in this file.
	 */
68	.macro		do_encrypt_block2x
69	bl		aes_encrypt_block2x
70	.endm
71
72	.macro		do_decrypt_block2x
73	bl		aes_decrypt_block2x
74	.endm
75
76	.macro		do_encrypt_block4x
77	bl		aes_encrypt_block4x
78	.endm
79
80	.macro		do_decrypt_block4x
81	bl		aes_decrypt_block4x
82	.endm
83
84#else
	/* No subroutine calls are made, so no frame record is needed. */
85#define FRAME_PUSH
86#define FRAME_POP
87
	/* Inline expansions with the same register assignment as the helpers above. */
88	.macro		do_encrypt_block2x
89	encrypt_block2x	v0, v1, w3, x2, x6, w7
90	.endm
91
92	.macro		do_decrypt_block2x
93	decrypt_block2x	v0, v1, w3, x2, x6, w7
94	.endm
95
96	.macro		do_encrypt_block4x
97	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
98	.endm
99
100	.macro		do_decrypt_block4x
101	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
102	.endm
103
104#endif
105
106	/*
107	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108	 *		   int blocks, int first)
109	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110	 *		   int blocks, int first)
111	 */
112
/*
 * aes_ecb_encrypt: ECB-encrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below (see the prototype above):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * enc_prepare runs only when 'first' is non-zero. After 'first' has been
 * tested, x5 and w6 are handed to the imported macros (presumably as
 * scratch -- confirm against the macro definitions).
 *
 * With INTERLEAVE defined, blocks are processed INTERLEAVE at a time while
 * enough remain, then one at a time. The single-block loop tests the count
 * only after processing a block, so without INTERLEAVE the routine relies
 * on 'blocks' being non-zero.
 */
113AES_ENTRY(aes_ecb_encrypt)
114	FRAME_PUSH
115	cbz		w5, .LecbencloopNx	/* first == 0: skip enc_prepare */
116
117	enc_prepare	w3, x2, x5
118
119.LecbencloopNx:
120#if INTERLEAVE >= 2
121	subs		w4, w4, #INTERLEAVE
122	bmi		.Lecbenc1x		/* fewer than INTERLEAVE blocks left */
123#if INTERLEAVE == 2
124	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
125	do_encrypt_block2x
126	st1		{v0.16b-v1.16b}, [x0], #32
127#else
128	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
129	do_encrypt_block4x
130	st1		{v0.16b-v3.16b}, [x0], #64
131#endif
132	b		.LecbencloopNx
133.Lecbenc1x:
134	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
135	beq		.Lecbencout		/* nothing left over */
136#endif
137.Lecbencloop:
138	ld1		{v0.16b}, [x1], #16		/* get next pt block */
139	encrypt_block	v0, w3, x2, x5, w6
140	st1		{v0.16b}, [x0], #16
141	subs		w4, w4, #1
142	bne		.Lecbencloop
143.Lecbencout:
144	FRAME_POP
145	ret
146AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * aes_ecb_decrypt: ECB-decrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below (see the prototype above aes_ecb_encrypt):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * Same structure as aes_ecb_encrypt, using dec_prepare and the decrypt
 * macros: dec_prepare runs only when 'first' is non-zero, an INTERLEAVE-wide
 * loop runs while enough blocks remain, and a bottom-tested single-block
 * loop handles the rest (so without INTERLEAVE 'blocks' must be non-zero).
 */
149AES_ENTRY(aes_ecb_decrypt)
150	FRAME_PUSH
151	cbz		w5, .LecbdecloopNx	/* first == 0: skip dec_prepare */
152
153	dec_prepare	w3, x2, x5
154
155.LecbdecloopNx:
156#if INTERLEAVE >= 2
157	subs		w4, w4, #INTERLEAVE
158	bmi		.Lecbdec1x		/* fewer than INTERLEAVE blocks left */
159#if INTERLEAVE == 2
160	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
161	do_decrypt_block2x
162	st1		{v0.16b-v1.16b}, [x0], #32
163#else
164	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
165	do_decrypt_block4x
166	st1		{v0.16b-v3.16b}, [x0], #64
167#endif
168	b		.LecbdecloopNx
169.Lecbdec1x:
170	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
171	beq		.Lecbdecout		/* nothing left over */
172#endif
173.Lecbdecloop:
174	ld1		{v0.16b}, [x1], #16		/* get next ct block */
175	decrypt_block	v0, w3, x2, x5, w6
176	st1		{v0.16b}, [x0], #16
177	subs		w4, w4, #1
178	bne		.Lecbdecloop
179.Lecbdecout:
180	FRAME_POP
181	ret
182AES_ENDPROC(aes_ecb_decrypt)
183
184
185	/*
186	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187	 *		   int blocks, u8 iv[], int first)
188	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189	 *		   int blocks, u8 iv[], int first)
190	 */
191
/*
 * aes_cbc_encrypt: CBC-encrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below (see the prototype above):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv, w6 = first
 *
 * v0 carries the chaining value. It is loaded from [x5] only when 'first'
 * is non-zero; otherwise the loop starts from whatever v0 holds on entry
 * (presumably the last ciphertext block of the previous call -- this
 * relies on the caller keeping NEON state between calls; confirm).
 * The IV is not written back through x5.
 *
 * CBC encryption is inherently serial, so there is no interleaved path and
 * no FRAME_PUSH/FRAME_POP. The loop is bottom-tested: 'blocks' must be
 * non-zero.
 */
192AES_ENTRY(aes_cbc_encrypt)
193	cbz		w6, .Lcbcencloop	/* first == 0: keep v0, skip key setup */
194
195	ld1		{v0.16b}, [x5]			/* get iv */
196	enc_prepare	w3, x2, x5		/* x5 is passed to the macro once the iv has been read */
197
198.Lcbcencloop:
199	ld1		{v1.16b}, [x1], #16		/* get next pt block */
200	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
201	encrypt_block	v0, w3, x2, x5, w6	/* v0 = ct, and the next chaining value */
202	st1		{v0.16b}, [x0], #16
203	subs		w4, w4, #1
204	bne		.Lcbcencloop
205	ret
206AES_ENDPROC(aes_cbc_encrypt)
207
208
/*
 * aes_cbc_decrypt: CBC-decrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below (see the prototype above aes_cbc_encrypt):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv, w6 = first
 *
 * v7 carries the chaining value (previous ciphertext block, or the IV). It
 * is loaded from [x5] only when 'first' is non-zero; otherwise it is taken
 * as found on entry (presumably left by the previous call -- confirm that
 * the caller preserves NEON state between calls).
 *
 * Each path keeps a copy of the ciphertext before decrypting in place,
 * because each plaintext block is the decrypted block xor the preceding
 * ciphertext block.
 */
209AES_ENTRY(aes_cbc_decrypt)
210	FRAME_PUSH
211	cbz		w6, .LcbcdecloopNx	/* first == 0: keep v7, skip key setup */
212
213	ld1		{v7.16b}, [x5]			/* get iv */
214	dec_prepare	w3, x2, x5
215
216.LcbcdecloopNx:
217#if INTERLEAVE >= 2
218	subs		w4, w4, #INTERLEAVE
219	bmi		.Lcbcdec1x		/* fewer than INTERLEAVE blocks left */
220#if INTERLEAVE == 2
221	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
222	mov		v2.16b, v0.16b		/* keep ct0 and ct1 for chaining */
223	mov		v3.16b, v1.16b
224	do_decrypt_block2x
225	eor		v0.16b, v0.16b, v7.16b	/* pt0 = D(ct0) ^ previous ct/iv */
226	eor		v1.16b, v1.16b, v2.16b	/* pt1 = D(ct1) ^ ct0 */
227	mov		v7.16b, v3.16b		/* ct1 chains into the next iteration */
228	st1		{v0.16b-v1.16b}, [x0], #32
229#else
230	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
231	mov		v4.16b, v0.16b		/* keep ct0..ct2; ct3 is re-read from memory below */
232	mov		v5.16b, v1.16b
233	mov		v6.16b, v2.16b
234	do_decrypt_block4x
235	sub		x1, x1, #16		/* step back to ct3 */
236	eor		v0.16b, v0.16b, v7.16b	/* uses the old v7 before it is replaced */
237	eor		v1.16b, v1.16b, v4.16b
238	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
239	eor		v2.16b, v2.16b, v5.16b
240	eor		v3.16b, v3.16b, v6.16b
241	st1		{v0.16b-v3.16b}, [x0], #64
242#endif
243	b		.LcbcdecloopNx
244.Lcbcdec1x:
245	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
246	beq		.Lcbcdecout		/* nothing left over */
247#endif
248.Lcbcdecloop:
249	ld1		{v1.16b}, [x1], #16		/* get next ct block */
250	mov		v0.16b, v1.16b			/* ...and copy to v0 */
251	decrypt_block	v0, w3, x2, x5, w6
252	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
253	mov		v7.16b, v1.16b			/* ct is next iv */
254	st1		{v0.16b}, [x0], #16
255	subs		w4, w4, #1
256	bne		.Lcbcdecloop
257.Lcbcdecout:
258	FRAME_POP
259	ret
260AES_ENDPROC(aes_cbc_decrypt)
261
262
263	/*
264	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265	 *		   int blocks, u8 ctr[], int first)
266	 */
267
/*
 * aes_ctr_encrypt: CTR-mode keystream generation and xor.
 *
 * Arguments as consumed below (see the prototype above):
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr, w6 = first
 *
 * Register roles inside the routine:
 *   v4 = current counter block, in memory (big-endian) byte order
 *   x5 = bytes 8..15 of the counter block, byte-reversed into a native
 *        integer (x5 stops being the ctr pointer once the block is loaded)
 *   x6/w7, x7 = scratch
 *
 * On a non-first call v4 is not reloaded from memory; it is taken as found
 * on entry and incremented once before use, which matches the exit paths
 * below that leave v4 equal to the last counter value consumed.
 * (This relies on the caller keeping NEON state between calls -- confirm.)
 *
 * The interleaved paths only add to the low bits of the counter, so they
 * are used only when the low 32 bits of the counter plus 'blocks' do not
 * carry ('cmn w5, w4'). Otherwise the single-block path is taken, which
 * propagates the carry through all 128 bits.
 *
 * A negative 'blocks' requests a half block: one keystream block is
 * generated and only 8 bytes are read, xored and written.
 */
268AES_ENTRY(aes_ctr_encrypt)
269	FRAME_PUSH
270	cbnz		w6, .Lctrfirst		/* 1st time around? */
271	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
272	rev		x5, x5
273#if INTERLEAVE >= 2
274	cmn		w5, w4			/* 32 bit overflow? */
275	bcs		.Lctrinc		/* yes: step the counter on the carry-aware path */
276	add		x5, x5, #1		/* increment BE ctr */
277	b		.LctrincNx
278#else
279	b		.Lctrinc
280#endif
281.Lctrfirst:
282	enc_prepare	w3, x2, x6
283	ld1		{v4.16b}, [x5]		/* load the initial counter block */
284	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
285	rev		x5, x5
286#if INTERLEAVE >= 2
287	cmn		w5, w4			/* 32 bit overflow? */
288	bcs		.Lctrloop		/* yes: use the single-block path throughout */
289.LctrloopNx:
290	subs		w4, w4, #INTERLEAVE
291	bmi		.Lctr1x			/* fewer than INTERLEAVE blocks left */
292#if INTERLEAVE == 2
	/* v0/v1 = upper counter half from v4 + swabbed x5 and x5 + 1 */
293	mov		v0.8b, v4.8b
294	mov		v1.8b, v4.8b
295	rev		x7, x5
296	add		x5, x5, #1
297	ins		v0.d[1], x7
298	rev		x7, x5
299	add		x5, x5, #1
300	ins		v1.d[1], x7
301	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
302	do_encrypt_block2x
303	eor		v0.16b, v0.16b, v2.16b
304	eor		v1.16b, v1.16b, v3.16b
305	st1		{v0.16b-v1.16b}, [x0], #32
306#else
	/*
	 * v0 = counter block as is; v1..v3 = same block with the last 32-bit
	 * word replaced by the byte-swapped values ctr+1, ctr+2, ctr+3.
	 */
307	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
308	dup		v7.4s, w5
309	mov		v0.16b, v4.16b
310	add		v7.4s, v7.4s, v8.4s
311	mov		v1.16b, v4.16b
312	rev32		v8.16b, v7.16b		/* back to big-endian byte order per word */
313	mov		v2.16b, v4.16b
314	mov		v3.16b, v4.16b
315	mov		v1.s[3], v8.s[0]
316	mov		v2.s[3], v8.s[1]
317	mov		v3.s[3], v8.s[2]
318	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
319	do_encrypt_block4x
320	eor		v0.16b, v5.16b, v0.16b
321	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
322	eor		v1.16b, v6.16b, v1.16b
323	eor		v2.16b, v7.16b, v2.16b
324	eor		v3.16b, v5.16b, v3.16b
325	st1		{v0.16b-v3.16b}, [x0], #64
326	add		x5, x5, #INTERLEAVE
327#endif
328	cbz		w4, .LctroutNx		/* exact multiple of INTERLEAVE: done */
329.LctrincNx:
330	rev		x7, x5			/* publish the next counter value into v4 */
331	ins		v4.d[1], x7
332	b		.LctrloopNx
333.LctroutNx:
334	sub		x5, x5, #1		/* leave v4 at the last counter value used */
335	rev		x7, x5
336	ins		v4.d[1], x7
337	b		.Lctrout
338.Lctr1x:
339	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
340	beq		.Lctrout
341#endif
342.Lctrloop:
343	mov		v0.16b, v4.16b
344	encrypt_block	v0, w3, x2, x6, w7	/* v0 = keystream block */
345	subs		w4, w4, #1
346	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
347	ld1		{v3.16b}, [x1], #16
348	eor		v3.16b, v0.16b, v3.16b
349	st1		{v3.16b}, [x0], #16
350	beq		.Lctrout		/* still testing the flags set by 'subs' above */
351.Lctrinc:
352	adds		x5, x5, #1		/* increment BE ctr */
353	rev		x7, x5
354	ins		v4.d[1], x7
355	bcc		.Lctrloop		/* no overflow? */
356	umov		x7, v4.d[0]		/* load upper word of ctr  */
357	rev		x7, x7			/* ... to handle the carry */
358	add		x7, x7, #1
359	rev		x7, x7
360	ins		v4.d[0], x7
361	b		.Lctrloop
362.Lctrhalfblock:
363	ld1		{v3.8b}, [x1]		/* only 8 bytes are read ... */
364	eor		v3.8b, v0.8b, v3.8b
365	st1		{v3.8b}, [x0]		/* ... and only 8 bytes are written */
366.Lctrout:
367	FRAME_POP
368	ret
369AES_ENDPROC(aes_ctr_encrypt)
370	.ltorg				/* literal pool for the 'ldr q8, =...' above */
371
372
373	/*
374	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
376	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
378	 */
379
/*
 * next_tweak out, in, const, tmp
 *
 * Computes the next XTS tweak: \out = \in multiplied by x in GF(2^128),
 * treating the register as two 64-bit lanes.
 *
 *   \const  must hold 1 in 64-bit lane 0 and 0x87 in 64-bit lane 1
 *   \tmp    is clobbered
 *
 * \out may be the same register as \in. \out may also be the same register
 * as \const (the constant is read before \out is written), but the constant
 * is then destroyed and must be reloaded before the next use.
 */
380	.macro		next_tweak, out, in, const, tmp
381	sshr		\tmp\().2d,  \in\().2d,   #63		/* each lane: all ones if its top bit is set */
382	and		\tmp\().16b, \tmp\().16b, \const\().16b	/* lane 0 -> 1, lane 1 -> 0x87, when set */
383	add		\out\().2d,  \in\().2d,   \in\().2d		/* shift each lane left by one bit */
384	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	/* swap lanes: carries cross over */
385	eor		\out\().16b, \out\().16b, \tmp\().16b	/* low->high carry, and 0x87 reduction into low */
386	.endm
387
/*
 * Constant for next_tweak: 1 in 64-bit lane 0 (carry from the low half into
 * the high half) and 0x87 in 64-bit lane 1 (reduction term folded into the
 * low half).
 *
 * It is read with 'ldr q7, .Lxts_mul_x', a single 128-bit access. On a
 * big-endian build that access takes the lowest-addressed byte as the most
 * significant one, so the data must be laid out with the two 64-bit values
 * swapped for the lanes to come out as {1, 0x87}. The little-endian layout
 * is unchanged.
 */
388.Lxts_mul_x:
#ifdef __AARCH64EB__
	.quad		0x87, 1
#else
389	.word		1, 0, 0x87, 0
#endif
390
/*
 * aes_xts_encrypt: XTS-encrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below:
 *   x0 = out, x1 = in, x2 = rk1 (data key), w3 = rounds, w4 = blocks,
 *   x5 = rk2 (tweak key), x6 = iv, w7 = first
 *
 * Register roles inside the routine:
 *   v4 = current tweak; on every exit path it is the last tweak used
 *   v7 = multiplier constant loaded from .Lxts_mul_x
 *   v8 = scratch for next_tweak
 *   x6/w7 = handed to the imported macros once the iv has been read and
 *           'first' tested (presumably scratch -- confirm)
 *
 * First call: the iv is encrypted with rk2 to form the initial tweak, then
 * the key is switched to rk1. Later calls: v4 is taken as found on entry
 * and advanced once before use (this relies on the caller keeping NEON
 * state between calls -- confirm).
 *
 * The single-block loop is bottom-tested, so without INTERLEAVE 'blocks'
 * must be non-zero.
 */
391AES_ENTRY(aes_xts_encrypt)
392	FRAME_PUSH
393	cbz		w7, .LxtsencloopNx	/* first == 0: continue from the tweak in v4 */
394
395	ld1		{v4.16b}, [x6]
396	enc_prepare	w3, x5, x6
397	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
398	enc_switch_key	w3, x2, x6
399	ldr		q7, .Lxts_mul_x
400	b		.LxtsencNx
401
402.LxtsencloopNx:
403	ldr		q7, .Lxts_mul_x		/* (re)load: the 4x path below overwrites v7 */
404	next_tweak	v4, v4, v7, v8
405.LxtsencNx:
406#if INTERLEAVE >= 2
407	subs		w4, w4, #INTERLEAVE
408	bmi		.Lxtsenc1x		/* fewer than INTERLEAVE blocks left */
409#if INTERLEAVE == 2
410	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
411	next_tweak	v5, v4, v7, v8
412	eor		v0.16b, v0.16b, v4.16b	/* xor with tweak before ... */
413	eor		v1.16b, v1.16b, v5.16b
414	do_encrypt_block2x
415	eor		v0.16b, v0.16b, v4.16b	/* ... and after the block cipher */
416	eor		v1.16b, v1.16b, v5.16b
417	st1		{v0.16b-v1.16b}, [x0], #32
418	cbz		w4, .LxtsencoutNx
419	next_tweak	v4, v5, v7, v8
420	b		.LxtsencNx
421.LxtsencoutNx:
422	mov		v4.16b, v5.16b		/* leave the last tweak used in v4 */
423	b		.Lxtsencout
424#else
425	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
426	next_tweak	v5, v4, v7, v8
427	eor		v0.16b, v0.16b, v4.16b
428	next_tweak	v6, v5, v7, v8
429	eor		v1.16b, v1.16b, v5.16b
430	eor		v2.16b, v2.16b, v6.16b
431	next_tweak	v7, v6, v7, v8		/* 4th tweak replaces the constant in v7 */
432	eor		v3.16b, v3.16b, v7.16b
433	do_encrypt_block4x
434	eor		v3.16b, v3.16b, v7.16b
435	eor		v0.16b, v0.16b, v4.16b
436	eor		v1.16b, v1.16b, v5.16b
437	eor		v2.16b, v2.16b, v6.16b
438	st1		{v0.16b-v3.16b}, [x0], #64
439	mov		v4.16b, v7.16b		/* v4 = last tweak used */
440	cbz		w4, .Lxtsencout
441	b		.LxtsencloopNx		/* reloads v7 and advances v4 */
442#endif
443.Lxtsenc1x:
444	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
445	beq		.Lxtsencout
446#endif
447.Lxtsencloop:
448	ld1		{v1.16b}, [x1], #16
449	eor		v0.16b, v1.16b, v4.16b
450	encrypt_block	v0, w3, x2, x6, w7
451	eor		v0.16b, v0.16b, v4.16b
452	st1		{v0.16b}, [x0], #16
453	subs		w4, w4, #1
454	beq		.Lxtsencout		/* exit without advancing the tweak */
455	next_tweak	v4, v4, v7, v8
456	b		.Lxtsencloop
457.Lxtsencout:
458	FRAME_POP
459	ret
460AES_ENDPROC(aes_xts_encrypt)
461
462
/*
 * aes_xts_decrypt: XTS-decrypt 'blocks' 16-byte blocks.
 *
 * Arguments as consumed below:
 *   x0 = out, x1 = in, x2 = rk1 (data key), w3 = rounds, w4 = blocks,
 *   x5 = rk2 (tweak key), x6 = iv, w7 = first
 *
 * Register roles inside the routine:
 *   v4 = current tweak; on every exit path it is the last tweak used
 *   v7 = multiplier constant loaded from .Lxts_mul_x
 *   v8 = scratch for next_tweak
 *   x6/w7 = handed to the imported macros once the iv has been read and
 *           'first' tested (presumably scratch -- confirm)
 *
 * First call: the iv is *encrypted* with rk2 to form the initial tweak
 * (enc_prepare / encrypt_block), and only then is the decryption key set up
 * from rk1 with dec_prepare. Later calls: v4 is taken as found on entry and
 * advanced once before use (this relies on the caller keeping NEON state
 * between calls -- confirm).
 *
 * The single-block loop is bottom-tested, so without INTERLEAVE 'blocks'
 * must be non-zero.
 */
463AES_ENTRY(aes_xts_decrypt)
464	FRAME_PUSH
465	cbz		w7, .LxtsdecloopNx	/* first == 0: continue from the tweak in v4 */
466
467	ld1		{v4.16b}, [x6]
468	enc_prepare	w3, x5, x6
469	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
470	dec_prepare	w3, x2, x6
471	ldr		q7, .Lxts_mul_x
472	b		.LxtsdecNx
473
474.LxtsdecloopNx:
475	ldr		q7, .Lxts_mul_x		/* (re)load: the 4x path below overwrites v7 */
476	next_tweak	v4, v4, v7, v8
477.LxtsdecNx:
478#if INTERLEAVE >= 2
479	subs		w4, w4, #INTERLEAVE
480	bmi		.Lxtsdec1x		/* fewer than INTERLEAVE blocks left */
481#if INTERLEAVE == 2
482	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
483	next_tweak	v5, v4, v7, v8
484	eor		v0.16b, v0.16b, v4.16b	/* xor with tweak before ... */
485	eor		v1.16b, v1.16b, v5.16b
486	do_decrypt_block2x
487	eor		v0.16b, v0.16b, v4.16b	/* ... and after the block cipher */
488	eor		v1.16b, v1.16b, v5.16b
489	st1		{v0.16b-v1.16b}, [x0], #32
490	cbz		w4, .LxtsdecoutNx
491	next_tweak	v4, v5, v7, v8
492	b		.LxtsdecNx
493.LxtsdecoutNx:
494	mov		v4.16b, v5.16b		/* leave the last tweak used in v4 */
495	b		.Lxtsdecout
496#else
497	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
498	next_tweak	v5, v4, v7, v8
499	eor		v0.16b, v0.16b, v4.16b
500	next_tweak	v6, v5, v7, v8
501	eor		v1.16b, v1.16b, v5.16b
502	eor		v2.16b, v2.16b, v6.16b
503	next_tweak	v7, v6, v7, v8		/* 4th tweak replaces the constant in v7 */
504	eor		v3.16b, v3.16b, v7.16b
505	do_decrypt_block4x
506	eor		v3.16b, v3.16b, v7.16b
507	eor		v0.16b, v0.16b, v4.16b
508	eor		v1.16b, v1.16b, v5.16b
509	eor		v2.16b, v2.16b, v6.16b
510	st1		{v0.16b-v3.16b}, [x0], #64
511	mov		v4.16b, v7.16b		/* v4 = last tweak used */
512	cbz		w4, .Lxtsdecout
513	b		.LxtsdecloopNx		/* reloads v7 and advances v4 */
514#endif
515.Lxtsdec1x:
516	adds		w4, w4, #INTERLEAVE	/* undo the speculative subtract */
517	beq		.Lxtsdecout
518#endif
519.Lxtsdecloop:
520	ld1		{v1.16b}, [x1], #16
521	eor		v0.16b, v1.16b, v4.16b
522	decrypt_block	v0, w3, x2, x6, w7
523	eor		v0.16b, v0.16b, v4.16b
524	st1		{v0.16b}, [x0], #16
525	subs		w4, w4, #1
526	beq		.Lxtsdecout		/* exit without advancing the tweak */
527	next_tweak	v4, v4, v7, v8
528	b		.Lxtsdecloop
529.Lxtsdecout:
530	FRAME_POP
531	ret
532AES_ENDPROC(aes_xts_decrypt)
533