xref: /openbmc/linux/arch/arm64/crypto/aes-modes.S (revision 4f6cce39)
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13	.text
14	.align		4
15
/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare	- setup NEON registers for encryption
 * - dec_prepare	- setup NEON registers for decryption
 * - enc_switch_key	- change to new key after having prepared for encryption
 * - encrypt_block	- encrypt a single block
 * - decrypt_block	- decrypt a single block
 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */
35
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)

/*
 * Out-of-line interleave: the N-way cipher body is emitted once as a local
 * subroutine and the do_*_blockNx macros reach it with 'bl'. Because 'bl'
 * overwrites x30, the mode routines bracket their bodies with FRAME_PUSH and
 * FRAME_POP, which save and restore the x29/x30 pair on the stack.
 */
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

/*
 * Operand convention shared by all Nx helpers: the data blocks live in
 * v0-v1 (2x) or v0-v3 (4x), w3 is the round count and x2 the round key
 * pointer. x6 and w7 are handed to the imported macros as well - presumably
 * as scratch registers; confirm against the including file.
 */
#if INTERLEAVE == 4

aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#elif INTERLEAVE == 2

aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

	/* call the shared out-of-line bodies */
	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else

/*
 * Inline (or no) interleave: the cipher bodies are expanded at each use,
 * no 'bl' is issued, so no frame record is needed.
 */
#define FRAME_PUSH
#define FRAME_POP

	/* expand the imported Nx macros in place */
	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#endif
105
/*
 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, int first)
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, int first)
 *
 * aes_ecb_encrypt: encrypt 'blocks' 16-byte blocks from in[] to out[],
 * each block independently.
 *
 *   x0 - out    x1 - in    x2 - rk    w3 - rounds    w4 - blocks
 *   w5 - first: non-zero runs enc_prepare; zero skips it (presumably the
 *	    NEON state set up by an earlier call is still live)
 *
 * x5 and w6 are passed to the imported macros after 'first' has been
 * consumed - presumably as scratch registers.
 *
 * A block count of zero returns without touching in[] or out[] in every
 * build flavour, including the non-interleaved one.
 */

AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	cbz		w5, .Lecb_enc_bulk	/* first == 0: skip key setup */

	enc_prepare	w3, x2, x5

.Lecb_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lecb_enc_rest
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lecb_enc_bulk
.Lecb_enc_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lecb_enc_done		/* no leftover blocks */
#else
	cbz		w4, .Lecb_enc_done	/* do not wrap on 0 blocks */
#endif
.Lecb_enc_single:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecb_enc_single
.Lecb_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, int first)
 *
 * Decrypt 'blocks' 16-byte blocks from in[] to out[], each block
 * independently.
 *
 *   x0 - out    x1 - in    x2 - rk    w3 - rounds    w4 - blocks
 *   w5 - first: non-zero runs dec_prepare; zero skips it (presumably the
 *	    NEON state set up by an earlier call is still live)
 *
 * A block count of zero returns without touching in[] or out[] in every
 * build flavour, including the non-interleaved one.
 */
AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .Lecb_dec_bulk	/* first == 0: skip key setup */

	dec_prepare	w3, x2, x5

.Lecb_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lecb_dec_rest
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lecb_dec_bulk
.Lecb_dec_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lecb_dec_done		/* no leftover blocks */
#else
	cbz		w4, .Lecb_dec_done	/* do not wrap on 0 blocks */
#endif
.Lecb_dec_single:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lecb_dec_single
.Lecb_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)
183
184
/*
 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[], int first)
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[], int first)
 *
 * aes_cbc_encrypt: CBC-encrypt 'blocks' 16-byte blocks from in[] to out[].
 *
 *   x0 - out    x1 - in    x2 - rk    w3 - rounds    w4 - blocks
 *   x5 - iv     w6 - first: non-zero loads the IV into v0 and runs
 *		 enc_prepare; zero skips both, so v0 must still hold the
 *		 chaining value left by an earlier call
 *
 * v0 is the chaining value throughout: each plaintext block is xored into
 * it, it is encrypted in place, stored as ciphertext, and finally written
 * back to iv[]. CBC encryption is inherently serial, so there is no
 * interleaved path here.
 *
 * A block count of zero skips the loop and only writes v0 back to iv[].
 */

AES_ENTRY(aes_cbc_encrypt)
	cbz		w6, .Lcbc_enc_start	/* first == 0: keep v0 + key */

	ld1		{v0.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbc_enc_start:
	cbz		w4, .Lcbc_enc_done	/* do not wrap on 0 blocks */
.Lcbc_enc_loop:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbc_enc_loop
.Lcbc_enc_done:
	st1		{v0.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_encrypt)
208
209
/*
 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 iv[], int first)
 *
 * CBC-decrypt 'blocks' 16-byte blocks from in[] to out[].
 *
 *   x0 - out    x1 - in    x2 - rk    w3 - rounds    w4 - blocks
 *   x5 - iv     w6 - first: non-zero loads the IV into v7 and runs
 *		 dec_prepare; zero skips both, so v7 must still hold the
 *		 chaining value left by an earlier call
 *
 * v7 always holds the previous ciphertext block (initially the IV) and is
 * written back to iv[] on exit.
 *
 * A block count of zero only writes v7 back to iv[] in every build
 * flavour, including the non-interleaved one.
 */
AES_ENTRY(aes_cbc_decrypt)
	FRAME_PUSH
	cbz		w6, .Lcbc_dec_bulk	/* first == 0: keep v7 + key */

	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

.Lcbc_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lcbc_dec_rest
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b			/* keep ct copies for */
	mov		v3.16b, v1.16b			/* the chaining xor   */
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v2.16b
	mov		v7.16b, v3.16b			/* last ct is next iv */
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b			/* keep copies of the */
	mov		v5.16b, v1.16b			/* first 3 ct blocks  */
	mov		v6.16b, v2.16b
	do_decrypt_block4x
	sub		x1, x1, #16			/* step back to ct #4 */
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.Lcbc_dec_bulk
.Lcbc_dec_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lcbc_dec_done		/* no leftover blocks */
#else
	cbz		w4, .Lcbc_dec_done	/* do not wrap on 0 blocks */
#endif
.Lcbc_dec_single:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
	mov		v7.16b, v1.16b			/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.ne		.Lcbc_dec_single
.Lcbc_dec_done:
	FRAME_POP
	st1		{v7.16b}, [x5]			/* return iv */
	ret
AES_ENDPROC(aes_cbc_decrypt)
263
264
/*
 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		   int blocks, u8 ctr[], int first)
 *
 *   x0 - out    x1 - in    x2 - rk    w3 - rounds    w4 - blocks
 *   x5 - ctr    w6 - first: non-zero runs enc_prepare and loads the
 *		 counter block into v4; zero skips both, so v4 must still
 *		 hold the counter left by an earlier call
 *
 * v4 holds the counter block. x8 mirrors bytes 8..15 of it, byte-reversed,
 * so that the big-endian counter can be advanced with integer adds.
 *
 * The interleaved paths only add to the low 32 bits (4x) or do not
 * propagate a carry into the upper half (2x). The 'cmn' check therefore
 * sends the whole request down the single-block path whenever the low
 * 32 bits of the counter plus the block count would carry.
 *
 * blocks < 0 requests a tail block: one keystream block is written to
 * out[] without reading in[], and the counter is not stored back.
 * blocks == 0 only writes the unchanged counter back to ctr[], in every
 * build flavour, including the non-interleaved one.
 */

AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	cbz		w6, .Lctr_resume	/* 1st time around? */
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]

.Lctr_resume:
	umov		x8, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x8, x8
#if INTERLEAVE >= 2
	cmn		w8, w4			/* 32 bit overflow? */
	b.cs		.Lctr_single
.Lctr_bulk:
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lctr_rest
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b		/* upper ctr half is shared */
	mov		v1.8b, v4.8b
	rev		x7, x8
	add		x8, x8, #1
	ins		v0.d[1], x7		/* block 0: ctr     */
	rev		x7, x8
	add		x8, x8, #1
	ins		v1.d[1], x7		/* block 1: ctr + 1 */
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w8
	mov		v0.16b, v4.16b		/* block 0: ctr */
	add		v7.4s, v7.4s, v8.4s	/* ctr+1, ctr+2, ctr+3, ctr */
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b		/* back to big-endian words */
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]	/* block 1: ctr + 1 */
	mov		v2.s[3], v8.s[1]	/* block 2: ctr + 2 */
	mov		v3.s[3], v8.s[2]	/* block 3: ctr + 3 */
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b
	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x8, x8, #INTERLEAVE
#endif
	rev		x7, x8			/* refresh v4 from x8 */
	ins		v4.d[1], x7
	cbz		w4, .Lctr_done
	b		.Lctr_bulk
.Lctr_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lctr_done		/* no leftover blocks */
#else
	cbz		w4, .Lctr_done		/* 0 blocks is not a tail */
#endif
.Lctr_single:
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	adds		x8, x8, #1		/* increment BE ctr */
	rev		x7, x8			/* rev/ins leave flags alone */
	ins		v4.d[1], x7
	b.cs		.Lctr_carry		/* overflow? */

.Lctr_carry_done:
	subs		w4, w4, #1
	b.mi		.Lctr_tail		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	b.ne		.Lctr_single		/* flags still from subs */

.Lctr_done:
	st1		{v4.16b}, [x5]		/* return next CTR value */
	FRAME_POP
	ret

.Lctr_tail:
	st1		{v0.16b}, [x0]		/* hand back raw keystream */
	FRAME_POP
	ret

.Lctr_carry:
	umov		x7, v4.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctr_carry_done
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg
365
366
367	/*
368	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
369	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
370	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
371	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
372	 */
373
/*
 * next_tweak out, in, const, tmp
 *
 * Compute the following XTS tweak from the current one: the 128-bit value
 * in \in is shifted left by one bit, and the bit shifted out of the top is
 * folded back in as 0x87 (multiplication by x in GF(2^128)).
 *
 *   \const - must hold the .Lxts_mul_x pattern below
 *   \tmp   - clobbered
 *
 * \out may be the same register as \in or as \const: both are fully read
 * before \out is written, so the instruction order below must be kept.
 */
	.macro		next_tweak, out, in, const, tmp
	/* each 64-bit lane becomes all ones if its top bit is set */
	sshr		\tmp\().2d,  \in\().2d,   #63
	/* lane 0 -> 1 (carry into lane 1), lane 1 -> 0x87 (reduction) */
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	/* shift both lanes left by one bit */
	add		\out\().2d,  \in\().2d,   \in\().2d
	/* swap lanes so each correction lands in the lane it applies to */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * Per-lane correction values for next_tweak, loaded with 'ldr q7'.
	 * The two quads are swapped for big-endian builds - presumably so
	 * that the load yields the same lane layout there.
	 */
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)
385
/*
 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 *
 *   x0 - out    x1 - in    x2 - rk1    w3 - rounds    w4 - blocks
 *   x5 - rk2    x6 - iv    w7 - first
 *
 * When 'first' is non-zero the IV is encrypted with rk2 to form the first
 * tweak (v4), then the key is switched to rk1 for the data blocks. When it
 * is zero, v4 must still hold the last tweak used by the previous call; it
 * is advanced once before use.
 *
 * v7 holds the .Lxts_mul_x constant and v8 is scratch for next_tweak. The
 * 4-way path overwrites v7 with the fourth tweak, which is why it loops
 * back to the label that reloads the constant.
 *
 * A block count of zero processes no data in every build flavour,
 * including the non-interleaved one.
 */
AES_ENTRY(aes_xts_encrypt)
	FRAME_PUSH
	cbz		w7, .Lxts_enc_resume

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.Lxts_enc_bulk

.Lxts_enc_resume:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.Lxts_enc_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lxts_enc_rest
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_enc_bulk_last
	next_tweak	v4, v5, v7, v8
	b		.Lxts_enc_bulk
.Lxts_enc_bulk_last:
	mov		v4.16b, v5.16b		/* leave last used tweak in v4 */
	b		.Lxts_enc_done
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* v7: constant -> tweak #4 */
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* leave last used tweak in v4 */
	cbz		w4, .Lxts_enc_done
	b		.Lxts_enc_resume	/* reloads v7, advances v4 */
#endif
.Lxts_enc_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lxts_enc_done		/* no leftover blocks */
#else
	cbz		w4, .Lxts_enc_done	/* do not wrap on 0 blocks */
#endif
.Lxts_enc_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.eq		.Lxts_enc_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_enc_single
.Lxts_enc_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)
456
457
/*
 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 *
 *   x0 - out    x1 - in    x2 - rk1    w3 - rounds    w4 - blocks
 *   x5 - rk2    x6 - iv    w7 - first
 *
 * When 'first' is non-zero the IV is encrypted (not decrypted) with rk2 to
 * form the first tweak (v4), then dec_prepare sets up rk1 for the data
 * blocks. When it is zero, v4 must still hold the last tweak used by the
 * previous call; it is advanced once before use.
 *
 * v7 holds the .Lxts_mul_x constant and v8 is scratch for next_tweak. The
 * 4-way path overwrites v7 with the fourth tweak, which is why it loops
 * back to the label that reloads the constant.
 *
 * A block count of zero processes no data in every build flavour,
 * including the non-interleaved one.
 */
AES_ENTRY(aes_xts_decrypt)
	FRAME_PUSH
	cbz		w7, .Lxts_dec_resume

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.Lxts_dec_bulk

.Lxts_dec_resume:
	ldr		q7, .Lxts_mul_x
	next_tweak	v4, v4, v7, v8
.Lxts_dec_bulk:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE	/* is a full group left? */
	b.mi		.Lxts_dec_rest
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .Lxts_dec_bulk_last
	next_tweak	v4, v5, v7, v8
	b		.Lxts_dec_bulk
.Lxts_dec_bulk_last:
	mov		v4.16b, v5.16b		/* leave last used tweak in v4 */
	b		.Lxts_dec_done
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* v7: constant -> tweak #4 */
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* leave last used tweak in v4 */
	cbz		w4, .Lxts_dec_done
	b		.Lxts_dec_resume	/* reloads v7, advances v4 */
#endif
.Lxts_dec_rest:
	adds		w4, w4, #INTERLEAVE	/* undo the last subtraction */
	b.eq		.Lxts_dec_done		/* no leftover blocks */
#else
	cbz		w4, .Lxts_dec_done	/* do not wrap on 0 blocks */
#endif
.Lxts_dec_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	b.eq		.Lxts_dec_done
	next_tweak	v4, v4, v7, v8
	b		.Lxts_dec_single
.Lxts_dec_done:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)
528
/*
 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
 *		  int blocks, u8 dg[], int enc_before, int enc_after)
 *
 *   x0 - in    x1 - rk    w2 - rounds    w3 - blocks    x4 - dg
 *   w5 - enc_before    w6 - enc_after
 *
 * Folds 'blocks' 16-byte input blocks into the digest dg[] (kept in v0):
 * each block is xored into v0, then v0 is encrypted. The two flags control
 * the encryptions at the edges of the call:
 *   enc_before != 0 - encrypt the incoming digest before the first xor
 *   enc_after  == 0 - leave the digest unencrypted after the last xor
 * enc_prepare runs on every call; there is no 'first' argument.
 */
AES_ENTRY(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbnz		w5, .Lmac_encrypt	/* pending encryption first */

.Lmac_next:
	cbz		w3, .Lmac_done		/* input exhausted */
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	/* last block: x5 = enc_after; any other block: x5 = all ones */
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmac_done		/* last block, no enc_after */

.Lmac_encrypt:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmac_next

.Lmac_done:
	st1		{v0.16b}, [x4]			/* return dg */
	ret
AES_ENDPROC(aes_mac_update)
555