xref: /openbmc/linux/arch/arm64/crypto/aes-modes.S (revision b8d312aa)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */
9
10	.text
11	.align		4
12
13#ifndef MAX_STRIDE
14#define MAX_STRIDE	4
15#endif
16
17#if MAX_STRIDE == 4
18#define ST4(x...) x
19#define ST5(x...)
20#else
21#define ST4(x...)
22#define ST5(x...) x
23#endif
24
/*
 * aes_encrypt_block4x - out-of-line copy of the encrypt_block4x macro
 *
 * Encrypts the four blocks held in v0-v3 in place.  Wrapping the macro in a
 * subroutine lets every mode below share one expansion via 'bl'.
 *
 * Going by the callers in this file, w3 holds the round count and x2 the
 * round key pointer; x8 and w7 are handed to the macro as its remaining
 * operands (presumably scratch registers - the macro itself is supplied by
 * the including file).
 */
aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
ENDPROC(aes_encrypt_block4x)
29
/*
 * aes_decrypt_block4x - out-of-line copy of the decrypt_block4x macro
 *
 * Decrypts the four blocks held in v0-v3 in place, so that callers can
 * reach a single shared expansion with 'bl'.
 *
 * Going by the callers in this file, w3 holds the round count and x2 the
 * round key pointer; x8 and w7 are handed to the macro as its remaining
 * operands (presumably scratch registers - the macro itself is supplied by
 * the including file).
 */
aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
ENDPROC(aes_decrypt_block4x)
34
#if MAX_STRIDE == 5
/*
 * Five-way helpers, assembled only for the 5-way build.  Each one runs the
 * corresponding *_block5x macro in place on the blocks in v0-v4, with w3,
 * x2, x8 and w7 passed as the macro's remaining operands (round count, round
 * key pointer and, presumably, two scratch registers).
 */
aes_encrypt_block5x:
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
ENDPROC(aes_encrypt_block5x)

aes_decrypt_block5x:
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
ENDPROC(aes_decrypt_block5x)
#endif
46
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
53
/*
 * aes_ecb_encrypt - ECB-encrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0      out, advanced as ciphertext is stored
 *   x1      in, advanced as plaintext is loaded
 *   x2      rk
 *   w3      rounds
 *   w4      blocks still to be processed
 *   x5, w6  extra operands of enc_prepare / encrypt_block (presumably
 *           scratch registers)
 *
 * Blocks are handled MAX_STRIDE at a time for as long as possible, then one
 * at a time.
 */
AES_ENTRY(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	enc_prepare	w3, x2, x5

	/* bulk path: MAX_STRIDE blocks per iteration */
.Lecbenc_bulk:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc_rest		/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* pt blocks 0-3 */
ST5(	ld1		{v4.16b}, [x1], #16		)
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.Lecbenc_bulk

	/* remainder: 0 .. MAX_STRIDE-1 blocks, one per iteration */
.Lecbenc_rest:
	adds		w4, w4, #MAX_STRIDE	/* undo the last subtraction */
	beq		.Lecbenc_done
.Lecbenc_single:
	ld1		{v0.16b}, [x1], #16		/* next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbenc_single

.Lecbenc_done:
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_ecb_encrypt)
83
84
/*
 * aes_ecb_decrypt - ECB-decrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0      out, advanced as plaintext is stored
 *   x1      in, advanced as ciphertext is loaded
 *   x2      rk
 *   w3      rounds
 *   w4      blocks still to be processed
 *   x5, w6  extra operands of dec_prepare / decrypt_block (presumably
 *           scratch registers)
 *
 * Blocks are handled MAX_STRIDE at a time for as long as possible, then one
 * at a time.
 */
AES_ENTRY(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	dec_prepare	w3, x2, x5

	/* bulk path: MAX_STRIDE blocks per iteration */
.Lecbdec_bulk:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec_rest		/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* ct blocks 0-3 */
ST5(	ld1		{v4.16b}, [x1], #16		)
ST4(	bl		aes_decrypt_block4x		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.Lecbdec_bulk

	/* remainder: 0 .. MAX_STRIDE-1 blocks, one per iteration */
.Lecbdec_rest:
	adds		w4, w4, #MAX_STRIDE	/* undo the last subtraction */
	beq		.Lecbdec_done
.Lecbdec_single:
	ld1		{v0.16b}, [x1], #16		/* next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdec_single

.Lecbdec_done:
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_ecb_decrypt)
114
115
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
122
/*
 * aes_cbc_encrypt - CBC-encrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0      out            x1      in
 *   x2      rk             w3      rounds
 *   w4      blocks left    x5      iv (read on entry, rewritten on exit)
 *   v4      chaining value: the iv at first, then the latest ciphertext
 *   x6, w7  extra operands of enc_prepare / encrypt_block (presumably
 *           scratch registers)
 *
 * Every block depends on the previous ciphertext, so the four-block loop
 * still encrypts one block after another; it only batches the loads and
 * stores.  No 'bl' is issued, hence no stack frame.
 */
AES_ENTRY(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* v4 = iv */
	enc_prepare	w3, x2, x6

.Lcbcenc_quad:
	subs		w4, w4, #4
	bmi		.Lcbcenc_rest			/* fewer than 4 left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* pt blocks 0-3 */
	eor		v0.16b, v0.16b, v4.16b		/* pt0 ^ chaining value */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b		/* pt1 ^ ct0 */
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b		/* pt2 ^ ct1 */
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b		/* pt3 ^ ct2 */
	encrypt_block	v3, w3, x2, x6, w7
	mov		v4.16b, v3.16b			/* ct3 chains onwards */
	st1		{v0.16b-v3.16b}, [x0], #64
	b		.Lcbcenc_quad

.Lcbcenc_rest:
	adds		w4, w4, #4			/* undo the last subtraction */
	beq		.Lcbcenc_done
.Lcbcenc_single:
	ld1		{v0.16b}, [x1], #16		/* next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* fold into chaining value */
	encrypt_block	v4, w3, x2, x6, w7		/* v4 = ct = next chaining value */
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcenc_single

.Lcbcenc_done:
	st1		{v4.16b}, [x5]			/* hand the iv back */
	ret
AES_ENDPROC(aes_cbc_encrypt)
156
157
/*
 * aes_cbc_decrypt - CBC-decrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0      out            x1      in
 *   x2      rk             w3      rounds
 *   w4      blocks left    x5      iv (read on entry, rewritten on exit)
 *   cbciv   chaining value (register alias supplied by the including file)
 *   x6, w7  extra operands of dec_prepare / decrypt_block (presumably
 *           scratch registers)
 *
 * Unlike encryption, the block cipher calls are independent of each other,
 * so MAX_STRIDE blocks are decrypted together.  Copies of the leading
 * ciphertext blocks are kept in spare registers for the xor step; the
 * trailing one(s) are re-read from the input buffer afterwards.
 */
AES_ENTRY(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* chaining value = iv */
	dec_prepare	w3, x2, x6

.Lcbcdec_bulk:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec_rest			/* fewer than MAX_STRIDE left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* ct blocks 0-3 */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* ct block 4 */
	mov		v5.16b, v0.16b			/* keep ct0-ct2 for the xor */
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	eor		v0.16b, v0.16b, cbciv.16b	/* pt0 = D(ct0) ^ iv */
	eor		v1.16b, v1.16b, v5.16b		/* pt1 = D(ct1) ^ ct0 */
	sub		x1, x1, #32			/* rewind to ct3 */
	ld1		{v5.16b}, [x1], #16		/* re-read ct3 */
	ld1		{cbciv.16b}, [x1], #16		/* ct4 is the next iv */
	eor		v2.16b, v2.16b, v6.16b		/* pt2 = D(ct2) ^ ct1 */
	eor		v3.16b, v3.16b, v7.16b		/* pt3 = D(ct3) ^ ct2 */
	eor		v4.16b, v4.16b, v5.16b		/* pt4 = D(ct4) ^ ct3 */
#else
	mov		v4.16b, v0.16b			/* keep ct0-ct2 for the xor */
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	eor		v0.16b, v0.16b, cbciv.16b	/* pt0 = D(ct0) ^ iv */
	eor		v1.16b, v1.16b, v4.16b		/* pt1 = D(ct1) ^ ct0 */
	sub		x1, x1, #16			/* rewind to ct3 */
	ld1		{cbciv.16b}, [x1], #16		/* ct3 is the next iv */
	eor		v2.16b, v2.16b, v5.16b		/* pt2 = D(ct2) ^ ct1 */
	eor		v3.16b, v3.16b, v6.16b		/* pt3 = D(ct3) ^ ct2 */
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.Lcbcdec_bulk

.Lcbcdec_rest:
	adds		w4, w4, #MAX_STRIDE		/* undo the last subtraction */
	beq		.Lcbcdec_done
.Lcbcdec_single:
	ld1		{v1.16b}, [x1], #16		/* next ct block ... */
	mov		v0.16b, v1.16b			/* ... decrypted in v0, kept in v1 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* pt = D(ct) ^ chaining value */
	st1		{v0.16b}, [x0], #16
	mov		cbciv.16b, v1.16b		/* this ct chains onwards */
	subs		w4, w4, #1
	bne		.Lcbcdec_single

.Lcbcdec_done:
	st1		{cbciv.16b}, [x5]		/* hand the iv back */
	ldp		x29, x30, [sp], #16
	ret
AES_ENDPROC(aes_cbc_decrypt)
215
216
	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
223
/*
 * aes_cbc_cts_encrypt - encrypt the last two blocks of a CBC message with
 *			 ciphertext stealing
 *
 * Register usage:
 *   x0  out            x1  in
 *   x2  rk             w3  rounds
 *   w4  bytes          x5  iv (read only)
 *   x6, w7  extra operands of enc_prepare / encrypt_block (presumably
 *           scratch registers)
 *
 * Exactly one full block plus a final block of n = bytes - 16 bytes is
 * processed, using overlapping 16-byte loads and stores, so this is meant
 * for 16 < bytes <= 32 (TODO confirm against the callers).
 *
 * v3 and v4 are TBL index vectors taken from .Lcts_permute_table:
 *   v4 (table + 32 - n) moves the last n bytes of a register to the front
 *                       and zeroes the rest;
 *   v3 (table + n)      moves the first n bytes of a register to the end
 *                       and zeroes the rest.
 *
 * Output layout: out[0..15] holds the encryption of the zero-padded final
 * block xor'ed with the first ciphertext block, followed by the first n
 * bytes of that first ciphertext block.
 */
AES_ENTRY(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	/*
	 * 'bytes' is a 32-bit int, so bits [63:32] of x4 are not defined by
	 * the procedure call standard.  Extend it before using it as a
	 * 64-bit address offset.
	 */
	sxtw		x4, w4
	sub		x4, x4, #16			/* x4 = n */
	add		x9, x8, #32
	add		x8, x8, x4			/* table + n */
	sub		x9, x9, x4			/* table + 32 - n */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]			/* last 16 bytes of input */

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b	/* final block, zero padded */
	encrypt_block	v0, w3, x2, x6, w7		/* v0 = first ct block */

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b	/* first n ct bytes -> end of v0 */
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4			/* out + n */
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]			/* overwrites the zeroed part */
	ret
AES_ENDPROC(aes_cbc_cts_encrypt)
252
/*
 * aes_cbc_cts_decrypt - decrypt the last two blocks of a CBC message that
 *			 was encrypted with ciphertext stealing
 *
 * Register usage:
 *   x0  out            x1  in
 *   x2  rk             w3  rounds
 *   w4  bytes          x5  iv (read only)
 *   x6, w7  extra operands of dec_prepare / decrypt_block (presumably
 *           scratch registers)
 *
 * The input is one full block followed by n = bytes - 16 bytes, so this is
 * meant for 16 < bytes <= 32 (TODO confirm against the callers).  Loads and
 * stores are overlapping 16-byte accesses.
 *
 * v3 and v4 are index vectors taken from .Lcts_permute_table:
 *   v4 (table + 32 - n) selects the last n bytes of a register into the
 *                       first n lanes; the other lanes are out of range
 *                       (zeroed by TBL, left untouched by TBX);
 *   v3 (table + n)      moves the first n bytes of a register to the end
 *                       and zeroes the rest.
 */
AES_ENTRY(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	/*
	 * 'bytes' is a 32-bit int, so bits [63:32] of x4 are not defined by
	 * the procedure call standard.  Extend it before using it as a
	 * 64-bit address offset.
	 */
	sxtw		x4, w4
	sub		x4, x4, #16			/* x4 = n */
	add		x9, x8, #32
	add		x8, x8, x4			/* table + n */
	sub		x9, x9, x4			/* table + 32 - n */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]			/* last 16 bytes of input */

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	tbl		v2.16b, {v1.16b}, v4.16b	/* n trailing ct bytes, zero padded */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v0.16b		/* first n bytes: final plaintext */

	tbx		v0.16b, {v1.16b}, v4.16b	/* splice the trailing ct bytes back in */
	tbl		v2.16b, {v2.16b}, v3.16b	/* final plaintext -> end of v2 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4			/* out + n */
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]			/* overwrites the zeroed part */
	ret
AES_ENDPROC(aes_cbc_cts_decrypt)
282
283	.section	".rodata", "a"
284	.align		6
285.Lcts_permute_table:
286	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
287	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
288	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
289	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
290	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
291	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
292	.previous
293
294
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
299
/*
 * aes_ctr_encrypt - CTR mode en/decryption
 *
 * Register usage:
 *   x0    out            x1    in
 *   x2    rk             w3    rounds
 *   w4    blocks left    x5    ctr (read on entry, rewritten on exit)
 *   vctr  counter block in memory byte order (register alias supplied by
 *         the including file)
 *   x6    trailing 8 bytes of the counter block, byte-reversed so that
 *         they can be incremented as an integer
 *
 * Two paths:
 *   .LctrloopNx  MAX_STRIDE blocks per iteration.  Only the last 32-bit
 *                word of each counter copy is patched, so this path is
 *                used only when the low 32 counter bits plus 'blocks' do
 *                not carry (the cmn/bcs test below).
 *   .Lctrloop    one block per iteration, with the carry propagated into
 *                the leading 8 counter bytes by .Lctrcarry.
 *
 * When the block count goes negative in .Lctrloop, the keystream block is
 * stored to out as is instead of being xor'ed with input
 * (.Lctrtailblock); presumably callers use this for a trailing partial
 * block.
 */
AES_ENTRY(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	enc_prepare	w3, x2, x6
	ld1		{vctr.16b}, [x5]

	umov		x6, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x6, x6
	cmn		w6, w4			/* 32 bit overflow? */
	bcs		.Lctrloop		/* yes: take the carry-aware path */
.LctrloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lctr1x
	/* v0..v3 (and v4) = ctr, ctr + 1, ... ; integer and vector ops interleaved */
	add		w7, w6, #1
	mov		v0.16b, vctr.16b
	add		w8, w6, #2
	mov		v1.16b, vctr.16b
	add		w9, w6, #3
	mov		v2.16b, vctr.16b
	rev		w7, w7
	mov		v3.16b, vctr.16b
	rev		w8, w8
ST5(	mov		v4.16b, vctr.16b		)
	mov		v1.s[3], w7
	rev		w9, w9
ST5(	add		w10, w6, #4			)
	mov		v2.s[3], w8
ST5(	rev		w10, w10			)
	mov		v3.s[3], w9
ST5(	mov		v4.s[3], w10			)
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)	/* 4th input block */
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)	/* 4th and 5th input blocks */
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	add		x6, x6, #MAX_STRIDE	/* advance the counter ... */
	rev		x7, x6
	ins		vctr.d[1], x7		/* ... and write it back into vctr */
	cbz		w4, .Lctrout
	b		.LctrloopNx
.Lctr1x:
	adds		w4, w4, #MAX_STRIDE	/* undo the last subtraction */
	beq		.Lctrout
.Lctrloop:
	mov		v0.16b, vctr.16b
	encrypt_block	v0, w3, x2, x8, w7	/* v0 = keystream block */

	adds		x6, x6, #1		/* increment BE ctr */
	rev		x7, x6
	ins		vctr.d[1], x7
	bcs		.Lctrcarry		/* overflow? */

.Lctrcarrydone:
	subs		w4, w4, #1
	bmi		.Lctrtailblock		/* blocks <0 means tail block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	bne		.Lctrloop		/* flags still from the subs above */

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtailblock:
	st1		{v0.16b}, [x0]		/* raw keystream, no input consumed */
	b		.Lctrout

.Lctrcarry:
	umov		x7, vctr.d[0]		/* load upper word of ctr  */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		vctr.d[0], x7
	b		.Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
386
387
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 */
394
395	.macro		next_tweak, out, in, tmp
396	sshr		\tmp\().2d,  \in\().2d,   #63
397	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
398	add		\out\().2d,  \in\().2d,   \in\().2d
399	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
400	eor		\out\().16b, \out\().16b, \tmp\().16b
401	.endm
402
403	.macro		xts_load_mask, tmp
404	movi		xtsmask.2s, #0x1
405	movi		\tmp\().2s, #0x87
406	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
407	.endm
408
/*
 * aes_xts_encrypt - XTS-encrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0  out            x1  in
 *   x2  rk1 (data key) w3  rounds
 *   w4  blocks left    x5  rk2 (tweak key)
 *   x6  iv (read on entry, rewritten on exit)
 *   w7  first; reused as a macro operand once it has been tested
 *   v4  current tweak, v5-v7 the following three tweaks
 *   v8  temporary for the tweak and mask macros
 *   x8  extra operand of the key setup / block macros (presumably scratch)
 *
 * With first != 0 the iv is encrypted under rk2 to form the initial tweak.
 * Otherwise [x6] is taken to hold the last tweak used by the previous
 * call - which is what this routine stores on exit - and is advanced by
 * one step before use.
 */
AES_ENTRY(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsenc_resume

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.Lxtsenc_quad

.Lxtsenc_resume:
	enc_prepare	w3, x2, x8
.Lxtsenc_advance:
	next_tweak	v4, v4, v8
.Lxtsenc_quad:
	subs		w4, w4, #4
	bmi		.Lxtsenc_rest			/* fewer than 4 left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* pt blocks 0-3 */
	next_tweak	v5, v4, v8
	next_tweak	v6, v5, v8
	next_tweak	v7, v6, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten with the tweaks */
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten with the same tweaks */
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used so far */
	cbz		w4, .Lxtsenc_done
	xts_reload_mask	v8
	b		.Lxtsenc_advance

.Lxtsenc_rest:
	adds		w4, w4, #4			/* undo the last subtraction */
	beq		.Lxtsenc_done
.Lxtsenc_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lxtsenc_step			/* more to do: advance the tweak */

.Lxtsenc_done:
	st1		{v4.16b}, [x6]			/* hand back the last tweak used */
	ldp		x29, x30, [sp], #16
	ret

.Lxtsenc_step:
	next_tweak	v4, v4, v8
	b		.Lxtsenc_single
AES_ENDPROC(aes_xts_encrypt)
465
466
/*
 * aes_xts_decrypt - XTS-decrypt a run of whole 16-byte blocks
 *
 * Register usage:
 *   x0  out            x1  in
 *   x2  rk1 (data key) w3  rounds
 *   w4  blocks left    x5  rk2 (tweak key)
 *   x6  iv (read on entry, rewritten on exit)
 *   w7  first; reused as a macro operand once it has been tested
 *   v4  current tweak, v5-v7 the following three tweaks
 *   v8  temporary for the tweak and mask macros
 *   x8  extra operand of the key setup / block macros (presumably scratch)
 *
 * With first != 0 the iv is *encrypted* under rk2 to form the initial
 * tweak; only the data blocks go through the decryption direction.
 * Otherwise [x6] is taken to hold the last tweak used by the previous
 * call - which is what this routine stores on exit - and is advanced by
 * one step before use.
 */
AES_ENTRY(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!	/* x30 is clobbered by 'bl' below */
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsdec_resume

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	dec_prepare	w3, x2, x8
	b		.Lxtsdec_quad

.Lxtsdec_resume:
	dec_prepare	w3, x2, x8
.Lxtsdec_advance:
	next_tweak	v4, v4, v8
.Lxtsdec_quad:
	subs		w4, w4, #4
	bmi		.Lxtsdec_rest			/* fewer than 4 left */
	ld1		{v0.16b-v3.16b}, [x1], #64	/* ct blocks 0-3 */
	next_tweak	v5, v4, v8
	next_tweak	v6, v5, v8
	next_tweak	v7, v6, v8
	eor		v0.16b, v0.16b, v4.16b		/* pre-whiten with the tweaks */
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v0.16b, v0.16b, v4.16b		/* post-whiten with the same tweaks */
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b			/* last tweak used so far */
	cbz		w4, .Lxtsdec_done
	xts_reload_mask	v8
	b		.Lxtsdec_advance

.Lxtsdec_rest:
	adds		w4, w4, #4			/* undo the last subtraction */
	beq		.Lxtsdec_done
.Lxtsdec_single:
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lxtsdec_step			/* more to do: advance the tweak */

.Lxtsdec_done:
	st1		{v4.16b}, [x6]			/* hand back the last tweak used */
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdec_step:
	next_tweak	v4, v4, v8
	b		.Lxtsdec_single
AES_ENDPROC(aes_xts_decrypt)
523
	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
/*
 * aes_mac_update - fold whole input blocks into a CBC-MAC style digest
 *
 * The arguments are copied into x19-x24 straight after frame_push, which
 * presumably saves those registers (the macro is defined outside this
 * file), so that they remain available after a NEON yield:
 *   x19  in             x20  rk
 *   w21  rounds         w22  blocks left
 *   x23  dg             x24  enc_after
 *   w5   enc_before, tested once; x5 is then reused as a temporary
 *   v0   running digest
 *   x7, w8  extra operands of enc_prepare / encrypt_block (presumably
 *           scratch registers)
 *
 * Every input block is xor'ed into the digest and the digest is then
 * encrypted - except after the very last block, where the encryption is
 * performed only if enc_after is non-zero.  If enc_before is non-zero the
 * incoming digest is encrypted once before any input is absorbed.
 */
AES_ENTRY(aes_mac_update)
	frame_push	6

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x6

	ld1		{v0.16b}, [x23]			/* get dg */
	enc_prepare	w21, x20, x7
	cbz		w5, .Lmac_quad			/* enc_before == 0 */

	encrypt_block	v0, w21, x20, x7, w8

.Lmac_quad:
	subs		w22, w22, #4
	bmi		.Lmac_rest			/* fewer than 4 left */
	ld1		{v1.16b-v4.16b}, [x19], #64	/* next 4 input blocks */
	eor		v0.16b, v0.16b, v1.16b		/* absorb block 0 */
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v2.16b		/* absorb block 1 */
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v3.16b		/* absorb block 2 */
	encrypt_block	v0, w21, x20, x7, w8
	eor		v0.16b, v0.16b, v4.16b		/* absorb block 3 */
	cmp		w22, wzr
	csinv		x5, x24, xzr, eq		/* last block ? enc_after : all-ones */
	cbz		w5, .Lmac_done			/* last block and no enc_after */
	encrypt_block	v0, w21, x20, x7, w8
	st1		{v0.16b}, [x23]			/* dg must be in memory before a yield */
	cond_yield_neon	.Lmac_resume
	b		.Lmac_quad

.Lmac_rest:
	add		w22, w22, #4			/* undo the last subtraction */
.Lmac_single:
	cbz		w22, .Lmac_done
	ld1		{v1.16b}, [x19], #16		/* next input block */
	eor		v0.16b, v0.16b, v1.16b		/* absorb it */

	subs		w22, w22, #1
	csinv		x5, x24, xzr, eq		/* last block ? enc_after : all-ones */
	cbz		w5, .Lmac_done			/* last block and no enc_after */

	encrypt_block	v0, w21, x20, x7, w8
	b		.Lmac_single

.Lmac_done:
	st1		{v0.16b}, [x23]			/* return dg */
	frame_pop
	ret

	/*
	 * Branch target handed to cond_yield_neon; presumably reached after
	 * the NEON unit was given up.  The digest is reloaded from memory
	 * and the key setup is repeated before carrying on.
	 */
.Lmac_resume:
	ld1		{v0.16b}, [x23]			/* get dg */
	enc_prepare	w21, x20, x0
	b		.Lmac_quad
AES_ENDPROC(aes_mac_update)
587