xref: /openbmc/linux/arch/arm64/crypto/aes-modes.S (revision 9fb29c73)
1/*
2 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3 *
4 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11/* included by aes-ce.S and aes-neon.S */
12
13	.text
14	.align		4
15
/*
 * Local helper: expands the encrypt_block4x macro on v0-v3 and returns.
 * Reached with "bl" from the 4-block paths of aes_ecb_encrypt,
 * aes_ctr_encrypt and aes_xts_encrypt; in all of those callers w3 holds
 * the rounds argument and x2 the round-key pointer.
 * NOTE(review): encrypt_block4x is defined by the including file
 * (aes-ce.S / aes-neon.S); x8 and w7 are presumably scratch operands -
 * confirm against the macro definition before relying on their values.
 */
16aes_encrypt_block4x:
17	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
18	ret
19ENDPROC(aes_encrypt_block4x)
20
/*
 * Local helper: expands the decrypt_block4x macro on v0-v3 and returns.
 * Reached with "bl" from the 4-block paths of aes_ecb_decrypt,
 * aes_cbc_decrypt and aes_xts_decrypt; in all of those callers w3 holds
 * the rounds argument and x2 the round-key pointer.
 * NOTE(review): decrypt_block4x is defined by the including file
 * (aes-ce.S / aes-neon.S); x8 and w7 are presumably scratch operands -
 * confirm against the macro definition before relying on their values.
 */
21aes_decrypt_block4x:
22	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
23	ret
24ENDPROC(aes_decrypt_block4x)
25
26	/*
27	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
28	 *		   int blocks)
29	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
30	 *		   int blocks)
31	 */
32
/*
 * ECB encryption of whole 16-byte blocks.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 * x5 and w6 are passed to the enc_prepare/encrypt_block macros as extra
 * operands (presumably scratch; the macros live in the including file).
 * Blocks are processed four at a time while at least four remain, then
 * one at a time.
 */
33AES_ENTRY(aes_ecb_encrypt)
34	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
35	mov		x29, sp
36
37	enc_prepare	w3, x2, x5
38
39.LecbencloopNx:
40	subs		w4, w4, #4			/* at least 4 blocks left? */
41	bmi		.Lecbenc1x			/* no: finish one block at a time */
42	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
43	bl		aes_encrypt_block4x
44	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 ct blocks */
45	b		.LecbencloopNx
46.Lecbenc1x:
47	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
48	beq		.Lecbencout			/* nothing left */
49.Lecbencloop:
50	ld1		{v0.16b}, [x1], #16		/* get next pt block */
51	encrypt_block	v0, w3, x2, x5, w6
52	st1		{v0.16b}, [x0], #16		/* put ct block */
53	subs		w4, w4, #1
54	bne		.Lecbencloop
55.Lecbencout:
56	ldp		x29, x30, [sp], #16
57	ret
58AES_ENDPROC(aes_ecb_encrypt)
59
60
/*
 * ECB decryption of whole 16-byte blocks.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 * x5 and w6 are passed to the dec_prepare/decrypt_block macros as extra
 * operands (presumably scratch; the macros live in the including file).
 * Blocks are processed four at a time while at least four remain, then
 * one at a time.
 */
61AES_ENTRY(aes_ecb_decrypt)
62	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
63	mov		x29, sp
64
65	dec_prepare	w3, x2, x5
66
67.LecbdecloopNx:
68	subs		w4, w4, #4			/* at least 4 blocks left? */
69	bmi		.Lecbdec1x			/* no: finish one block at a time */
70	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
71	bl		aes_decrypt_block4x
72	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 pt blocks */
73	b		.LecbdecloopNx
74.Lecbdec1x:
75	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
76	beq		.Lecbdecout			/* nothing left */
77.Lecbdecloop:
78	ld1		{v0.16b}, [x1], #16		/* get next ct block */
79	decrypt_block	v0, w3, x2, x5, w6
80	st1		{v0.16b}, [x0], #16		/* put pt block */
81	subs		w4, w4, #1
82	bne		.Lecbdecloop
83.Lecbdecout:
84	ldp		x29, x30, [sp], #16
85	ret
86AES_ENDPROC(aes_ecb_decrypt)
87
88
89	/*
90	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
91	 *		   int blocks, u8 iv[])
92	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
93	 *		   int blocks, u8 iv[])
94	 */
95
/*
 * CBC encryption of whole 16-byte blocks.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv
 * v4 carries the chaining value (the iv, then the most recent ciphertext
 * block) and is written back to [x5] on exit.
 * Every block is XORed with the previous ciphertext before encryption, so
 * even the 4-block path invokes encrypt_block once per block. No "bl" is
 * issued here, hence no frame record is set up.
 */
96AES_ENTRY(aes_cbc_encrypt)
97	ld1		{v4.16b}, [x5]			/* get iv */
98	enc_prepare	w3, x2, x6
99
100.Lcbcencloop4x:
101	subs		w4, w4, #4			/* at least 4 blocks left? */
102	bmi		.Lcbcenc1x			/* no: finish one block at a time */
103	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
104	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
105	encrypt_block	v0, w3, x2, x6, w7
106	eor		v1.16b, v1.16b, v0.16b		/* chain: xor with previous ct */
107	encrypt_block	v1, w3, x2, x6, w7
108	eor		v2.16b, v2.16b, v1.16b
109	encrypt_block	v2, w3, x2, x6, w7
110	eor		v3.16b, v3.16b, v2.16b
111	encrypt_block	v3, w3, x2, x6, w7
112	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 ct blocks */
113	mov		v4.16b, v3.16b			/* last ct is the next chaining value */
114	b		.Lcbcencloop4x
115.Lcbcenc1x:
116	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
117	beq		.Lcbcencout
118.Lcbcencloop:
119	ld1		{v0.16b}, [x1], #16		/* get next pt block */
120	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
121	encrypt_block	v4, w3, x2, x6, w7		/* v4 = ct = next chaining value */
122	st1		{v4.16b}, [x0], #16
123	subs		w4, w4, #1
124	bne		.Lcbcencloop
125.Lcbcencout:
126	st1		{v4.16b}, [x5]			/* return iv */
127	ret
128AES_ENDPROC(aes_cbc_encrypt)
129
130
/*
 * CBC decryption of whole 16-byte blocks.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv
 * v7 carries the chaining value (the iv, then the most recent ciphertext
 * block) and is written back to [x5] on exit.
 * In the 4-block path the first three ciphertext blocks are copied to
 * v4-v6 before decryption because they are needed afterwards as the XOR
 * inputs for blocks 1-3; the fourth is re-read from memory to become the
 * next chaining value.
 */
131AES_ENTRY(aes_cbc_decrypt)
132	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
133	mov		x29, sp
134
135	ld1		{v7.16b}, [x5]			/* get iv */
136	dec_prepare	w3, x2, x6
137
138.LcbcdecloopNx:
139	subs		w4, w4, #4			/* at least 4 blocks left? */
140	bmi		.Lcbcdec1x			/* no: finish one block at a time */
141	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
142	mov		v4.16b, v0.16b			/* keep ct 0..2 for chaining */
143	mov		v5.16b, v1.16b
144	mov		v6.16b, v2.16b
145	bl		aes_decrypt_block4x
146	sub		x1, x1, #16			/* step back to the 4th ct block */
147	eor		v0.16b, v0.16b, v7.16b		/* block 0: xor with incoming iv */
148	eor		v1.16b, v1.16b, v4.16b
149	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
150	eor		v2.16b, v2.16b, v5.16b
151	eor		v3.16b, v3.16b, v6.16b
152	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 pt blocks */
153	b		.LcbcdecloopNx
154.Lcbcdec1x:
155	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
156	beq		.Lcbcdecout
157.Lcbcdecloop:
158	ld1		{v1.16b}, [x1], #16		/* get next ct block */
159	mov		v0.16b, v1.16b			/* ...and copy to v0 */
160	decrypt_block	v0, w3, x2, x6, w7
161	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
162	mov		v7.16b, v1.16b			/* ct is next iv */
163	st1		{v0.16b}, [x0], #16
164	subs		w4, w4, #1
165	bne		.Lcbcdecloop
166.Lcbcdecout:
167	st1		{v7.16b}, [x5]			/* return iv */
168	ldp		x29, x30, [sp], #16
169	ret
170AES_ENDPROC(aes_cbc_decrypt)
171
172
173	/*
174	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
175	 *		       int rounds, int bytes, u8 const iv[])
176	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
177	 *		       int rounds, int bytes, u8 const iv[])
178	 */
179
/*
 * CBC encryption of the final two blocks with ciphertext stealing.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
 * Let d = bytes - 16. Two permute vectors are read from
 * .Lcts_permute_table at offsets d and 32 - d; for those 16-byte loads to
 * stay inside the 48-byte table d must lie in 0..16, i.e. the routine
 * handles 16..32 input bytes.
 * The input and output are accessed as two overlapping 16-byte vectors:
 * one at offset 0 and one at offset d.
 * The iv is only read; it is not written back.
 * NOTE(review): bytes is declared int but the full x4 is used in address
 * arithmetic - this relies on the upper 32 bits of x4 being clear; confirm
 * against the callers.
 */
180AES_ENTRY(aes_cbc_cts_encrypt)
181	adr_l		x8, .Lcts_permute_table
182	sub		x4, x4, #16			/* x4 = d = bytes - 16 */
183	add		x9, x8, #32
184	add		x8, x8, x4			/* x8 = table + d */
185	sub		x9, x9, x4			/* x9 = table + 32 - d */
186	ld1		{v3.16b}, [x8]			/* v3: moves first d bytes to the end, zeroes the rest */
187	ld1		{v4.16b}, [x9]			/* v4: moves last d bytes to the front, zeroes the rest */
188
189	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
190	ld1		{v1.16b}, [x1]			/* v1 = last 16 input bytes */
191
192	ld1		{v5.16b}, [x5]			/* get iv */
193	enc_prepare	w3, x2, x6
194
195	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
196	tbl		v1.16b, {v1.16b}, v4.16b	/* v1 = final d pt bytes, zero padded */
197	encrypt_block	v0, w3, x2, x6, w7
198
199	eor		v1.16b, v1.16b, v0.16b		/* chain padded tail with first ct */
200	tbl		v0.16b, {v0.16b}, v3.16b	/* first d ct bytes, placed at the end */
201	encrypt_block	v1, w3, x2, x6, w7
202
203	add		x4, x0, x4			/* x4 = out + d */
204	st1		{v0.16b}, [x4]			/* overlapping stores */
205	st1		{v1.16b}, [x0]			/* full block last: overwrites out[0..15] */
206	ret
207AES_ENDPROC(aes_cbc_cts_encrypt)
208
/*
 * CBC decryption of the final two blocks with ciphertext stealing.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, x4 = bytes, x5 = iv
 * Let d = bytes - 16. Two permute vectors are read from
 * .Lcts_permute_table at offsets d and 32 - d; for those 16-byte loads to
 * stay inside the 48-byte table d must lie in 0..16, i.e. the routine
 * handles 16..32 input bytes.
 * The input and output are accessed as two overlapping 16-byte vectors:
 * one at offset 0 and one at offset d.
 * The iv is only read; it is not written back.
 * NOTE(review): bytes is declared int but the full x4 is used in address
 * arithmetic - this relies on the upper 32 bits of x4 being clear; confirm
 * against the callers.
 */
209AES_ENTRY(aes_cbc_cts_decrypt)
210	adr_l		x8, .Lcts_permute_table
211	sub		x4, x4, #16			/* x4 = d = bytes - 16 */
212	add		x9, x8, #32
213	add		x8, x8, x4			/* x8 = table + d */
214	sub		x9, x9, x4			/* x9 = table + 32 - d */
215	ld1		{v3.16b}, [x8]			/* v3: moves first d bytes to the end, zeroes the rest */
216	ld1		{v4.16b}, [x9]			/* v4: moves last d bytes to the front, out of range elsewhere */
217
218	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
219	ld1		{v1.16b}, [x1]			/* v1 = last 16 input bytes */
220
221	ld1		{v5.16b}, [x5]			/* get iv */
222	dec_prepare	w3, x2, x6
223
224	tbl		v2.16b, {v1.16b}, v4.16b	/* v2 = final d ct bytes, zero padded */
225	decrypt_block	v0, w3, x2, x6, w7
226	eor		v2.16b, v2.16b, v0.16b		/* first d bytes of v2 = final pt bytes */
227
228	tbx		v0.16b, {v1.16b}, v4.16b	/* tbx: replace first d bytes, keep the others */
229	tbl		v2.16b, {v2.16b}, v3.16b	/* final pt bytes, placed at the end */
230	decrypt_block	v0, w3, x2, x6, w7
231	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
232
233	add		x4, x0, x4			/* x4 = out + d */
234	st1		{v2.16b}, [x4]			/* overlapping stores */
235	st1		{v0.16b}, [x0]			/* full block last: overwrites out[0..15] */
236	ret
237AES_ENDPROC(aes_cbc_cts_decrypt)
238
/*
 * Permute table for the CBC-CTS routines: 16 bytes of 0xff, the identity
 * indices 0x0..0xf, then 16 more bytes of 0xff (48 bytes in total).
 * aes_cbc_cts_encrypt/decrypt load 16-byte windows at offsets d and
 * 32 - d (d = bytes - 16) and use them as tbl/tbx index vectors; 0xff is
 * an out-of-range index, for which tbl writes zero and tbx leaves the
 * destination byte unchanged.
 * Placed in .rodata with 64-byte alignment (.align takes a power of two
 * on arm64).
 */
239	.section	".rodata", "a"
240	.align		6
241.Lcts_permute_table:
242	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
243	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
244	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
245	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
246	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
247	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
248	.previous
249
250
251	/*
252	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
253	 *		   int blocks, u8 ctr[])
254	 */
255
/*
 * CTR mode: XOR the input with the encrypted counter stream.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr
 * v4 holds the counter block in memory byte order. x6 holds a
 * byte-reversed copy of bytes 8-15 of the counter (v4.d[1]) so it can be
 * incremented with plain integer adds; it is reversed again before being
 * inserted back into v4.
 * The 4-block path only patches the last 32-bit word of each counter
 * block, so it is used only when adding the block count to the low 32
 * bits of the counter does not carry; otherwise everything goes through
 * the one-block loop, which propagates carries through all 128 bits.
 * A negative block count selects the tail-block path: one keystream block
 * is written to out unmodified (see .Lctrtailblock).
 */
256AES_ENTRY(aes_ctr_encrypt)
257	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
258	mov		x29, sp
259
260	enc_prepare	w3, x2, x6		/* x6 is only a macro operand here; reloaded below */
261	ld1		{v4.16b}, [x5]		/* get counter block */
262
263	umov		x6, v4.d[1]		/* keep swabbed ctr in reg */
264	rev		x6, x6
265	cmn		w6, w4			/* 32 bit overflow? */
266	bcs		.Lctrloop		/* yes: use the carry-aware one-block loop */
267.LctrloopNx:
268	subs		w4, w4, #4		/* at least 4 blocks left? */
269	bmi		.Lctr1x
270	add		w7, w6, #1		/* w7..w9 = low counter word + 1..3 */
271	mov		v0.16b, v4.16b		/* v0 uses the current counter as is */
272	add		w8, w6, #2
273	mov		v1.16b, v4.16b
274	add		w9, w6, #3
275	mov		v2.16b, v4.16b
276	rev		w7, w7			/* back to memory byte order */
277	mov		v3.16b, v4.16b
278	rev		w8, w8
279	mov		v1.s[3], w7		/* patch last 32-bit word of each block */
280	rev		w9, w9
281	mov		v2.s[3], w8
282	mov		v3.s[3], w9
283	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
284	bl		aes_encrypt_block4x
285	eor		v0.16b, v5.16b, v0.16b
286	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
287	eor		v1.16b, v6.16b, v1.16b
288	eor		v2.16b, v7.16b, v2.16b
289	eor		v3.16b, v5.16b, v3.16b
290	st1		{v0.16b-v3.16b}, [x0], #64
291	add		x6, x6, #4		/* advance counter past these 4 blocks */
292	rev		x7, x6
293	ins		v4.d[1], x7
294	cbz		w4, .Lctrout		/* no blocks left */
295	b		.LctrloopNx
296.Lctr1x:
297	adds		w4, w4, #4		/* undo the subtract: w4 = blocks left */
298	beq		.Lctrout
299.Lctrloop:
300	mov		v0.16b, v4.16b
301	encrypt_block	v0, w3, x2, x8, w7	/* v0 = keystream block */
302
303	adds		x6, x6, #1		/* increment BE ctr */
304	rev		x7, x6
305	ins		v4.d[1], x7
306	bcs		.Lctrcarry		/* overflow? (flags still from the adds) */
307
308.Lctrcarrydone:
309	subs		w4, w4, #1
310	bmi		.Lctrtailblock		/* blocks <0 means tail block */
311	ld1		{v3.16b}, [x1], #16
312	eor		v3.16b, v0.16b, v3.16b
313	st1		{v3.16b}, [x0], #16
314	bne		.Lctrloop		/* flags still from the subs above */
315
316.Lctrout:
317	st1		{v4.16b}, [x5]		/* return next CTR value */
318	ldp		x29, x30, [sp], #16
319	ret
320
321	/*
322	 * Tail block: store the raw keystream block without reading any
323	 * input. The counter is not written back on this path.
324	 * NOTE(review): presumably the caller XORs the partial final block
325	 * with this keystream itself - confirm against the glue code.
326	 */
320.Lctrtailblock:
321	st1		{v0.16b}, [x0]
322	ldp		x29, x30, [sp], #16
323	ret
324
325.Lctrcarry:
326	umov		x7, v4.d[0]		/* load bytes 0-7 of ctr (upper 64 bits) */
327	rev		x7, x7			/* ... to handle the carry */
328	add		x7, x7, #1
329	rev		x7, x7
330	ins		v4.d[0], x7
331	b		.Lctrcarrydone
332AES_ENDPROC(aes_ctr_encrypt)
334
335
336	/*
337	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
338	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
339	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
340	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
341	 */
342
/*
 * next_tweak out, in, tmp
 * Derives the next XTS tweak: the 128-bit value in \in is shifted left by
 * one bit, with the bit shifted out of each 64-bit half folded into the
 * other half through xtsmask.
 *   - sshr #63 turns each 64-bit lane into all-ones or zero depending on
 *     its top bit; the and keeps only the xtsmask constant for that lane.
 *   - add in + in shifts each 64-bit lane left by one.
 *   - ext #8 swaps the two 64-bit halves of \tmp so that each carry term
 *     lands in the opposite lane before the final eor.
 * \out may be the same register as \in (it is used that way in this
 * file); \tmp is clobbered. xtsmask is a register alias that is not
 * defined in this file and must hold the value built by xts_load_mask.
 */
343	.macro		next_tweak, out, in, tmp
344	sshr		\tmp\().2d,  \in\().2d,   #63
345	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
346	add		\out\().2d,  \in\().2d,   \in\().2d
347	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
348	eor		\out\().16b, \out\().16b, \tmp\().16b
349	.endm
350
/*
 * xts_load_mask tmp
 * Builds the constant used by next_tweak in xtsmask: the 32-bit lanes end
 * up as { 0x1, 0x0, 0x87, 0x0 }, i.e. 0x1 in the low 64-bit half and 0x87
 * in the high half.
 *   - movi on the .2s arrangement sets the two low 32-bit lanes and clears
 *     the upper 64 bits of the register.
 *   - uzp1 takes the even-numbered 32-bit lanes of xtsmask, then of \tmp.
 * \tmp is clobbered. xtsmask is a register alias that is not defined in
 * this file.
 */
351	.macro		xts_load_mask, tmp
352	movi		xtsmask.2s, #0x1
353	movi		\tmp\().2s, #0x87
354	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
355	.endm
356
/*
 * XTS encryption of whole 16-byte blocks.
 * Register use, following the argument order of the XTS prototype comment
 * in this file:
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
 *   x5 = rk2, x6 = iv, w7 = first
 * v4 holds the current tweak; v5-v7 hold the next three tweaks in the
 * 4-block path. When first is non-zero the iv is encrypted with rk2 to
 * form the initial tweak; otherwise the value loaded from [x6] is advanced
 * with next_tweak before use. On exit the last tweak used is stored back
 * to [x6].
 * w7 is reused as a macro operand after the "first" test.
 * NOTE(review): enc_switch_key and xts_reload_mask are defined by the
 * including file; presumably the former loads the rk1 round keys and the
 * latter rebuilds xtsmask where the 4-block cipher clobbers it - confirm.
 */
357AES_ENTRY(aes_xts_encrypt)
358	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
359	mov		x29, sp
360
361	ld1		{v4.16b}, [x6]			/* get iv / previous tweak */
362	xts_load_mask	v8
363	cbz		w7, .Lxtsencnotfirst
364
365	enc_prepare	w3, x5, x8			/* use rk2 for the tweak */
366	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
367	enc_switch_key	w3, x2, x8			/* continue with rk1 */
368	b		.LxtsencNx
369
370.Lxtsencnotfirst:
371	enc_prepare	w3, x2, x8
372.LxtsencloopNx:
373	next_tweak	v4, v4, v8			/* advance past the last tweak used */
374.LxtsencNx:
375	subs		w4, w4, #4			/* at least 4 blocks left? */
376	bmi		.Lxtsenc1x
377	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
378	next_tweak	v5, v4, v8
379	eor		v0.16b, v0.16b, v4.16b		/* xor each block with its tweak */
380	next_tweak	v6, v5, v8
381	eor		v1.16b, v1.16b, v5.16b
382	eor		v2.16b, v2.16b, v6.16b
383	next_tweak	v7, v6, v8
384	eor		v3.16b, v3.16b, v7.16b
385	bl		aes_encrypt_block4x
386	eor		v3.16b, v3.16b, v7.16b		/* xor with the same tweaks again */
387	eor		v0.16b, v0.16b, v4.16b
388	eor		v1.16b, v1.16b, v5.16b
389	eor		v2.16b, v2.16b, v6.16b
390	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 ct blocks */
391	mov		v4.16b, v7.16b			/* v4 = last tweak used */
392	cbz		w4, .Lxtsencout
393	xts_reload_mask	v8
394	b		.LxtsencloopNx
395.Lxtsenc1x:
396	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
397	beq		.Lxtsencout
398.Lxtsencloop:
399	ld1		{v1.16b}, [x1], #16		/* get next pt block */
400	eor		v0.16b, v1.16b, v4.16b		/* xor with tweak */
401	encrypt_block	v0, w3, x2, x8, w7
402	eor		v0.16b, v0.16b, v4.16b		/* xor with tweak again => ct */
403	st1		{v0.16b}, [x0], #16
404	subs		w4, w4, #1
405	beq		.Lxtsencout			/* done: leave v4 as the last tweak used */
406	next_tweak	v4, v4, v8
407	b		.Lxtsencloop
408.Lxtsencout:
409	st1		{v4.16b}, [x6]			/* return last tweak used */
410	ldp		x29, x30, [sp], #16
411	ret
412AES_ENDPROC(aes_xts_encrypt)
413
414
/*
 * XTS decryption of whole 16-byte blocks.
 * Register use, following the prototype comment for this routine:
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks,
 *   x5 = rk2, x6 = iv, w7 = first
 * v4 holds the current tweak; v5-v7 hold the next three tweaks in the
 * 4-block path. When first is non-zero the iv is ENcrypted with rk2 to
 * form the initial tweak (the tweak is produced with the encrypt macros
 * in both directions), after which the rk1 keys are prepared for
 * decryption. Otherwise the value loaded from [x6] is advanced with
 * next_tweak before use. On exit the last tweak used is stored back to
 * [x6].
 * w7 is reused as a macro operand after the "first" test.
 * NOTE(review): xts_reload_mask is defined by the including file;
 * presumably it rebuilds xtsmask where the 4-block cipher clobbers it -
 * confirm.
 */
414AES_ENTRY(aes_xts_decrypt)
415	stp		x29, x30, [sp, #-16]!	/* frame record: "bl" below overwrites x30 */
416	mov		x29, sp
417
418	ld1		{v4.16b}, [x6]			/* get iv / previous tweak */
419	xts_load_mask	v8
420	cbz		w7, .Lxtsdecnotfirst
421
422	enc_prepare	w3, x5, x8			/* use rk2 for the tweak */
423	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
424	dec_prepare	w3, x2, x8			/* continue with rk1, decrypting */
425	b		.LxtsdecNx
426
427.Lxtsdecnotfirst:
428	dec_prepare	w3, x2, x8
429.LxtsdecloopNx:
430	next_tweak	v4, v4, v8			/* advance past the last tweak used */
431.LxtsdecNx:
432	subs		w4, w4, #4			/* at least 4 blocks left? */
433	bmi		.Lxtsdec1x
434	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
435	next_tweak	v5, v4, v8
436	eor		v0.16b, v0.16b, v4.16b		/* xor each block with its tweak */
437	next_tweak	v6, v5, v8
438	eor		v1.16b, v1.16b, v5.16b
439	eor		v2.16b, v2.16b, v6.16b
440	next_tweak	v7, v6, v8
441	eor		v3.16b, v3.16b, v7.16b
442	bl		aes_decrypt_block4x
443	eor		v3.16b, v3.16b, v7.16b		/* xor with the same tweaks again */
444	eor		v0.16b, v0.16b, v4.16b
445	eor		v1.16b, v1.16b, v5.16b
446	eor		v2.16b, v2.16b, v6.16b
447	st1		{v0.16b-v3.16b}, [x0], #64	/* put 4 pt blocks */
448	mov		v4.16b, v7.16b			/* v4 = last tweak used */
449	cbz		w4, .Lxtsdecout
450	xts_reload_mask	v8
451	b		.LxtsdecloopNx
452.Lxtsdec1x:
453	adds		w4, w4, #4			/* undo the subtract: w4 = blocks left */
454	beq		.Lxtsdecout
455.Lxtsdecloop:
456	ld1		{v1.16b}, [x1], #16		/* get next ct block */
457	eor		v0.16b, v1.16b, v4.16b		/* xor with tweak */
458	decrypt_block	v0, w3, x2, x8, w7
459	eor		v0.16b, v0.16b, v4.16b		/* xor with tweak again => pt */
460	st1		{v0.16b}, [x0], #16
461	subs		w4, w4, #1
462	beq		.Lxtsdecout			/* done: leave v4 as the last tweak used */
463	next_tweak	v4, v4, v8
464	b		.Lxtsdecloop
465.Lxtsdecout:
466	st1		{v4.16b}, [x6]			/* return last tweak used */
467	ldp		x29, x30, [sp], #16
468	ret
469AES_ENDPROC(aes_xts_decrypt)
471
472	/*
473	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
474	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
475	 */
/*
 * Folds whole 16-byte input blocks into the digest dg: each block is
 * XORed into v0, which is then encrypted before the next block is added.
 * Register use, following the prototype comment for this routine:
 *   x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
 *   w5 = enc_before, w6 = enc_after
 * The arguments are copied into x19-x24 and the loops work from those
 * copies (x19 = in, x20 = rk, w21 = rounds, w22 = blocks, x23 = dg,
 * x24 = enc_after).
 * If enc_before is non-zero the digest is encrypted once before any input
 * is added. After the final block has been XORed in, the digest is
 * encrypted only if enc_after is non-zero.
 * NOTE(review): frame_push, frame_pop and cond_yield_neon are defined
 * outside this file. Presumably frame_push 6 saves x19-x24 and
 * cond_yield_neon may give up the NEON unit and resume at .Lmacrestart,
 * which would explain why dg is stored before it and why .Lmacrestart
 * reloads dg and prepares the round keys again - confirm.
 */
476AES_ENTRY(aes_mac_update)
477	frame_push	6
478
479	mov		x19, x0
480	mov		x20, x1
481	mov		x21, x2
482	mov		x22, x3
483	mov		x23, x4
484	mov		x24, x6
485
486	ld1		{v0.16b}, [x23]			/* get dg */
487	enc_prepare	w2, x1, x7			/* x1/w2 still equal x20/w21 here */
488	cbz		w5, .Lmacloop4x			/* enc_before == 0: skip */
489
490	encrypt_block	v0, w2, x1, x7, w8
491
492.Lmacloop4x:
493	subs		w22, w22, #4			/* at least 4 blocks left? */
494	bmi		.Lmac1x
495	ld1		{v1.16b-v4.16b}, [x19], #64	/* get next 4 pt blocks */
496	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
497	encrypt_block	v0, w21, x20, x7, w8
498	eor		v0.16b, v0.16b, v2.16b
499	encrypt_block	v0, w21, x20, x7, w8
500	eor		v0.16b, v0.16b, v3.16b
501	encrypt_block	v0, w21, x20, x7, w8
502	eor		v0.16b, v0.16b, v4.16b
503	cmp		w22, wzr			/* was that the last block? */
504	csinv		x5, x24, xzr, eq		/* x5 = enc_after if last, else all ones */
505	cbz		w5, .Lmacout			/* last block and !enc_after: done */
506	encrypt_block	v0, w21, x20, x7, w8
507	st1		{v0.16b}, [x23]			/* return dg */
508	cond_yield_neon	.Lmacrestart
509	b		.Lmacloop4x
510.Lmac1x:
511	add		w22, w22, #4			/* undo the subtract: w22 = blocks left */
512.Lmacloop:
513	cbz		w22, .Lmacout
514	ld1		{v1.16b}, [x19], #16		/* get next pt block */
515	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
516
517	subs		w22, w22, #1			/* Z set when this was the last block */
518	csinv		x5, x24, xzr, eq		/* x5 = enc_after if last, else all ones */
519	cbz		w5, .Lmacout			/* last block and !enc_after: done */
520
521.Lmacenc:
522	encrypt_block	v0, w21, x20, x7, w8
523	b		.Lmacloop
524
525.Lmacout:
526	st1		{v0.16b}, [x23]			/* return dg */
527	frame_pop
528	ret
529
530.Lmacrestart:
531	ld1		{v0.16b}, [x23]			/* get dg */
532	enc_prepare	w21, x20, x0			/* x0 is free: in was copied to x19 */
533	b		.Lmacloop4x
534AES_ENDPROC(aes_mac_update)
535