/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
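/*
 * ST4() and ST5() emit their argument only when MAX_STRIDE is 4 or 5,
 * respectively, so the shared code below can carry both the 4-way and the
 * 5-way interleaved paths and let the including file select one of them
 * via its MAX_STRIDE definition.
 */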

SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)

SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)

#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
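	/*
	 * ECB applies the cipher to every 16-byte block independently:
	 * roughly out[i] = E_k(in[i]) (or D_k() for decryption). The main
	 * loops below process MAX_STRIDE blocks per iteration and fall
	 * back to a single-block loop for any remainder.
	 */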

AES_FUNC_START(aes_ecb_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_encrypt)


AES_FUNC_START(aes_ecb_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */
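	/*
	 * CBC chains the blocks through the IV; in C-like pseudocode:
	 *
	 *	ct[0] = E_k(pt[0] ^ iv); ct[i] = E_k(pt[i] ^ ct[i - 1]);
	 *
	 * so encryption below is inherently sequential, while decryption
	 * can be interleaved. The ESSIV variants differ only in how the
	 * IV is derived: the caller's IV is first encrypted with the
	 * second key schedule rk2 (expanded from a 256-bit key, hence
	 * the hard-coded 14 rounds) before it is used as the CBC IV.
	 */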

AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)

AES_FUNC_START(aes_essiv_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
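	/*
	 * The decryption below works on v0-v3 (and v4 for the 5-way
	 * variant) in place, so copies of the ciphertext blocks are kept
	 * first: each plaintext is the decrypted block XORed with the
	 * previous ciphertext block, or with cbciv for the first block
	 * of the batch.
	 */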
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	ldp		x29, x30, [sp], #16
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)


	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 */
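	/*
	 * CBC with ciphertext stealing: a final partial block is handled
	 * by overlapping it with the preceding block, so no padding is
	 * needed. These helpers process that final portion; the
	 * overlapping loads/stores together with .Lcts_permute_table
	 * (used via tbl/tbx) move the partial block into place without
	 * touching memory outside the supplied buffers.
	 */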

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]			/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)

AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]			/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
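	/*
	 * Permute table shared by the CTS and CTR tail code: 16-byte
	 * windows into this table serve as tbl/tbx index vectors that
	 * shift the bytes of a register by a variable amount. The 0xff
	 * entries make tbl clear the corresponding output byte (and tbx
	 * leave it unchanged), so a partial block can be repositioned
	 * without reading or writing beyond the end of the data.
	 */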
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[], u8 finalbuf[])
	 */
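	/*
	 * CTR mode: out[i] = in[i] ^ E_k(ctr + i), with the counter
	 * treated as a 128-bit big-endian integer. The least significant
	 * 64 bits are kept byte-reversed in x12 so they can be advanced
	 * with ordinary integer arithmetic; the rare carry into the
	 * upper 64 bits is handled out of line below. If fewer than 16
	 * bytes remain at the end, the result for that partial block is
	 * written to finalbuf instead of the output buffer (see
	 * .Lctrtail1x), so the caller can copy out just the valid bytes.
	 */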

AES_FUNC_START(aes_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x12
	ld1		{vctr.16b}, [x5]

	umov		x12, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x12, x12

.LctrloopNx:
	add		w7, w4, #15
	sub		w4, w4, #MAX_STRIDE << 4
	lsr		w7, w7, #4
	mov		w8, #MAX_STRIDE
	cmp		w7, w8
	csel		w7, w7, w8, lt
	adds		x12, x12, x7

	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	bcs		0f

	.subsection	1
	/* apply carry to outgoing counter */
0:	umov		x8, vctr.d[0]
	rev		x8, x8
	add		x8, x8, #1
	rev		x8, x8
	ins		vctr.d[0], x8

	/* apply carry to N counter blocks for N := x12 */
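	/*
	 * x12 now holds the number of counter values in this batch whose
	 * lower 64 bits wrapped around; those are always the last ones,
	 * so the computed branch skips the leading entries of the mov
	 * ladder and executes only the final x12 of them (each entry is
	 * 8 bytes: a BTI landing pad plus the mov), propagating the
	 * carry into the upper counter word of exactly those blocks.
	 */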
	cbz		x12, 2f
	adr		x16, 1f
	sub		x16, x16, x12, lsl #3
	br		x16
	hint		34			// bti c
	mov		v0.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v1.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v2.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v3.d[0], vctr.d[0]
ST5(	hint		34				)
ST5(	mov		v4.d[0], vctr.d[0]		)
1:	b		2f
	.previous

2:	rev		x7, x12
	ins		vctr.d[1], x7
	sub		x7, x12, #MAX_STRIDE - 1
	sub		x8, x12, #MAX_STRIDE - 2
	sub		x9, x12, #MAX_STRIDE - 3
	rev		x7, x7
	rev		x8, x8
	mov		v1.d[1], x7
	rev		x9, x9
ST5(	sub		x10, x12, #MAX_STRIDE - 4	)
	mov		v2.d[1], x8
ST5(	rev		x10, x10			)
	mov		v3.d[1], x9
ST5(	mov		v4.d[1], x10			)
	tbnz		w4, #31, .Lctrtail
	ld1		{v5.16b-v7.16b}, [x1], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	cbz		w4, .Lctrout
	b		.LctrloopNx

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
	mov		x16, #16
	ands		x13, x4, #0xf
	csel		x13, x13, x16, ne
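	/*
	 * x13 is the size of the final block (1-16 bytes). x15/x16 (and
	 * x14 for the 5-way variant) are set to 16 for each leading full
	 * block that is actually present and to 0 otherwise: a zero
	 * post-increment turns the corresponding load/store into a dummy
	 * access of the same address, which the overlapping accesses at
	 * the end then make irrelevant.
	 */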

ST5(	cmp		w4, #64 - (MAX_STRIDE << 4)	)
ST5(	csel		x14, x16, xzr, gt		)
	cmp		w4, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		w4, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		w4, #16 - (MAX_STRIDE << 4)
	ble		.Lctrtail1x

	adr_l		x12, .Lcts_permute_table
	add		x12, x12, x13

ST5(	ld1		{v5.16b}, [x1], x14		)
	ld1		{v6.16b}, [x1], x15
	ld1		{v7.16b}, [x1], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	ld1		{v8.16b}, [x1], x13
	ld1		{v9.16b}, [x1]
	ld1		{v10.16b}, [x12]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [x0], x14		)
	st1		{v6.16b}, [x0], x15
	st1		{v7.16b}, [x0], x16
	add		x13, x13, x0
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [x0]
	b		.Lctrout

.Lctrtail1x:
	csel		x0, x0, x6, eq		// use finalbuf if less than a full block
	ld1		{v5.16b}, [x1]
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, w3, x2, x8, w7
	eor		v5.16b, v5.16b, v3.16b
	st1		{v5.16b}, [x0]
	b		.Lctrout
AES_FUNC_END(aes_ctr_encrypt)


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */
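	/*
	 * XTS encrypts each block independently but whitened by a
	 * per-block tweak: out[i] = E_k1(in[i] ^ T[i]) ^ T[i], where
	 * T[0] is the IV encrypted with the second key and every
	 * following tweak is the previous one multiplied by x in
	 * GF(2^128) (see next_tweak below). A trailing partial block is
	 * handled with ciphertext stealing, reusing .Lcts_permute_table.
	 */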

	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
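	/*
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128):
	 * each 64-bit lane is shifted left by one bit, the bit shifted
	 * out of the low lane is carried into the high lane, and a bit
	 * shifted out of the high lane is folded back into the low lane
	 * as 0x87 (the reduction polynomial x^7 + x^2 + x + 1).
	 * xts_load_mask sets up xtsmask with exactly those two per-lane
	 * constants, 0x1 and 0x87.
	 */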

AES_FUNC_START(aes_xts_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)

AES_FUNC_START(aes_xts_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]
	ldp		x29, x30, [sp], #16
	ret

.Lxtsdeccts:
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b
	tbx		v0.16b, {v1.16b}, v3.16b

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)

	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 */
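	/*
	 * CBC-MAC style update: the digest dg is carried across calls
	 * and every input block is folded in as dg = E_k(dg ^ block).
	 * enc_before requests an extra encryption of dg before the first
	 * block is absorbed; enc_after decides whether the final
	 * dg ^ block value is encrypted before it is stored back. The
	 * return value is the number of blocks still unprocessed when
	 * cond_yield forces an early exit.
	 */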
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next 4 pt blocks */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)