xref: /openbmc/linux/arch/arm64/crypto/aes-modes.S (revision 8ffdff6a)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
4 *
5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
6 */
7
8/* included by aes-ce.S and aes-neon.S */
9
10	.text
11	.align		4
12
13#ifndef MAX_STRIDE
14#define MAX_STRIDE	4
15#endif
16
17#if MAX_STRIDE == 4
18#define ST4(x...) x
19#define ST5(x...)
20#else
21#define ST4(x...)
22#define ST5(x...) x
23#endif
24
/*
 * Out-of-line wrapper around the encrypt_block4x macro so that the mode
 * routines in this file can share one expansion via 'bl'. Operates on v0-v3
 * with w3/x2 as set up by the caller (rounds and round-key pointer in every
 * caller in this file). x8 and w7 are also handed to the macro -- presumably
 * as scratch; confirm against the macro definition in the including file.
 */
25SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
26	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
27	ret
28SYM_FUNC_END(aes_encrypt_block4x)
29
/*
 * Out-of-line wrapper around the decrypt_block4x macro, the decryption
 * counterpart of the 4x encrypt helper. Operates on v0-v3 with w3/x2 as set
 * up by the caller (rounds and round-key pointer in every caller in this
 * file). x8 and w7 are also handed to the macro -- presumably as scratch;
 * confirm against the macro definition in the including file.
 */
30SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
31	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
32	ret
33SYM_FUNC_END(aes_decrypt_block4x)
34
/*
 * 5-way variants of the block helpers, only built when the including file
 * selects MAX_STRIDE == 5. Same register contract as the 4x helpers, with v4
 * carrying the fifth block.
 */
35#if MAX_STRIDE == 5
36SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
37	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
38	ret
39SYM_FUNC_END(aes_encrypt_block5x)
40
41SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
42	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
43	ret
44SYM_FUNC_END(aes_decrypt_block5x)
45#endif
46
47	/*
48	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
49	 *		   int blocks)
50	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
51	 *		   int blocks)
52	 */
53
/*
 * ECB encryption. Register use follows from the prototype above:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks
 * Runs MAX_STRIDE blocks per iteration through the shared Nx helper while at
 * least that many remain, then finishes the leftovers one block at a time.
 */
54AES_FUNC_START(aes_ecb_encrypt)
	/* frame record is needed because the 'bl' calls below overwrite x30 */
55	stp		x29, x30, [sp, #-16]!
56	mov		x29, sp
57
58	enc_prepare	w3, x2, x5
59
60.LecbencloopNx:
	/* go to the single-block path once fewer than MAX_STRIDE blocks remain */
61	subs		w4, w4, #MAX_STRIDE
62	bmi		.Lecbenc1x
63	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
64ST4(	bl		aes_encrypt_block4x		)
65ST5(	ld1		{v4.16b}, [x1], #16		)
66ST5(	bl		aes_encrypt_block5x		)
67	st1		{v0.16b-v3.16b}, [x0], #64
68ST5(	st1		{v4.16b}, [x0], #16		)
69	b		.LecbencloopNx
70.Lecbenc1x:
	/* undo the speculative subtraction; w4 = leftover blocks (may be 0) */
71	adds		w4, w4, #MAX_STRIDE
72	beq		.Lecbencout
73.Lecbencloop:
74	ld1		{v0.16b}, [x1], #16		/* get next pt block */
75	encrypt_block	v0, w3, x2, x5, w6
76	st1		{v0.16b}, [x0], #16
77	subs		w4, w4, #1
78	bne		.Lecbencloop
79.Lecbencout:
80	ldp		x29, x30, [sp], #16
81	ret
82AES_FUNC_END(aes_ecb_encrypt)
83
84
/*
 * ECB decryption. Same register contract and loop structure as
 * aes_ecb_encrypt (x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks),
 * using the decrypt helpers and dec_prepare instead.
 */
85AES_FUNC_START(aes_ecb_decrypt)
	/* frame record is needed because the 'bl' calls below overwrite x30 */
86	stp		x29, x30, [sp, #-16]!
87	mov		x29, sp
88
89	dec_prepare	w3, x2, x5
90
91.LecbdecloopNx:
	/* go to the single-block path once fewer than MAX_STRIDE blocks remain */
92	subs		w4, w4, #MAX_STRIDE
93	bmi		.Lecbdec1x
94	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
95ST4(	bl		aes_decrypt_block4x		)
96ST5(	ld1		{v4.16b}, [x1], #16		)
97ST5(	bl		aes_decrypt_block5x		)
98	st1		{v0.16b-v3.16b}, [x0], #64
99ST5(	st1		{v4.16b}, [x0], #16		)
100	b		.LecbdecloopNx
101.Lecbdec1x:
	/* undo the speculative subtraction; w4 = leftover blocks (may be 0) */
102	adds		w4, w4, #MAX_STRIDE
103	beq		.Lecbdecout
104.Lecbdecloop:
105	ld1		{v0.16b}, [x1], #16		/* get next ct block */
106	decrypt_block	v0, w3, x2, x5, w6
107	st1		{v0.16b}, [x0], #16
108	subs		w4, w4, #1
109	bne		.Lecbdecloop
110.Lecbdecout:
111	ldp		x29, x30, [sp], #16
112	ret
113AES_FUNC_END(aes_ecb_decrypt)
114
115
116	/*
117	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
118	 *		   int blocks, u8 iv[])
119	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
120	 *		   int blocks, u8 iv[])
121	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
122	 *			 int rounds, int blocks, u8 iv[],
123	 *			 u32 const rk2[]);
124	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
125	 *			 int rounds, int blocks, u8 iv[],
126	 *			 u32 const rk2[]);
127	 */
128
/*
 * CBC encryption with two entry points sharing one body.
 *   x0 = out, x1 = in, x2 = rk (rk1), w3 = rounds, w4 = blocks, x5 = iv,
 *   x6 = rk2 (ESSIV entry only)
 * The ESSIV entry first encrypts the IV with rk2 using a fixed 14 rounds,
 * then switches to the data key and joins the plain CBC loop.
 * CBC encryption is inherently serial, so the "4x" loop only batches the
 * loads/stores; each block is still encrypted after XORing with the
 * previous ciphertext block. v4 carries the chaining value between
 * iterations and is written back to iv[] on exit.
 * No 'bl' is used here, so no stack frame is set up.
 */
129AES_FUNC_START(aes_essiv_cbc_encrypt)
130	ld1		{v4.16b}, [x5]			/* get iv */
131
132	mov		w8, #14				/* AES-256: 14 rounds */
133	enc_prepare	w8, x6, x7
134	encrypt_block	v4, w8, x6, x7, w9
	/* continue with the data key (w3 rounds, key at x2) */
135	enc_switch_key	w3, x2, x6
136	b		.Lcbcencloop4x
137
138AES_FUNC_START(aes_cbc_encrypt)
139	ld1		{v4.16b}, [x5]			/* get iv */
140	enc_prepare	w3, x2, x6
141
142.Lcbcencloop4x:
143	subs		w4, w4, #4
144	bmi		.Lcbcenc1x
145	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
146	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
147	encrypt_block	v0, w3, x2, x6, w7
	/* each following block is chained to the ciphertext just produced */
148	eor		v1.16b, v1.16b, v0.16b
149	encrypt_block	v1, w3, x2, x6, w7
150	eor		v2.16b, v2.16b, v1.16b
151	encrypt_block	v2, w3, x2, x6, w7
152	eor		v3.16b, v3.16b, v2.16b
153	encrypt_block	v3, w3, x2, x6, w7
154	st1		{v0.16b-v3.16b}, [x0], #64
	/* last ciphertext block becomes the chaining value */
155	mov		v4.16b, v3.16b
156	b		.Lcbcencloop4x
157.Lcbcenc1x:
	/* undo the speculative subtraction; w4 = leftover blocks (may be 0) */
158	adds		w4, w4, #4
159	beq		.Lcbcencout
160.Lcbcencloop:
161	ld1		{v0.16b}, [x1], #16		/* get next pt block */
162	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
163	encrypt_block	v4, w3, x2, x6, w7
164	st1		{v4.16b}, [x0], #16
165	subs		w4, w4, #1
166	bne		.Lcbcencloop
167.Lcbcencout:
168	st1		{v4.16b}, [x5]			/* return iv */
169	ret
170AES_FUNC_END(aes_cbc_encrypt)
171AES_FUNC_END(aes_essiv_cbc_encrypt)
172
/*
 * CBC decryption with two entry points sharing one body.
 *   x0 = out, x1 = in, x2 = rk (rk1), w3 = rounds, w4 = blocks, x5 = iv,
 *   x6 = rk2 (ESSIV entry only)
 * 'cbciv' is a vector register alias supplied by the including file; it
 * holds the chaining value and is written back to iv[] on exit.
 * The ESSIV entry encrypts the IV with rk2 (fixed 14 rounds) before joining
 * the common path at .Lessivcbcdecstart.
 * Unlike encryption, decryption of the blocks is independent, so the Nx
 * loop decrypts MAX_STRIDE blocks at once and XORs afterwards. Copies of
 * the leading ciphertext blocks are kept in spare registers for that XOR;
 * the trailing ciphertext block(s) are re-read from memory instead.
 */
173AES_FUNC_START(aes_essiv_cbc_decrypt)
	/* frame record is needed because the 'bl' calls below overwrite x30 */
174	stp		x29, x30, [sp, #-16]!
175	mov		x29, sp
176
177	ld1		{cbciv.16b}, [x5]		/* get iv */
178
179	mov		w8, #14				/* AES-256: 14 rounds */
180	enc_prepare	w8, x6, x7
181	encrypt_block	cbciv, w8, x6, x7, w9
182	b		.Lessivcbcdecstart
183
184AES_FUNC_START(aes_cbc_decrypt)
185	stp		x29, x30, [sp, #-16]!
186	mov		x29, sp
187
188	ld1		{cbciv.16b}, [x5]		/* get iv */
189.Lessivcbcdecstart:
190	dec_prepare	w3, x2, x6
191
192.LcbcdecloopNx:
193	subs		w4, w4, #MAX_STRIDE
194	bmi		.Lcbcdec1x
195	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
196#if MAX_STRIDE == 5
197	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	/* keep ct blocks 0-2 for chaining; v0-v4 are overwritten by the helper */
198	mov		v5.16b, v0.16b
199	mov		v6.16b, v1.16b
200	mov		v7.16b, v2.16b
201	bl		aes_decrypt_block5x
	/* step back two blocks so ct blocks 3 and 4 can be re-read */
202	sub		x1, x1, #32
203	eor		v0.16b, v0.16b, cbciv.16b
204	eor		v1.16b, v1.16b, v5.16b
205	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
206	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
207	eor		v2.16b, v2.16b, v6.16b
208	eor		v3.16b, v3.16b, v7.16b
209	eor		v4.16b, v4.16b, v5.16b
210#else
	/* keep ct blocks 0-2 for chaining; v0-v3 are overwritten by the helper */
211	mov		v4.16b, v0.16b
212	mov		v5.16b, v1.16b
213	mov		v6.16b, v2.16b
214	bl		aes_decrypt_block4x
	/* step back one block so ct block 3 can be re-read as the next iv */
215	sub		x1, x1, #16
216	eor		v0.16b, v0.16b, cbciv.16b
217	eor		v1.16b, v1.16b, v4.16b
218	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
219	eor		v2.16b, v2.16b, v5.16b
220	eor		v3.16b, v3.16b, v6.16b
221#endif
222	st1		{v0.16b-v3.16b}, [x0], #64
223ST5(	st1		{v4.16b}, [x0], #16		)
224	b		.LcbcdecloopNx
225.Lcbcdec1x:
	/* undo the speculative subtraction; w4 = leftover blocks (may be 0) */
226	adds		w4, w4, #MAX_STRIDE
227	beq		.Lcbcdecout
228.Lcbcdecloop:
229	ld1		{v1.16b}, [x1], #16		/* get next ct block */
230	mov		v0.16b, v1.16b			/* ...and copy to v0 */
231	decrypt_block	v0, w3, x2, x6, w7
232	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
233	mov		cbciv.16b, v1.16b		/* ct is next iv */
234	st1		{v0.16b}, [x0], #16
235	subs		w4, w4, #1
236	bne		.Lcbcdecloop
237.Lcbcdecout:
238	st1		{cbciv.16b}, [x5]		/* return iv */
239	ldp		x29, x30, [sp], #16
240	ret
241AES_FUNC_END(aes_cbc_decrypt)
242AES_FUNC_END(aes_essiv_cbc_decrypt)
243
244
245	/*
246	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
247	 *		       int rounds, int bytes, u8 const iv[])
248	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
249	 *		       int rounds, int bytes, u8 const iv[])
250	 */
251
/*
 * CBC ciphertext-stealing encryption of the final two blocks.
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv
 * With n = bytes - 16 (the length of the last, possibly partial, block),
 * two 16-byte permute vectors are read from .Lcts_permute_table:
 *   v3 = table[n .. n+15]       moves the first n bytes of a vector to its
 *                               top n byte positions (other lanes zero)
 *   v4 = table[32-n .. 47-n]    moves the top n bytes of a vector to its
 *                               first n byte positions (other lanes zero)
 * The input is fetched with two overlapping 16-byte loads and the output is
 * written with two overlapping 16-byte stores, so no byte-granular access
 * is needed. The code as written assumes 16 < bytes <= 32 -- the caller is
 * presumably responsible for guaranteeing that; confirm at the call site.
 */
252AES_FUNC_START(aes_cbc_cts_encrypt)
253	adr_l		x8, .Lcts_permute_table
	/* x4 = n = bytes in the final block */
254	sub		x4, x4, #16
255	add		x9, x8, #32
256	add		x8, x8, x4
257	sub		x9, x9, x4
258	ld1		{v3.16b}, [x8]
259	ld1		{v4.16b}, [x9]
260
261	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
262	ld1		{v1.16b}, [x1]
263
264	ld1		{v5.16b}, [x5]			/* get iv */
265	enc_prepare	w3, x2, x6
266
267	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	/* v1 = final n input bytes moved to the front, zero padded */
268	tbl		v1.16b, {v1.16b}, v4.16b
269	encrypt_block	v0, w3, x2, x6, w7
270
271	eor		v1.16b, v1.16b, v0.16b
	/* first n bytes of the first ciphertext block become the short tail */
272	tbl		v0.16b, {v0.16b}, v3.16b
273	encrypt_block	v1, w3, x2, x6, w7
274
275	add		x4, x0, x4
	/* tail is stored first; the full block at [x0] then overwrites the overlap */
276	st1		{v0.16b}, [x4]			/* overlapping stores */
277	st1		{v1.16b}, [x0]
278	ret
279AES_FUNC_END(aes_cbc_cts_encrypt)
280
/*
 * CBC ciphertext-stealing decryption of the final two blocks.
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = iv
 * Uses the same permute vectors as the encrypt routine (v3 and v4, selected
 * by n = bytes - 16) together with overlapping 16-byte loads and stores.
 * The code as written assumes 16 < bytes <= 32 -- the caller is presumably
 * responsible for guaranteeing that; confirm at the call site.
 */
281AES_FUNC_START(aes_cbc_cts_decrypt)
282	adr_l		x8, .Lcts_permute_table
	/* x4 = n = bytes in the final block */
283	sub		x4, x4, #16
284	add		x9, x8, #32
285	add		x8, x8, x4
286	sub		x9, x9, x4
287	ld1		{v3.16b}, [x8]
288	ld1		{v4.16b}, [x9]
289
290	ld1		{v0.16b}, [x1], x4		/* overlapping loads */
291	ld1		{v1.16b}, [x1]
292
293	ld1		{v5.16b}, [x5]			/* get iv */
294	dec_prepare	w3, x2, x6
295
296	decrypt_block	v0, w3, x2, x6, w7
	/* v2 top n bytes = decrypted head XOR short ciphertext tail = final pt */
297	tbl		v2.16b, {v0.16b}, v3.16b
298	eor		v2.16b, v2.16b, v1.16b
299
	/*
	 * tbx leaves lanes with an out-of-range index untouched: the first n
	 * bytes of v0 are replaced by the short ciphertext tail, the rest keep
	 * the bytes recovered by the first decryption.
	 */
300	tbx		v0.16b, {v1.16b}, v4.16b
301	decrypt_block	v0, w3, x2, x6, w7
302	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
303
304	add		x4, x0, x4
	/* tail is stored first; the full block at [x0] then overwrites the overlap */
305	st1		{v2.16b}, [x4]			/* overlapping stores */
306	st1		{v0.16b}, [x0]
307	ret
308AES_FUNC_END(aes_cbc_cts_decrypt)
309
	/*
	 * 48-byte index table for tbl/tbx: sixteen 0xff bytes, the identity
	 * permutation 0..15, then sixteen 0xff bytes again. Reading 16 bytes
	 * at offset k (0 <= k <= 32) yields a permute vector that shifts a
	 * 16-byte register by (16 - k) byte positions; 0xff is out of range
	 * for a one-register table, so tbl zeroes those lanes and tbx leaves
	 * them unchanged. Used by the CTS, CTR-tail and XTS routines in this
	 * file.
	 */
310	.section	".rodata", "a"
311	.align		6
312.Lcts_permute_table:
313	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
314	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
315	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
316	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
317	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
318	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
319	.previous
320
321
322	/*
323	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
324	 *		   int bytes, u8 ctr[], u8 finalbuf[])
325	 */
326
/*
 * CTR mode encryption/decryption.
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = bytes, x5 = ctr,
 *   x6 = finalbuf (only written when the whole tail is a single block
 *        of fewer than 16 bytes)
 *
 * 'vctr' is a vector register alias supplied by the including file. It
 * holds the 16-byte big-endian counter block; its low 64 bits are mirrored
 * byte-swapped in x12 so they can be advanced with plain integer adds.
 *
 * Every pass prepares MAX_STRIDE keystream blocks in v0..v3/v4. The
 * counters are assigned from the end: the last register always gets
 * (new counter - 1), so that on a short final pass the unused registers
 * are the leading ones.
 */
AES_FUNC_START(aes_ctr_encrypt)
	/* frame record is needed because the 'bl' calls below overwrite x30 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	enc_prepare	w3, x2, x12
	ld1		{vctr.16b}, [x5]

	umov		x12, vctr.d[1]		/* keep swabbed ctr in reg */
	rev		x12, x12

.LctrloopNx:
	/*
	 * w7 = number of blocks handled in this pass:
	 *      min(ceil(bytes / 16), MAX_STRIDE)
	 * w4 = bytes left after a full stride (negative => this is the tail)
	 */
	add		w7, w4, #15
	sub		w4, w4, #MAX_STRIDE << 4
	lsr		w7, w7, #4
	mov		w8, #MAX_STRIDE
	cmp		w7, w8
	csel		w7, w7, w8, lt
	/* advance the low counter half; carry set => it wrapped */
	adds		x12, x12, x7

	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	bcs		0f

	/* rare carry path, kept out of line in a subsection */
	.subsection	1
	/* apply carry to outgoing counter */
0:	umov		x8, vctr.d[0]
	rev		x8, x8
	add		x8, x8, #1
	rev		x8, x8
	ins		vctr.d[0], x8

	/*
	 * apply carry to N counter blocks for N := x12
	 *
	 * x12 is the wrapped low counter half, which equals the number of
	 * trailing blocks of this pass whose counter lies past the wrap.
	 * Each entry below is two instructions (8 bytes), so branching to
	 * 1f - 8 * x12 patches exactly those blocks.
	 *
	 * If the counter wrapped to exactly zero, no block of this pass
	 * needs the carry. Bypass the computed branch in that case: it
	 * would otherwise target the plain 'b' at 1f, which is not a BTI
	 * landing pad and therefore faults when BTI is enforced.
	 */
	cbz		x12, 2f
	adr		x16, 1f
	sub		x16, x16, x12, lsl #3
	br		x16
	hint		34			// bti c
	mov		v0.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v1.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v2.d[0], vctr.d[0]
	hint		34			// bti c
	mov		v3.d[0], vctr.d[0]
ST5(	hint		34				)
ST5(	mov		v4.d[0], vctr.d[0]		)
1:	b		2f
	.previous

	/* store the advanced low half in vctr and in the per-block counters */
2:	rev		x7, x12
	ins		vctr.d[1], x7
	sub		x7, x12, #MAX_STRIDE - 1
	sub		x8, x12, #MAX_STRIDE - 2
	sub		x9, x12, #MAX_STRIDE - 3
	rev		x7, x7
	rev		x8, x8
	mov		v1.d[1], x7
	rev		x9, x9
ST5(	sub		x10, x12, #MAX_STRIDE - 4	)
	mov		v2.d[1], x8
ST5(	rev		x10, x10			)
	mov		v3.d[1], x9
ST5(	mov		v4.d[1], x10			)
	/* sign bit of w4 set => less than a full stride of input remains */
	tbnz		w4, #31, .Lctrtail
	ld1		{v5.16b-v7.16b}, [x1], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [x1], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [x1], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	cbz		w4, .Lctrout
	b		.LctrloopNx

.Lctrout:
	st1		{vctr.16b}, [x5]	/* return next CTR value */
	ldp		x29, x30, [sp], #16
	ret

.Lctrtail:
	/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
	/* x13 = bytes in the final block: (bytes & 15), or 16 if that is 0 */
	mov		x16, #16
	ands		x13, x4, #0xf
	csel		x13, x13, x16, ne

	/*
	 * x14/x15/x16 = post-increment (16 or 0) for each leading block. A
	 * zero increment makes the load/store of an absent block harmlessly
	 * alias the next one, which then overwrites it.
	 */
ST5(	cmp		w4, #64 - (MAX_STRIDE << 4)	)
ST5(	csel		x14, x16, xzr, gt		)
	cmp		w4, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		w4, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	/* 16 bytes or fewer left: single block path (flags are reused there) */
	cmp		w4, #16 - (MAX_STRIDE << 4)
	ble		.Lctrtail1x

	adr_l		x12, .Lcts_permute_table
	add		x12, x12, x13

ST5(	ld1		{v5.16b}, [x1], x14		)
	ld1		{v6.16b}, [x1], x15
	ld1		{v7.16b}, [x1], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	/* last full block, then the final 16 input bytes (overlapping) */
	ld1		{v8.16b}, [x1], x13
	ld1		{v9.16b}, [x1]
	ld1		{v10.16b}, [x12]

	/* the tbl shifts the last keystream block to line up with v9 */
ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [x0], x14		)
	st1		{v6.16b}, [x0], x15
	st1		{v7.16b}, [x0], x16
	add		x13, x13, x0
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [x0]
	b		.Lctrout

.Lctrtail1x:
	/* flags still come from the 'cmp' above: eq means exactly 16 bytes */
	csel		x0, x0, x6, eq		// use finalbuf if less than a full block
	ld1		{v5.16b}, [x1]
	/* the last counter block lives in v3 (4x build) or v4 (5x build) */
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, w3, x2, x8, w7
	eor		v5.16b, v5.16b, v3.16b
	st1		{v5.16b}, [x0]
	b		.Lctrout
AES_FUNC_END(aes_ctr_encrypt)
472
473
474	/*
475	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
476	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
477	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
478	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
479	 */
480
	/*
	 * next_tweak out, in, tmp
	 *
	 * Computes the next XTS tweak: \out = \in multiplied by x in
	 * GF(2^128), treating the register as two 64-bit lanes. Each lane is
	 * doubled with 'add', and the bit shifted out of one lane is folded
	 * into the other: sshr #63 turns each lane's top bit into an all-ones
	 * or all-zeroes mask, the 'and' with xtsmask keeps 1 (low lane) or
	 * 0x87 (high lane), and 'ext #8' swaps the lanes so that the low
	 * lane's carry lands in the high lane and the high lane's overflow
	 * applies the 0x87 reduction to the low lane.
	 * \tmp is clobbered. \out may be the same register as \in. xtsmask
	 * is a register alias supplied by the including file and must hold
	 * the constant built by xts_load_mask.
	 */
481	.macro		next_tweak, out, in, tmp
482	sshr		\tmp\().2d,  \in\().2d,   #63
483	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
484	add		\out\().2d,  \in\().2d,   \in\().2d
485	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
486	eor		\out\().16b, \out\().16b, \tmp\().16b
487	.endm
488
	/*
	 * xts_load_mask tmp
	 *
	 * Builds the constant used by next_tweak: after the uzp1 (which keeps
	 * the even-numbered 32-bit elements of both sources) xtsmask holds
	 * the 32-bit words { 0x1, 0x0, 0x87, 0x0 }, i.e. 1 in the low 64-bit
	 * lane and 0x87 in the high one. \tmp is clobbered.
	 */
489	.macro		xts_load_mask, tmp
490	movi		xtsmask.2s, #0x1
491	movi		\tmp\().2s, #0x87
492	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
493	.endm
494
/*
 * XTS encryption, including ciphertext stealing for a trailing partial
 * block.
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
 *   x6 = iv (tweak, updated on return), w7 = first
 * v4 carries the current tweak and v8 is the temporary for next_tweak.
 * When 'first' is non-zero the tweak read from iv[] is encrypted with rk2
 * before use; otherwise it is advanced with next_tweak.
 * xts_cts_skip_tw and xts_reload_mask are macros supplied by the including
 * file and their exact behaviour is not visible here -- NOTE(review):
 * confirm there under which condition xts_cts_skip_tw branches.
 */
495AES_FUNC_START(aes_xts_encrypt)
	/* frame record is needed because the 'bl' below overwrites x30 */
496	stp		x29, x30, [sp, #-16]!
497	mov		x29, sp
498
499	ld1		{v4.16b}, [x6]
500	xts_load_mask	v8
501	cbz		w7, .Lxtsencnotfirst
502
503	enc_prepare	w3, x5, x8
504	xts_cts_skip_tw	w7, .LxtsencNx
505	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	/* continue with the data key rk1 */
506	enc_switch_key	w3, x2, x8
507	b		.LxtsencNx
508
509.Lxtsencnotfirst:
510	enc_prepare	w3, x2, x8
511.LxtsencloopNx:
512	next_tweak	v4, v4, v8
513.LxtsencNx:
	/* 4-block fast path while at least 64 bytes remain */
514	subs		w4, w4, #64
515	bmi		.Lxtsenc1x
516	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	/* tweaks for blocks 0-3 are v4-v7; XOR before and after the cipher */
517	next_tweak	v5, v4, v8
518	eor		v0.16b, v0.16b, v4.16b
519	next_tweak	v6, v5, v8
520	eor		v1.16b, v1.16b, v5.16b
521	eor		v2.16b, v2.16b, v6.16b
522	next_tweak	v7, v6, v8
523	eor		v3.16b, v3.16b, v7.16b
524	bl		aes_encrypt_block4x
525	eor		v3.16b, v3.16b, v7.16b
526	eor		v0.16b, v0.16b, v4.16b
527	eor		v1.16b, v1.16b, v5.16b
528	eor		v2.16b, v2.16b, v6.16b
529	st1		{v0.16b-v3.16b}, [x0], #64
	/* the last tweak used becomes the running tweak */
530	mov		v4.16b, v7.16b
531	cbz		w4, .Lxtsencret
532	xts_reload_mask	v8
533	b		.LxtsencloopNx
534.Lxtsenc1x:
	/* undo the speculative subtraction; w4 = bytes left (< 64) */
535	adds		w4, w4, #64
536	beq		.Lxtsencout
537	subs		w4, w4, #16
	/* fewer than 16 bytes left: steal from the last block of the 4x pass */
538	bmi		.LxtsencctsNx
539.Lxtsencloop:
540	ld1		{v0.16b}, [x1], #16
541.Lxtsencctsout:
542	eor		v0.16b, v0.16b, v4.16b
543	encrypt_block	v0, w3, x2, x8, w7
544	eor		v0.16b, v0.16b, v4.16b
545	cbz		w4, .Lxtsencout
546	subs		w4, w4, #16
547	next_tweak	v4, v4, v8
	/* partial block follows: keep v0 in a register for stealing */
548	bmi		.Lxtsenccts
549	st1		{v0.16b}, [x0], #16
550	b		.Lxtsencloop
551.Lxtsencout:
552	st1		{v0.16b}, [x0]
553.Lxtsencret:
	/* return the current tweak */
554	st1		{v4.16b}, [x6]
555	ldp		x29, x30, [sp], #16
556	ret
557
558.LxtsencctsNx:
	/* v3 is the last block already written by the 4x pass; revisit its slot */
559	mov		v0.16b, v3.16b
560	sub		x0, x0, #16
561.Lxtsenccts:
562	adr_l		x8, .Lcts_permute_table
563
	/* w4 is negative here: (bytes in final block) - 16 */
564	add		x1, x1, w4, sxtw	/* rewind input pointer */
565	add		w4, w4, #16		/* # bytes in final block */
566	add		x9, x8, #32
567	add		x8, x8, x4
568	sub		x9, x9, x4
569	add		x4, x0, x4		/* output address of final block */
570
571	ld1		{v1.16b}, [x1]		/* load final block */
572	ld1		{v2.16b}, [x8]
573	ld1		{v3.16b}, [x9]
574
	/* v2 = head of the previous ciphertext block, emitted as the short tail */
575	tbl		v2.16b, {v0.16b}, v2.16b
	/* v0 = final partial plaintext, padded with the rest of that ciphertext */
576	tbx		v0.16b, {v1.16b}, v3.16b
577	st1		{v2.16b}, [x4]			/* overlapping stores */
	/* w4 = 0 makes the shared code store v0 at [x0] and finish */
578	mov		w4, wzr
579	b		.Lxtsencctsout
580AES_FUNC_END(aes_xts_encrypt)
581
/*
 * XTS decryption, including ciphertext stealing for a trailing partial
 * block.
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = bytes, x5 = rk2,
 *   x6 = iv (tweak, updated on return), w7 = first
 * v4 carries the current tweak and v8 is the temporary for next_tweak.
 * When the length is not a multiple of 16, the last full block is held
 * back (16 is subtracted from the byte count up front) because it has to
 * be decrypted with the tweak that follows the one used for the final
 * partial block.
 * xts_cts_skip_tw and xts_reload_mask are macros supplied by the including
 * file and their exact behaviour is not visible here -- NOTE(review):
 * confirm there under which condition xts_cts_skip_tw branches.
 */
582AES_FUNC_START(aes_xts_decrypt)
	/* frame record is needed because the 'bl' below overwrites x30 */
583	stp		x29, x30, [sp, #-16]!
584	mov		x29, sp
585
586	/* subtract 16 bytes if we are doing CTS */
587	sub		w8, w4, #0x10
588	tst		w4, #0xf
589	csel		w4, w4, w8, eq
590
591	ld1		{v4.16b}, [x6]
592	xts_load_mask	v8
593	xts_cts_skip_tw	w7, .Lxtsdecskiptw
594	cbz		w7, .Lxtsdecnotfirst
595
	/* the tweak is always produced with the encryption direction of rk2 */
596	enc_prepare	w3, x5, x8
597	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
598.Lxtsdecskiptw:
599	dec_prepare	w3, x2, x8
600	b		.LxtsdecNx
601
602.Lxtsdecnotfirst:
603	dec_prepare	w3, x2, x8
604.LxtsdecloopNx:
605	next_tweak	v4, v4, v8
606.LxtsdecNx:
	/* 4-block fast path while at least 64 bytes remain */
607	subs		w4, w4, #64
608	bmi		.Lxtsdec1x
609	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	/* tweaks for blocks 0-3 are v4-v7; XOR before and after the cipher */
610	next_tweak	v5, v4, v8
611	eor		v0.16b, v0.16b, v4.16b
612	next_tweak	v6, v5, v8
613	eor		v1.16b, v1.16b, v5.16b
614	eor		v2.16b, v2.16b, v6.16b
615	next_tweak	v7, v6, v8
616	eor		v3.16b, v3.16b, v7.16b
617	bl		aes_decrypt_block4x
618	eor		v3.16b, v3.16b, v7.16b
619	eor		v0.16b, v0.16b, v4.16b
620	eor		v1.16b, v1.16b, v5.16b
621	eor		v2.16b, v2.16b, v6.16b
622	st1		{v0.16b-v3.16b}, [x0], #64
	/* the last tweak used becomes the running tweak */
623	mov		v4.16b, v7.16b
624	cbz		w4, .Lxtsdecout
625	xts_reload_mask	v8
626	b		.LxtsdecloopNx
627.Lxtsdec1x:
	/* undo the speculative subtraction; w4 = bytes left (< 64) */
628	adds		w4, w4, #64
629	beq		.Lxtsdecout
630	subs		w4, w4, #16
631.Lxtsdecloop:
632	ld1		{v0.16b}, [x1], #16
	/* ld1 does not touch the flags: this tests the preceding 'subs' */
633	bmi		.Lxtsdeccts
634.Lxtsdecctsout:
635	eor		v0.16b, v0.16b, v4.16b
636	decrypt_block	v0, w3, x2, x8, w7
637	eor		v0.16b, v0.16b, v4.16b
638	st1		{v0.16b}, [x0], #16
639	cbz		w4, .Lxtsdecout
640	subs		w4, w4, #16
641	next_tweak	v4, v4, v8
642	b		.Lxtsdecloop
643.Lxtsdecout:
	/* return the current tweak */
644	st1		{v4.16b}, [x6]
645	ldp		x29, x30, [sp], #16
646	ret
647
648.Lxtsdeccts:
649	adr_l		x8, .Lcts_permute_table
650
	/* w4 is negative here: (bytes in final block) - 16 */
651	add		x1, x1, w4, sxtw	/* rewind input pointer */
652	add		w4, w4, #16		/* # bytes in final block */
653	add		x9, x8, #32
654	add		x8, x8, x4
655	sub		x9, x9, x4
656	add		x4, x0, x4		/* output address of final block */
657
	/* the held-back full block uses the tweak after the current one */
658	next_tweak	v5, v4, v8
659
660	ld1		{v1.16b}, [x1]		/* load final block */
661	ld1		{v2.16b}, [x8]
662	ld1		{v3.16b}, [x9]
663
664	eor		v0.16b, v0.16b, v5.16b
665	decrypt_block	v0, w3, x2, x8, w7
666	eor		v0.16b, v0.16b, v5.16b
667
	/* v2 = head of the decrypted block, emitted as the short plaintext tail */
668	tbl		v2.16b, {v0.16b}, v2.16b
	/* v0 = final partial ciphertext, padded with the rest of that block */
669	tbx		v0.16b, {v1.16b}, v3.16b
670
671	st1		{v2.16b}, [x4]			/* overlapping stores */
	/* w4 = 0 makes the shared code decrypt v0 with tweak v4 and finish */
672	mov		w4, wzr
673	b		.Lxtsdecctsout
674AES_FUNC_END(aes_xts_decrypt)
675
676	/*
677	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
678	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
679	 */
/*
 * CBC-MAC style digest update.
 *   x0 = in, x1 = rk, w2 = rounds, w3 = blocks, x4 = dg,
 *   w5 = enc_before, w6 = enc_after
 * Returns in w0 the number of blocks that were NOT consumed, which is
 * non-zero only when the loop is left early through cond_yield.
 * v0 holds the running digest. Each input block is XORed into it and the
 * result encrypted, except that the encryption following the very last
 * block is performed only if enc_after is non-zero. If enc_before is
 * non-zero the digest is encrypted once before any input is absorbed.
 * cond_yield is a macro defined outside this file -- presumably it
 * branches to the given label when the CPU should be yielded, with x7 as
 * scratch; confirm against its definition.
 */
680AES_FUNC_START(aes_mac_update)
681	ld1		{v0.16b}, [x4]			/* get dg */
682	enc_prepare	w2, x1, x7
683	cbz		w5, .Lmacloop4x
684
685	encrypt_block	v0, w2, x1, x7, w8
686
687.Lmacloop4x:
688	subs		w3, w3, #4
689	bmi		.Lmac1x
690	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next 4 pt blocks */
691	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
692	encrypt_block	v0, w2, x1, x7, w8
693	eor		v0.16b, v0.16b, v2.16b
694	encrypt_block	v0, w2, x1, x7, w8
695	eor		v0.16b, v0.16b, v3.16b
696	encrypt_block	v0, w2, x1, x7, w8
697	eor		v0.16b, v0.16b, v4.16b
	/* x5 = enc_after if these were the last blocks, else all-ones (encrypt) */
698	cmp		w3, wzr
699	csinv		x5, x6, xzr, eq
700	cbz		w5, .Lmacout
701	encrypt_block	v0, w2, x1, x7, w8
	/* dg is stored before the possible yield so no progress is lost */
702	st1		{v0.16b}, [x4]			/* return dg */
703	cond_yield	.Lmacout, x7
704	b		.Lmacloop4x
705.Lmac1x:
	/* undo the speculative subtraction; w3 = leftover blocks (may be 0) */
706	add		w3, w3, #4
707.Lmacloop:
708	cbz		w3, .Lmacout
709	ld1		{v1.16b}, [x0], #16		/* get next pt block */
710	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
711
	/* x5 = enc_after if this was the last block, else all-ones (encrypt) */
712	subs		w3, w3, #1
713	csinv		x5, x6, xzr, eq
714	cbz		w5, .Lmacout
715
716.Lmacenc:
717	encrypt_block	v0, w2, x1, x7, w8
718	b		.Lmacloop
719
720.Lmacout:
721	st1		{v0.16b}, [x4]			/* return dg */
	/* report the number of blocks left unprocessed */
722	mov		w0, w3
723	ret
724AES_FUNC_END(aes_mac_update)
725