/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8
	.align		3

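	/*
	 * A single AES round per macro: AESE performs AddRoundKey,
	 * SubBytes and ShiftRows, and AESMC performs MixColumns (AESD
	 * and AESIMC are the inverse-cipher equivalents). The final
	 * round omits the MixColumns step, hence the separate _fround
	 * macros below, which end with a veor of the last round key.
	 */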
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		enc_dround_4x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	enc_round	q3, \key2
	.endm

	.macro		dec_dround_4x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	dec_round	q3, \key2
	.endm

	.macro		enc_fround_4x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q3, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	aese.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

	.macro		dec_fround_4x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q3, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	aesd.8		q3, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	veor		q3, q3, \key3
	.endm

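	/*
	 * Run all rounds for one (or four) blocks: the first two round
	 * keys are expected in q8/q9 and the final one in q14 (set up
	 * by prepare_key below), while the remaining keys are loaded
	 * into q10-q13 on the fly, interleaved with the rounds to hide
	 * the load latency. The round count in r3 (10, 12 or 14)
	 * selects between AES-128, AES-192 and AES-256.
	 */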
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.32		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\fround		q10, q11, q14
	bx		lr
	.endm

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These preserve all registers except q0 - q3 (the data
	 * blocks), q10 - q13 (scratch space for round keys) and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)

	.align		6
aes_decrypt_4x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)

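	/*
	 * The key array holds \rounds + 1 round keys of 16 bytes each,
	 * so the final round key lives at offset \rounds * 16.
	 */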
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
	.endm

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop4x:
	subs		r4, r4, #4
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_encrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbencloop4x
.Lecbenc1x:
	adds		r4, r4, #4
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop4x:
	subs		r4, r4, #4
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	bl		aes_decrypt_4x
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lecbdecloop4x
.Lecbdec1x:
	adds		r4, r4, #4
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	prepare_key	r2, r3
.Lcbcdecloop4x:
	subs		r4, r4, #4
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vmov		q4, q0
	vmov		q5, q1
	vmov		q6, q2
	vmov		q7, q3
	bl		aes_decrypt_4x
	veor		q0, q0, q15
	veor		q1, q1, q4
	veor		q2, q2, q5
	veor		q3, q3, q6
	vmov		q15, q7
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	b		.Lcbcdecloop4x
.Lcbcdec1x:
	adds		r4, r4, #4
	beq		.Lcbcdecout
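	@ aes_decrypt ends with a veor of the final round key (q14), so
	@ xor'ing the previous ciphertext block into q14 up front gets
	@ the CBC xor with the decrypted output for free.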
	vmov		q6, q14			@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q15, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q15}, [r5]		@ write back iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)


	/*
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 */

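	/*
	 * Ciphertext stealing for a final pair of blocks of which the
	 * second may be partial. The partial block is loaded and stored
	 * overlapping the preceding full block, and .Lcts_permute_table,
	 * indexed by the number of trailing bytes, supplies the VTBL/VTBX
	 * masks that swap and shift the data into place.
	 */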
ENTRY(ce_aes_cbc_cts_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q3}, [ip]

	vld1.8		{q1}, [r5]			@ get iv
	prepare_key	r2, r3

	veor		q0, q0, q1			@ xor with iv
	bl		aes_encrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13

	veor		q0, q0, q1
	bl		aes_encrypt

	add		r4, r0, r4
	vst1.8		{q2}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_encrypt)

ENTRY(ce_aes_cbc_cts_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]

	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	sub		r4, r4, #16
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	vld1.8		{q5}, [ip]
	vld1.8		{q6}, [lr]

	add		ip, r1, r4
	vld1.8		{q0}, [r1]			@ overlapping loads
	vld1.8		{q1}, [ip]

	vld1.8		{q3}, [r5]			@ get iv
	prepare_key	r2, r3

	bl		aes_decrypt

	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13

	veor		q1, q1, q2
	bl		aes_decrypt
	veor		q0, q0, q3			@ xor with iv

	add		r4, r0, r4
	vst1.8		{q1}, [r4]			@ overlapping stores
	vst1.8		{q0}, [r0]

	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_cts_decrypt)


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s31			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
.Lctrloop4x:
	subs		r4, r4, #4
	bmi		.Lctr1x

	/*
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	 */
	add		ip, r6, #1
	vmov		q0, q7
	rev		ip, ip
	add		lr, r6, #2
	vmov		s31, ip			@ set lane 3 of q1 via q7
	add		ip, r6, #3
	rev		lr, lr
	vmov		q1, q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	rev		ip, ip
	vmov		q2, q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	add		r6, r6, #4
	vmov		q3, q7

	vld1.8		{q4-q5}, [r1]!
	vld1.8		{q6}, [r1]!
	vld1.8		{q15}, [r1]!
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q15
	rev		ip, r6
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		s31, ip
	b		.Lctrloop4x
.Lctr1x:
	adds		r4, r4, #4
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q7
	bl		aes_encrypt

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s31, ip
	bcs		.Lctrcarry

.Lctrcarrydone:
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q7}, [r5]		@ return next CTR value
	pop		{r4-r6, pc}

.Lctrtailblock:
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	b		.Lctrout

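	@ The counter is big endian, so carrying out of the low 32-bit
	@ word means byte swapping each more significant word in turn,
	@ incrementing it and swapping it back, until the carry stops
	@ propagating.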
.Lctrcarry:
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		.Lctrcarrydone
	.endr
	b		.Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		   int bytes, u8 iv[], u32 const rk2[], int first)
	 */

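	/*
	 * Compute the next XTS tweak by multiplying the current one by x
	 * in GF(2^128): shift the 128-bit tweak left by one bit and, if
	 * a bit shifted out at the top, xor the polynomial 0x87 into the
	 * low byte. \const is the mask vector composed by ce_aes_xts_init
	 * (0x1 in the low 64-bit lane, 0x87 in the high one): the
	 * arithmetic shift extracts each lane's sign bit, and the VEXT
	 * swap routes the low lane's carry into the high lane and the
	 * reduction constant back into the low one.
	 */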
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm

ce_aes_xts_init:
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vmovl.u32	q15, d30
	vshr.u64	d30, d31, #7

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc4x

.Lxtsencloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsenc4x:
	subs		r4, r4, #64
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_encrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsencret
	b		.Lxtsencloop4x
.Lxtsenc1x:
	adds		r4, r4, #64
	beq		.Lxtsencout
	subs		r4, r4, #16
	bmi		.LxtsencctsNx
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
.Lxtsencctsout:
	veor		q0, q0, q4
	bl		aes_encrypt
	veor		q0, q0, q4
	teq		r4, #0
	beq		.Lxtsencout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	bmi		.Lxtsenccts
	vst1.8		{q0}, [r0]!
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q0}, [r0]
.Lxtsencret:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.LxtsencctsNx:
	vmov		q0, q3
	sub		r0, r0, #16
.Lxtsenccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)


ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q4, q0

	/* subtract 16 bytes if we are doing CTS */
	tst		r4, #0xf
	subne		r4, r4, #0x10

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec4x

.Lxtsdecloop4x:
	next_tweak	q4, q4, q15, q10
.Lxtsdec4x:
	subs		r4, r4, #64
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	veor		q0, q0, q4
	next_tweak	q6, q5, q15, q10
	veor		q1, q1, q5
	next_tweak	q7, q6, q15, q10
	veor		q2, q2, q6
	veor		q3, q3, q7
	bl		aes_decrypt_4x
	veor		q0, q0, q4
	veor		q1, q1, q5
	veor		q2, q2, q6
	veor		q3, q3, q7
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	vmov		q4, q7
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop4x
.Lxtsdec1x:
	adds		r4, r4, #64
	beq		.Lxtsdecout
	subs		r4, r4, #16
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	veor		q0, q0, q4
	bl		aes_decrypt
	veor		q0, q0, q4
	vst1.8		{q0}, [r0]!
	teq		r4, #0
	beq		.Lxtsdecout
	subs		r4, r4, #16
	next_tweak	q4, q4, q15, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q4}, [r5]
	pop		{r4-r6, pc}

.Lxtsdeccts:
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table

	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		lr, ip, #32
	add		ip, ip, r4
	sub		lr, lr, r4
	add		r4, r0, r4		@ output address of final block

	next_tweak	q5, q4, q15, q6

	vld1.8		{q1}, [r1]		@ load final partial block
	vld1.8		{q2}, [ip]
	vld1.8		{q3}, [lr]

	veor		q0, q0, q5
	bl		aes_decrypt
	veor		q0, q0, q5

	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7

	vst1.8		{q2}, [r4]		@ overlapping stores
	mov		r4, #0
	b		.Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
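	@ AESE with an all-zero state turns AddRoundKey into a plain copy
	@ of the key, and broadcasting the input word to all four columns
	@ makes the ShiftRows permutation a no-op, so only the SubBytes
	@ substitution takes effect.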
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.32		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.32		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)

	.section	".rodata", "a"
	.align		6
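	/*
	 * Permute masks for the CTS routines: VTBL yields zero (and VTBX
	 * leaves the destination byte unchanged) for the 0xff entries,
	 * while 0x0 - 0xf select bytes of the source register, so a
	 * 16-byte window loaded at an offset derived from the partial
	 * block length produces the required shift mask.
	 */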
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
714