xref: /openbmc/linux/arch/arm/crypto/chacha-scalar-core.S (revision 8be98d2f2a0a262f8bf8a0bc1fdf522b3c7aab17)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */
529621d09SArd Biesheuvel
629621d09SArd Biesheuvel#include <linux/linkage.h>
729621d09SArd Biesheuvel#include <asm/assembler.h>
829621d09SArd Biesheuvel
/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
2729621d09SArd Biesheuvel
/*
 * Register allocation for the 16-word ChaCha state.
 *
 * x0-x7 and x12-x15 each own a register for the whole permutation.  r8 and r9
 * are time-shared: at any moment they hold either (x8, x9) or (x10, x11),
 * and the pair that is not live sits in a stack slot.  Note that x15 lives in
 * r14 (lr), so lr must be saved before any of this is used.
 */
	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14
4329621d09SArd Biesheuvel
/*
 * _le32_bswap_4x a, b, c, d, tmp
 *
 * On big-endian builds (__ARMEB__ defined), apply rev_l to each of the four
 * registers \a..\d in place, passing \tmp as the scratch register.  On
 * little-endian builds this expands to nothing.
 *
 * rev_l is not defined in this file (presumably it comes from
 * <asm/assembler.h> and byte-reverses a 32-bit word -- confirm there).
 * Callers must treat \tmp as clobbered.
 */
.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm
5229621d09SArd Biesheuvel
/*
 * __ldrd a, b, src, offset
 *
 * Load two consecutive 32-bit words: \a = [\src + \offset] and
 * \b = [\src + \offset + 4].  A single 'ldrd' is used when
 * __LINUX_ARM_ARCH__ >= 6; older targets get two 'ldr' instructions.
 *
 * Every caller in this file passes an even/odd consecutive register pair
 * (r0/r1, r2/r3, ..., r10/r11), which is what the A32 encoding of 'ldrd'
 * requires.  \offset must be an assemble-time constant.
 */
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm
6129621d09SArd Biesheuvel
/*
 * __strd a, b, dst, offset
 *
 * Store two consecutive 32-bit words: [\dst + \offset] = \a and
 * [\dst + \offset + 4] = \b.  A single 'strd' is used when
 * __LINUX_ARM_ARCH__ >= 6; older targets get two 'str' instructions.
 *
 * Every caller in this file passes an even/odd consecutive register pair,
 * which is what the A32 encoding of 'strd' requires.  \offset must be an
 * assemble-time constant.
 */
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm
7029621d09SArd Biesheuvel
/*
 * _halfround a1, b1, c1, d1,  a2, b2, c2, d2
 *
 * Run two ChaCha quarterrounds, (a1,b1,c1,d1) and (a2,b2,c2,d2), with their
 * instructions interleaved pairwise.
 *
 * Rotations are never executed as separate instructions.  Each 'eor' stores
 * its result un-rotated, and the next consumer applies the pending rotation
 * through its shifted-register operand (rol by n == ror by 32 - n).
 *
 * On entry:  the b registers need 'ror #brot' and the d registers need
 *            'ror #drot' to yield their true values.  brot and drot are
 *            assembler symbols read at expansion time.
 * On exit:   the b registers need 'ror #25' and the d registers 'ror #24'.
 *            This macro does NOT update brot/drot itself; the caller must
 *            '.set' them once all registers of a row share the new amount.
 */
.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm
10129621d09SArd Biesheuvel
/*
 * _doubleround
 *
 * One ChaCha double round: a column round followed by a diagonal round, each
 * made of two _halfround expansions.
 *
 * r8/r9 (X8_X10/X9_X11) are shared between (x8, x9) and (x10, x11); the
 * inactive pair is kept in a fixed stack slot:
 *	[sp, #0]  (x8, x9)
 *	[sp, #8]  (x10, x11)
 *
 * On entry:  r8/r9 hold (x8, x9); [sp, #8] holds (x10, x11); rows b and d
 *            carry the pending rotations (brot, drot).
 * On exit:   r8/r9 hold (x8, x9) again; [sp, #8] holds the updated
 *            (x10, x11); (brot, drot) == (25, 24).
 */
.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	// Both column halfrounds consumed the entry rotations; from here on
	// every b and d register carries the post-halfround amounts.
	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm
13129621d09SArd Biesheuvel
/*
 * _chacha_permute nrounds
 *
 * Apply \nrounds rounds of the ChaCha permutation, fully unrolled as
 * \nrounds / 2 expansions of _doubleround.
 *
 * On entry:  registers hold the true (un-rotated) values of x0-x9 and
 *            x12-x15; [sp, #8] holds (x10, x11); [sp, #0] is scratch.
 * On exit:   x4-x7 still need 'ror #brot' and x12-x15 still need
 *            'ror #drot'; the caller applies these when it consumes them.
 *            (x10, x11) are at [sp, #8].
 */
.macro _chacha_permute	nrounds
	// State arrives un-rotated, so no rotation is pending yet.
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm
13929621d09SArd Biesheuvel
/*
 * _chacha nrounds
 *
 * Main encryption loop: generate keystream blocks with \nrounds rounds each
 * and XOR them into the data until LEN bytes have been processed.
 *
 * On entry (and at every .Lnext_block iteration):
 *	Stack:     unused0-unused1 x10-x11 x0-x15 OUT IN LEN
 *	Registers: x0-x9 and x12-x15, un-rotated.
 * On exit (.Ldone):
 *	Stack:     x0-x15 OUT IN LEN   (sp points at the saved x0)
 *
 * Each 64-byte block takes the fast path when at least 64 bytes remain and
 * both IN and OUT are 4-byte aligned.  Otherwise the keystream block is
 * written to a temporary stack buffer and XOR-ed word- or byte-wise.
 *
 * LEN is an unsigned byte count, so every comparison on it uses the unsigned
 * condition codes (lo/ls/hi).  With signed conditions a length of 2^31 or
 * more would look negative and the loop would stop after at most 3 bytes.
 *
 * Labels carry the \@ suffix so that the macro can be expanded more than once
 * in the same file.
 */
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	// LEN is unsigned, hence 'lo' rather than 'lt'.
	cmp		r11, #64
	blo		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48		// r8 = orig_x12 (block counter)
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	  ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	  subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	// LEN was >= 64 on this path, so the only way to be finished is an
	// exact zero.
	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN
	// Registers: r8 is the block counter (orig_x12), r9 is the updated
	// LEN, r12 is IN, r14 is OUT.

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	  mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	  sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	  add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	  mov		r0, sp
	// r1 = min(LEN, 64), compared as unsigned values.
	movls		r1, r9
	movhi		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	// r1 went negative; its low two bits are the 0-3 bytes still to do.
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	// Continue only if more than 64 bytes were left before this block
	// (unsigned compare; 'add' below leaves the flags from 'subs' intact).
	subs		r9, #64
	add		sp, #96
	bhi		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha
36729621d09SArd Biesheuvel
/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 *
 * XOR 'bytes' bytes from 'src' with ChaCha keystream into 'dst'.
 *
 * In:   r0 = dst, r1 = src, r2 = bytes, r3 = state (16 words),
 *       [sp] = nrounds (fifth argument, passed on the stack).
 * Out:  nothing; returns immediately when bytes == 0.
 *
 * nrounds == 12 selects the 12-round variant; any other value runs 20
 * rounds.  The words at 'state' are only read here.
 *
 * Frame built below, from the lowest address (this is the layout _chacha
 * expects on entry):
 *	sp +  0   unused0-unused1 (scratch slot for x8-x9)
 *	sp +  8   x10-x11 (working copy)
 *	sp + 16   x0-x15  (original state, added back after the permutation)
 *	sp + 80   OUT, IN, LEN (saved r0-r2)
 *	sp + 92   saved r4-r11, lr
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]		// ip = nrounds
	// These flags are consumed by 'beq 1f' below; nothing in between
	// sets flags.
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40	// x10-x11
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}		// x0-x9 (overwrites r3)
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f			// nrounds == 12?
	_chacha		20

	// _chacha leaves sp at the saved x0: skip x0-x15 (64 bytes) plus
	// OUT, IN, LEN (12 bytes), then restore the callee-saved registers.
0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)
40829621d09SArd Biesheuvel
/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 *
 * Run the ChaCha permutation over 'state' and write words x0-x3 and x12-x15
 * of the result to 'out'.  The original state is not added back.
 *
 * In:   r0 = state, r1 = out, r2 = nrounds.
 * Out:  out[0..3] = x0-x3, out[4..7] = x12-x15.
 *
 * nrounds == 12 selects the 12-round variant; any other value runs 20
 * rounds.
 *
 * Frame while permuting, from the lowest address:
 *	unused0-unused1 (scratch slot for x8-x9), x10-x11, out,
 *	saved r4-r11, lr
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	// These flags are consumed by 'beq 1f' below; nothing in between
	// sets flags.
	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	// ('drot' ends with the same value after either round count, so the
	// 12-round path at 1: can share this code.)
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	  pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)
444