#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size of an ARM one,
@	they are not as diverse: e.g., there are only two arithmetic
@	instructions with 3 arguments, no [fixed] rotate, and addressing
@	modes are limited. As a result it takes more instructions to do
@	the same job in Thumb, so the code is never half the size and
@	is always slower.
@ [***]	which is also ~35% better than compiler-generated code. A dual-
@	issue Cortex A8 core was measured to process an input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13%
@ improvement on a Cortex A8 core, or in absolute terms ~870 cycles
@ per input block [13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a
@ 10% improvement on a Cortex A8 core and 12.2 cycles per byte.
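
@ Calling convention (a sketch inferred from the code below; the C
@ declaration in callers should be equivalent to):
@
@	void sha1_block_data_order(u32 *state, const u8 *data, int blocks);
@
@ r0 points at the five-word state {h0..h4}, r1 at the input, and r2
@ holds the number of 64-byte blocks; r2 is turned into an end pointer
@ (r1+r2*64) on entry. Working registers throughout: r3=A, r4=B, r5=C,
@ r6=D, r7=E; r8 holds the round constant K, r9-r12 are scratch, and
@ r14 walks the X[] frame on the stack.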
.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
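@ Note: C, D and E enter the loop in a rotated representation (the
@ ror#30 moves above) so that the ROR(B,30) every SHA-1 round needs is
@ absorbed by the ror#2 operand shifts below rather than costing extra
@ instructions; .L_done compensates when folding the state back in.
@ In rounds 0..15 the message is also loaded: pre-ARMv7 builds gather
@ each big-endian word from four ldrb's (safe for unaligned input),
@ while ARMv7 uses a word load plus rev on little-endian.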
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	teq	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
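@ Rounds 16..19: from here on X[i] comes from the message schedule
@ rather than from memory. With r14 stepping down the X[] frame, the
@ loads at #15*4, #13*4, #7*4 and #2*4 fetch what amounts to X[i-16],
@ X[i-14], X[i-8] and X[i-3] (indices are a sketch read off the
@ offsets), and ror#31 (i.e. rotate left by one) completes the
@ standard SHA-1 expansion:
@
@	X[i] = ROL(X[i-16] ^ X[i-14] ^ X[i-8] ^ X[i-3], 1)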
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r3,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r7,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r6,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r5,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	ldr	r8,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
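@ Rounds 20..39 and 60..79 share this loop body: both use the parity
@ function F = B^C^D and differ only in the K constant. The carry flag
@ encodes which pass is running (the cmn above clears it for 20_39,
@ the cmp before the second entry sets it for 60_79), teq at the
@ bottom leaves it untouched, and bcs exits after the 60_79 pass.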
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor r10,r4,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor r10,r3,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor r10,r7,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor r10,r6,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor r10,r5,r10,ror#2					@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
	teq	r14,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
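@ Rounds 40..59 compute the majority function as
@
@	F_40_59(B,C,D) = (B & (C ^ D)) + (C & D)
@
@ The two terms have no bits in common (the first is non-zero only
@ where C != D, the second only where C = D = 1), so the add stands in
@ for the OR of the textbook form (B&C)|(B&D)|(C&D).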
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r4,r10,ror#2					@ F_xx_xx
	and r11,r5,r6					@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r3,r10,ror#2					@ F_xx_xx
	and r11,r4,r5					@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r7,r10,ror#2					@ F_xx_xx
	and r11,r3,r4					@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r6,r10,ror#2					@ F_xx_xx
	and r11,r7,r3					@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and r10,r5,r10,ror#2					@ F_xx_xx
	and r11,r6,r7					@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	teq	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
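@ Fold this block into the state. A and B are back in normal form
@ after 80 rounds, while C, D and E still carry the rotation applied
@ at .Lloop, hence the ror#2 on the last three adds.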
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.align	2
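@ The four SHA-1 round constants, floor(2^30*sqrt(n)) for n=2,3,5,10: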
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2