/* xref: /openbmc/linux/arch/arm/crypto/sha1-armv7-neon.S (revision 75bf465f0bc33e9b776a46d6a1b9b990f5fb7c37) */
1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
3  *
4  * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
5  */
6 
7 #include <linux/linkage.h>
8 #include <asm/assembler.h>
9 
10 .syntax unified
11 .fpu neon
12 
13 .text
14 
15 
/* Context structure: byte offsets of the five 32-bit SHA-1 chaining
 * variables (h0..h4) within the state buffer passed in r0. */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

/* SHA-1 round constants (one per group of 20 rounds). */
#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
/* Each constant broadcast into all four 32-bit lanes of a q register,
 * so W+K additions can be done four words at a time with NEON. */
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4
37 
38 
/* Register macros */

#define RSTATE r0	/* pointer to the SHA-1 state (chaining variables) */
#define RDATA r1	/* pointer to the input data */
#define RNBLKS r2	/* number of 64-byte blocks left to process */
#define ROLDSTACK r3	/* caller's stack pointer, saved across realignment */
#define RWK lr		/* pointer into the W[]+K ring buffer on the stack */

/* Scalar working variables a..e of the SHA-1 compression function. */
#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

/* Scratch registers for the scalar rounds. */
#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

/* NEON registers holding eight 4-word chunks of the message schedule
 * (note: the mapping is deliberately not in q-register order; the
 * registers are only ever referenced through these names). */
#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

/* NEON temporaries used by the schedule precalculation. */
#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

/* Broadcast round constants, loaded from .LK_VEC. */
#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

/* ARM_LE(): emit the enclosed code only on little-endian kernels;
 * used for the byte swaps of the big-endian input words. */
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)		code
#endif
82 
/* Round function macros.
 *
 * Each _R_Fx(a,b,c,d,e,i,...) performs one scalar SHA-1 round:
 *   e += rol(a, 5) + Fx(b, c, d) + (W[i] + K);  b = rol(b, 30);
 * with W[i]+K read from the 16-entry ring buffer on the stack.
 * The pre1/pre2/pre3 arguments are NEON message-schedule steps (for
 * the words of round i16) interleaved between the scalar
 * instructions so scalar and NEON pipelines run in parallel. */

/* Byte offset of entry i in the 16-word W[]+K stack ring buffer. */
#define WK_offs(i) (((i) & 15) * 4)

/* F1(b,c,d) = (b & c) | (~b & d)  ("Ch", rounds 0-19), computed as
 * the sum of the disjoint terms (c & b) and (d & ~b). */
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/* F2(b,c,d) = b ^ c ^ d  ("parity", rounds 20-39). */
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0; \

/* F3(b,c,d) = (b & c) | (d & (b ^ c))  ("Maj", rounds 40-59),
 * computed as the sum of (b & c) and (d & (b ^ c)). */
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

/* F4 (rounds 60-79) is the same parity function as F2. */
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* One round with explicit precalc steps and schedule registers. */
#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* One round with no interleaved precalc; the i16/W arguments are
 * consumed only by dummy() and therefore need not be defined. */
#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* Swallows its arguments; fills unused precalc slots. */
#define dummy(...)
144 
145 
/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

/* Load the 64-byte input block into W0,W7,W6,W5, byte-swap the
 * big-endian input words on little-endian CPUs, add curK and store
 * the four W+K vectors to the stack ring buffer.  Used once before
 * the main loop; the WPRECALC_00_15_x single-step variants below do
 * the same work interleaved with rounds 64-79 of the previous block. */
#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0));			\
	\
	vld1.32   {W0, W7}, [RDATA]!;				\
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\
	vld1.32   {W6, W5}, [RDATA]!;				\
	vadd.u32  tmp0, W0, curK;				\
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\
	vadd.u32  tmp1, W7, curK;				\
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\
	vadd.u32  tmp2, W6, curK;				\
	vst1.32   {tmp0, tmp1}, [RWK]!;				\
	vadd.u32  tmp3, W5, curK;				\
	vst1.32   {tmp2, tmp3}, [RWK];				\

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W0, W7}, [RDATA]!;				\

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));			\

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W0, W0;	)	/* big => little */	\

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32   {W6, W5}, [RDATA]!;				\

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;				\

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W7, W7;	)	/* big => little */	\

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W6, W6;	)	/* big => little */	\

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;				\

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8  W5, W5;	)	/* big => little */	\

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;				\

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;				\

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;				\

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];				\
204 
205 
/********* Precalc macros for rounds 16-31 ************************************/

/* Expand four message-schedule words at a time:
 *   W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 * The W[i-3] value for the newest lane is not yet available inside
 * the same vector, so a zero is substituted (tmp0 is cleared first)
 * and the affected word is corrected afterwards via the tmp1 path
 * (the vshl #2 / vshr #30 rotate-left-by-2 fixup).  W+curK is then
 * stored to the stack ring buffer.  Split into single steps for
 * interleaving with the scalar rounds. */
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0;			\
	vext.8    W, W_m16, W_m12, #8;		\

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i));	\
	vext.8    tmp0, W_m04, tmp0, #4;	\

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16;		\
	veor.32   W, W, W_m08;			\

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1;			\
	veor      W, W, tmp0;			\

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;			\

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12);	\
	vshr.u32  W, W, #31;			\

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W;		\
	vshr.u32  W, tmp1, #30;			\

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;		\

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;		\

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;		\

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;		\

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];
249 
250 
/********* Precalc macros for rounds 32-79 ************************************/

/* Expand four message-schedule words at a time using the equivalent
 * "rolled twice" recurrence
 *   W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 * which has no dependency within the vector (the W register still
 * holds W[i-32] on entry and is recomputed in place; W[i-6] comes
 * from the W_m08/W_m04 extract).  W+curK is stored to the ring
 * buffer at the 4-aligned slot WK_offs(i & ~3). */
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];
282 
283 
/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;	/* preserve caller's sp across the realignment */

  /* Align stack: reserve 16*4 bytes for the W[]+K ring buffer and
   * round sp down to a 16-byte boundary. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

  /* curK selects the constant used by the interleaved *precalc*
   * (i.e. for the look-ahead rounds i16), not by the scalar rounds
   * currently being transformed; it is redefined at each 20-round
   * precalc boundary below. */
#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  /* Decrement the block counter; the Z flag set here is consumed by
   * the "beq .Lend" after rounds 60-63 (the intervening _R rounds do
   * not execute flag-setting instructions between subs and beq that
   * would overwrite Z -- ror/add/eor without 's' leave flags alone). */
  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 of the last block (no next-block precalc). */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  mov sp, ROLDSTACK;	/* restore caller's stack pointer */

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  /* nblks == 0: return without touching the state. */
  bx lr
ENDPROC(sha1_transform_neon)
635