/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>


.syntax unified
.code 32
.fpu neon

.text


/* Context structure: byte offsets of the five 32-bit chaining words. */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6

/*
 * Each round constant replicated into all four 32-bit lanes, so that one
 * vadd.u32 adds K to four message-schedule words at once.  The table is
 * loaded PC-relative (adr) by the function below.
 */
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0		/* ctx: pointer to h0..h4 */
#define RDATA r1		/* input pointer, post-incremented by the loads */
#define RNBLKS r2		/* blocks left to process */
#define ROLDSTACK r3		/* caller sp, restored before returning */
#define RWK lr			/* address of the W+K slot being written */

/* SHA-1 working variables. */
#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

/* Scalar temporaries. */
#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

/* Eight vectors of four message-schedule words each, used as a ring. */
#define W0 q0
#define W1 q1
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q5
#define W6 q6
#define W7 q7

/* Vector temporaries. */
#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

/* Round constants, loaded once from .LK_VEC. */
#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15


/* Round function macros. */

/*
 * Byte offset of the W[i]+K slot for round i.  The stack scratch area is a
 * ring of 16 words (64 bytes), so only i mod 16 matters.
 */
#define WK_offs(i) (((i) & 15) * 4)

/* Expands to nothing: used wherever no precalc step is to be interleaved. */
#define dummy(...)

/*
 * All _R_Fn macros perform one SHA-1 round:
 *
 *	e += rol(a, 5) + Fn(b, c, d) + (W[i] + K);  b = rol(b, 30);
 *
 * W[i] + K is read from the stack slot WK_offs(i).  pre1/pre2/pre3 name
 * message-schedule macros whose NEON instructions are interleaved with the
 * scalar round; the remaining arguments are only forwarded to them.
 * Clobbers RT0, RT3 (and RT1 for F1/F3).  No instruction here sets the
 * condition flags.
 */

/* F1(b, c, d) = (b & c) | (~b & d); the two terms have no common set bits,
 * so they are accumulated with add. */
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/* F2(b, c, d) = b ^ c ^ d */
#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

/* F3(b, c, d) = majority, computed as (b & c) + ((b ^ c) & d); the two terms
 * have no common set bits. */
#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

/* F4 is the same function as F2. */
#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* One round with function f (F1..F4) and three interleaved precalc steps. */
#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

/* One round with no interleaved precalc; "_" fills the unused arguments. */
#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,_,\
	       _,_,_,_,_,_,_,_)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

/*
 * Load one 64-byte block, convert its big-endian words to register lanes,
 * add curK and store W[0..15]+K to the stack ring.  Words 0-3 end up in W0,
 * 4-7 in W7, 8-11 in W6 and 12-15 in W5.
 *
 * The block is loaded with vld1.8 rather than vld1.32: a byte-element load
 * places the bytes in memory order on either CPU endianness, so the
 * following vrev32.8 always yields the big-endian word values, and the
 * input pointer does not need word alignment.
 */
#define W_PRECALC_00_15() \
	add       RWK, sp, #(WK_offs(0)); \
	\
	vld1.8    {tmp0, tmp1}, [RDATA]!; \
	vrev32.8  W0, tmp0;		/* BE bytes => word lanes */ \
	vld1.8    {tmp2, tmp3}, [RDATA]!; \
	vadd.u32  tmp0, W0, curK; \
	vrev32.8  W7, tmp1;		/* BE bytes => word lanes */ \
	vrev32.8  W6, tmp2;		/* BE bytes => word lanes */ \
	vadd.u32  tmp1, W7, curK; \
	vrev32.8  W5, tmp3;		/* BE bytes => word lanes */ \
	vadd.u32  tmp2, W6, curK; \
	vst1.32   {tmp0, tmp1}, [RWK]!; \
	vadd.u32  tmp3, W5, curK; \
	vst1.32   {tmp2, tmp3}, [RWK];

/*
 * The same sequence as W_PRECALC_00_15(), one instruction per macro, so it
 * can be interleaved with rounds 64-79 of the previous block.  All macro
 * arguments are ignored.
 */
#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.8    {tmp0, tmp1}, [RDATA]!;

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(0));

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vrev32.8  W0, tmp0;		/* BE bytes => word lanes */

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.8    {tmp2, tmp3}, [RDATA]!;

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W0, curK;

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vrev32.8  W7, tmp1;		/* BE bytes => word lanes */

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vrev32.8  W6, tmp2;		/* BE bytes => word lanes */

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp1, W7, curK;

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vrev32.8  W5, tmp3;		/* BE bytes => word lanes */

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp2, W6, curK;

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0, tmp1}, [RWK]!;

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp3, W5, curK;

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp2, tmp3}, [RWK];


/********* Precalc macros for rounds 16-31 ************************************/

/*
 * Steps _0 to _11 together build the vector W from W_m04, W_m08, W_m12 and
 * W_m16, add curK to it and store the four W+K words at sp + WK_offs(i).
 * tmp0 and tmp1 are clobbered; the steps must be issued in numeric order.
 */
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0; \
	vext.8    W, W_m16, W_m12, #8;

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i)); \
	vext.8    tmp0, W_m04, tmp0, #4;

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W_m16; \
	veor.32   W, W, W_m08;

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp1, tmp1; \
	veor      W, W, tmp0;

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp0, W, #1;

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp1, tmp1, W, #(16-12); \
	vshr.u32  W, W, #31;

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      tmp0, tmp0, W; \
	vshr.u32  W, tmp1, #30;

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, tmp1, #2;

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      tmp0, tmp0, W;

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0, tmp1;

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

/*
 * Steps _0 to _9 together compute, in place,
 *
 *	W = rol(W ^ W_m28 ^ W_m16 ^ (high half of W_m08 : low half of W_m04), 2)
 *
 * then add curK and store the four W+K words at sp + WK_offs(i & ~3).
 * On entry W still holds the vector being recycled from the ring.  tmp0 and
 * tmp1 are clobbered; the steps must be issued in numeric order.
 */
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m28;

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8    tmp0, W_m08, W_m04, #8;

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, W_m16;

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor      W, tmp0;

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add       RWK, sp, #(WK_offs(i&~3));

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32  tmp1, W, #2;

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32  tmp0, W, #30;

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr      W, tmp0, tmp1;

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32  tmp0, W, curK;

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32   {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * void
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 *
 * In:    r0 = ctx (five 32-bit chaining words h0..h4, updated in place)
 *        r1 = data (64*nblks bytes; no alignment is required)
 *        r2 = nblks (0 is allowed and returns immediately)
 * Out:   nothing; r0 is never written, so no return value is produced.
 * Stack: pushes r4-r12 and lr, then carves a 64-byte, 16-byte-aligned
 *        W+K ring below the caller sp.
 * Clobb: r1-r3, flags, q0-q3 and q8-q15.
 * NOTE(review): q4-q7 are written but not saved (the vpush/vpop pair is
 * commented out).  Presumably the caller owns the whole NEON register file
 * for the duration of the call - confirm against the caller.
 */
.align 3
ENTRY(sha1_transform_neon)
  /* input:
   *	r0: ctx, CTX
   *	r1: data (64*nblks bytes)
   *	r2: nblks
   */

  cmp RNBLKS, #0;
  beq .Ldo_nothing;

  push {r4-r12, lr};
  /*vpush {q4-q7};*/

  adr RT3, .LK_VEC;

  mov ROLDSTACK, sp;

  /* Align stack: reserve the 16-word W+K ring, rounded down to 16 bytes. */
  sub RT0, sp, #(16*4);
  and RT0, #(~(16-1));
  mov sp, RT0;

  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

  /* Get the values of the chaining variables. */
  ldm RSTATE, {_a-_e};

  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
  /* Precalc 0-15. */
  W_PRECALC_00_15();

.Loop:
  /* Transform 0-15 + Precalc 16-31. */
  _R( _a, _b, _c, _d, _e, F1,  0,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  1,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  2,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
      W4, W5, W6, W7, W0, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1,  3,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
      W4, W5, W6, W7, W0, _, _, _ );

  /* curK is the constant for the rounds being precalculated (20 onwards). */
#undef curK
#define curK qK2
  _R( _b, _c, _d, _e, _a, F1,  4,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1,  5,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1,  6,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
      W3, W4, W5, W6, W7, _, _, _ );
  _R( _d, _e, _a, _b, _c, F1,  7,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
      W3, W4, W5, W6, W7, _, _, _ );

  _R( _c, _d, _e, _a, _b, F1,  8,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1,  9,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 10,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
      W2, W3, W4, W5, W6, _, _, _ );
  _R( _e, _a, _b, _c, _d, F1, 11,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
      W2, W3, W4, W5, W6, _, _, _ );

  _R( _d, _e, _a, _b, _c, F1, 12,
      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _c, _d, _e, _a, _b, F1, 13,
      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _b, _c, _d, _e, _a, F1, 14,
      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
      W1, W2, W3, W4, W5, _, _, _ );
  _R( _a, _b, _c, _d, _e, F1, 15,
      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
      W1, W2, W3, W4, W5, _, _, _ );

  /* Transform 16-63 + Precalc 32-79. */
  _R( _e, _a, _b, _c, _d, F1, 16,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _d, _e, _a, _b, _c, F1, 17,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _c, _d, _e, _a, _b, F1, 18,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F1, 19,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _a, _b, _c, _d, _e, F2, 20,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _e, _a, _b, _c, _d, F2, 21,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _d, _e, _a, _b, _c, F2, 22,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F2, 23,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
      W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
  _R( _b, _c, _d, _e, _a, F2, 24,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _a, _b, _c, _d, _e, F2, 25,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _e, _a, _b, _c, _d, F2, 26,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F2, 27,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
      W6, W7, W0, W1, W2, W3, W4, W5);

  _R( _c, _d, _e, _a, _b, F2, 28,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _b, _c, _d, _e, _a, F2, 29,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _a, _b, _c, _d, _e, F2, 30,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F2, 31,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
      W5, W6, W7, W0, W1, W2, W3, W4);

  _R( _d, _e, _a, _b, _c, F2, 32,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _c, _d, _e, _a, _b, F2, 33,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _b, _c, _d, _e, _a, F2, 34,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);
  _R( _a, _b, _c, _d, _e, F2, 35,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
      W4, W5, W6, W7, W0, W1, W2, W3);

  _R( _e, _a, _b, _c, _d, F2, 36,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _d, _e, _a, _b, _c, F2, 37,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _c, _d, _e, _a, _b, F2, 38,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);
  _R( _b, _c, _d, _e, _a, F2, 39,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
      W3, W4, W5, W6, W7, W0, W1, W2);

  _R( _a, _b, _c, _d, _e, F3, 40,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _e, _a, _b, _c, _d, F3, 41,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _d, _e, _a, _b, _c, F3, 42,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);
  _R( _c, _d, _e, _a, _b, F3, 43,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
      W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
  _R( _b, _c, _d, _e, _a, F3, 44,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _a, _b, _c, _d, _e, F3, 45,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _e, _a, _b, _c, _d, F3, 46,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);
  _R( _d, _e, _a, _b, _c, F3, 47,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
      W1, W2, W3, W4, W5, W6, W7, W0);

  _R( _c, _d, _e, _a, _b, F3, 48,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _b, _c, _d, _e, _a, F3, 49,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _a, _b, _c, _d, _e, F3, 50,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);
  _R( _e, _a, _b, _c, _d, F3, 51,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
      W0, W1, W2, W3, W4, W5, W6, W7);

  _R( _d, _e, _a, _b, _c, F3, 52,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _c, _d, _e, _a, _b, F3, 53,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _b, _c, _d, _e, _a, F3, 54,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);
  _R( _a, _b, _c, _d, _e, F3, 55,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
      W7, W0, W1, W2, W3, W4, W5, W6);

  _R( _e, _a, _b, _c, _d, F3, 56,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _d, _e, _a, _b, _c, F3, 57,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _c, _d, _e, _a, _b, F3, 58,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);
  _R( _b, _c, _d, _e, _a, F3, 59,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
      W6, W7, W0, W1, W2, W3, W4, W5);

  /*
   * Count the block down here; the flags are consumed by "beq .Lend" after
   * rounds 60-63, none of whose instructions modifies them.
   */
  subs RNBLKS, #1;

  _R( _a, _b, _c, _d, _e, F4, 60,
      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _e, _a, _b, _c, _d, F4, 61,
      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _d, _e, _a, _b, _c, F4, 62,
      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);
  _R( _c, _d, _e, _a, _b, F4, 63,
      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
      W5, W6, W7, W0, W1, W2, W3, W4);

  beq .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
  _R( _b, _c, _d, _e, _a, F4, 64,
      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 65,
      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 66,
      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 67,
      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _c, _d, _e, _a, _b, F4, 68,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 69,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 70,
      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _e, _a, _b, _c, _d, F4, 71,
      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _d, _e, _a, _b, _c, F4, 72,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 73,
      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 74,
      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _a, _b, _c, _d, _e, F4, 75,
      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

  _R( _e, _a, _b, _c, _d, F4, 76,
      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _d, _e, _a, _b, _c, F4, 77,
      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _c, _d, _e, _a, _b, F4, 78,
      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  _R( _b, _c, _d, _e, _a, F4, 79,
      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

  /*
   * Update the chaining variables.  The sums stay in _a.._e, which are the
   * starting working variables of the next block.
   */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  add _e, RT0;
  stm RSTATE, {_a-_e};

  b .Loop;

.Lend:
  /* Transform 64-79 (last block: nothing left to precalculate). */
  R( _b, _c, _d, _e, _a, F4, 64 );
  R( _a, _b, _c, _d, _e, F4, 65 );
  R( _e, _a, _b, _c, _d, F4, 66 );
  R( _d, _e, _a, _b, _c, F4, 67 );
  R( _c, _d, _e, _a, _b, F4, 68 );
  R( _b, _c, _d, _e, _a, F4, 69 );
  R( _a, _b, _c, _d, _e, F4, 70 );
  R( _e, _a, _b, _c, _d, F4, 71 );
  R( _d, _e, _a, _b, _c, F4, 72 );
  R( _c, _d, _e, _a, _b, F4, 73 );
  R( _b, _c, _d, _e, _a, F4, 74 );
  R( _a, _b, _c, _d, _e, F4, 75 );
  R( _e, _a, _b, _c, _d, F4, 76 );
  R( _d, _e, _a, _b, _c, F4, 77 );
  R( _c, _d, _e, _a, _b, F4, 78 );
  R( _b, _c, _d, _e, _a, F4, 79 );

  /* Drop the W+K ring; sp again points at the saved registers. */
  mov sp, ROLDSTACK;

  /* Update the chaining variables. */
  ldm RSTATE, {RT0-RT3};
  add _a, RT0;
  ldr RT0, [RSTATE, #state_h4];
  add _b, RT1;
  add _c, RT2;
  add _d, RT3;
  /*vpop {q4-q7};*/
  add _e, RT0;
  stm RSTATE, {_a-_e};

  pop {r4-r12, pc};

.Ldo_nothing:
  bx lr
ENDPROC(sha1_transform_neon)