/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)         code
#endif

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)
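/*
 * Each of the _R_F* macros below performs one scalar SHA-1 round:
 *
 *   e += rol32(a, 5) + f(b, c, d) + W[i] + K;  b = rol32(b, 30);
 *
 * where W[i] + K has already been precalculated into the 16-word stack
 * area indexed by WK_offs(i), and rol32(b, 30) is implemented as
 * "ror b, #(32 - 30)". The round function f depends on the round group:
 *
 *   F1 (rounds  0-19): Ch(b,c,d)     = (b & c) | (~b & d)
 *   F2 (rounds 20-39): Parity(b,c,d) = b ^ c ^ d
 *   F3 (rounds 40-59): Maj(b,c,d)    = (b & c) | (b & d) | (c & d),
 *                      computed as (b & c) + ((b ^ c) & d)
 *   F4 (rounds 60-79): same as F2
 *
 * The pre1/pre2/pre3 arguments name NEON message-schedule macros whose
 * instructions are interleaved with the scalar ones to hide latency.
 */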
#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        bic RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        and RT1, c, b; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add RT0, RT0, RT3; \
        add e, e, RT1; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, d, b; \
        add e, e, a, ror #(32 - 5); \
        eor RT0, RT0, c; \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT3; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT0;

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        ldr RT3, [sp, WK_offs(i)]; \
                pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        eor RT0, b, c; \
        and RT1, b, c; \
        add e, e, a, ror #(32 - 5); \
                pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        and RT0, RT0, d; \
        add RT1, RT1, RT3; \
        add e, e, RT0; \
        ror b, #(32 - 30); \
                pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
        add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
        _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
               W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/
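/*
 * Rounds 0-15 use the input words directly: load the 64-byte block,
 * byte-swap each 32-bit word on little-endian CPUs (the message is
 * big-endian), add the round constant and store W[i] + K1 into the
 * stack area read by the round macros. W_PRECALC_00_15() does all of
 * this at once; the numbered WPRECALC_00_15_* steps below split the
 * same work into slices that can be interleaved with scalar rounds.
 */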
#define W_PRECALC_00_15() \
        add RWK, sp, #(WK_offs(0)); \
        \
        vld1.32 {W0, W7}, [RDATA]!; \
 ARM_LE(vrev32.8 W0, W0; )      /* big => little */ \
        vld1.32 {W6, W5}, [RDATA]!; \
        vadd.u32 tmp0, W0, curK; \
 ARM_LE(vrev32.8 W7, W7; )      /* big => little */ \
 ARM_LE(vrev32.8 W6, W6; )      /* big => little */ \
        vadd.u32 tmp1, W7, curK; \
 ARM_LE(vrev32.8 W5, W5; )      /* big => little */ \
        vadd.u32 tmp2, W6, curK; \
        vst1.32 {tmp0, tmp1}, [RWK]!; \
        vadd.u32 tmp3, W5, curK; \
        vst1.32 {tmp2, tmp3}, [RWK];

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32 {W0, W7}, [RDATA]!;

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(0));

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W0, W0; )      /* big => little */

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vld1.32 {W6, W5}, [RDATA]!;

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W0, curK;

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W7, W7; )      /* big => little */

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W6, W6; )      /* big => little */

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp1, W7, curK;

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W5, W5; )      /* big => little */

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp2, W6, curK;

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0, tmp1}, [RWK]!;

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp3, W5, curK;

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp2, tmp3}, [RWK];


/********* Precalc macros for rounds 16-31 ************************************/
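/*
 * Rounds 16-31 expand the message schedule four words at a time:
 *
 *   W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
 *
 * W[i+3] depends on W[i], which is being computed in the same vector,
 * so lane 3 of the W[i-3] operand is filled with zero (steps _0/_1)
 * and the missing rol32(W[i], 1) contribution is patched in afterwards
 * as a rotate-by-2 of the unrotated lane-0 result (steps _5 to _9).
 */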
#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0; \
        vext.8 W, W_m16, W_m12, #8;

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i)); \
        vext.8 tmp0, W_m04, tmp0, #4;

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0, W_m16; \
        veor.32 W, W, W_m08;

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp1, tmp1; \
        veor W, W, tmp0;

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp0, W, #1;

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp1, tmp1, W, #(16-12); \
        vshr.u32 W, W, #31;

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr tmp0, tmp0, W; \
        vshr.u32 W, tmp1, #30;

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, tmp1, #2;

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor tmp0, tmp0, W;

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0, tmp1;

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK;

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/
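/*
 * For rounds 32-79 the recurrence is applied in its expanded form
 *
 *   W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2)
 *
 * which has no dependency within a 4-word vector, so four schedule
 * words are produced with plain XORs and one rotate. The W register
 * passed in still holds the W[i-32] words, since the eight schedule
 * registers are reused cyclically.
 */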
#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m28;

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vext.8 tmp0, W_m08, W_m04, #8;

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, W_m16;

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        veor W, tmp0;

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        add RWK, sp, #(WK_offs(i&~3));

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshl.u32 tmp1, W, #2;

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vshr.u32 tmp0, W, #30;

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vorr W, tmp0, tmp1;

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vadd.u32 tmp0, W, curK;

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
        vst1.32 {tmp0}, [RWK];


/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
.align 3
ENTRY(sha1_transform_neon)
        /* input:
         *      r0: ctx, CTX
         *      r1: data (64*nblks bytes)
         *      r2: nblks
         */

        cmp RNBLKS, #0;
        beq .Ldo_nothing;

        push {r4-r12, lr};
        /*vpush {q4-q7};*/

        adr RT3, .LK_VEC;

        mov ROLDSTACK, sp;

        /* Align stack. */
        sub RT0, sp, #(16*4);
        and RT0, #(~(16-1));
        mov sp, RT0;

        vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

        /* Get the values of the chaining variables. */
        ldm RSTATE, {_a-_e};

        vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
        /* Precalc 0-15. */
        W_PRECALC_00_15();
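/*
 * Main loop. Each _R() below runs one scalar round and, through its
 * pre1/pre2/pre3 slots, a slice of the NEON precalculation for a later
 * group of four rounds, so the message schedule stays roughly 16
 * rounds ahead of the scalar computation. curK is redefined whenever
 * the round-constant vector changes.
 */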
.Loop:
        /* Transform 0-15 + Precalc 16-31. */
        _R( _a, _b, _c, _d, _e, F1,  0,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1,  1,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _d, _e, _a, _b, _c, F1,  2,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
            W4, W5, W6, W7, W0, _, _, _ );
        _R( _c, _d, _e, _a, _b, F1,  3,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
            W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
        _R( _b, _c, _d, _e, _a, F1,  4,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1,  5,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1,  6,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
            W3, W4, W5, W6, W7, _, _, _ );
        _R( _d, _e, _a, _b, _c, F1,  7,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
            W3, W4, W5, W6, W7, _, _, _ );

        _R( _c, _d, _e, _a, _b, F1,  8,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _b, _c, _d, _e, _a, F1,  9,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1, 10,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
            W2, W3, W4, W5, W6, _, _, _ );
        _R( _e, _a, _b, _c, _d, F1, 11,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
            W2, W3, W4, W5, W6, _, _, _ );

        _R( _d, _e, _a, _b, _c, F1, 12,
            WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _c, _d, _e, _a, _b, F1, 13,
            WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _b, _c, _d, _e, _a, F1, 14,
            WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
            W1, W2, W3, W4, W5, _, _, _ );
        _R( _a, _b, _c, _d, _e, F1, 15,
            WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
            W1, W2, W3, W4, W5, _, _, _ );

        /* Transform 16-63 + Precalc 32-79. */
        _R( _e, _a, _b, _c, _d, F1, 16,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _d, _e, _a, _b, _c, F1, 17,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _c, _d, _e, _a, _b, F1, 18,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _b, _c, _d, _e, _a, F1, 19,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
            W0, W1, W2, W3, W4, W5, W6, W7);

        _R( _a, _b, _c, _d, _e, F2, 20,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _e, _a, _b, _c, _d, F2, 21,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _d, _e, _a, _b, _c, F2, 22,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _c, _d, _e, _a, _b, F2, 23,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
            W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
        _R( _b, _c, _d, _e, _a, F2, 24,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _a, _b, _c, _d, _e, F2, 25,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _e, _a, _b, _c, _d, F2, 26,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _d, _e, _a, _b, _c, F2, 27,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
            W6, W7, W0, W1, W2, W3, W4, W5);

        _R( _c, _d, _e, _a, _b, F2, 28,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _b, _c, _d, _e, _a, F2, 29,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _a, _b, _c, _d, _e, F2, 30,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _e, _a, _b, _c, _d, F2, 31,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
            W5, W6, W7, W0, W1, W2, W3, W4);

        _R( _d, _e, _a, _b, _c, F2, 32,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _c, _d, _e, _a, _b, F2, 33,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _b, _c, _d, _e, _a, F2, 34,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);
        _R( _a, _b, _c, _d, _e, F2, 35,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
            W4, W5, W6, W7, W0, W1, W2, W3);

        _R( _e, _a, _b, _c, _d, F2, 36,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _d, _e, _a, _b, _c, F2, 37,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _c, _d, _e, _a, _b, F2, 38,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);
        _R( _b, _c, _d, _e, _a, F2, 39,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
            W3, W4, W5, W6, W7, W0, W1, W2);

        _R( _a, _b, _c, _d, _e, F3, 40,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _e, _a, _b, _c, _d, F3, 41,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _d, _e, _a, _b, _c, F3, 42,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);
        _R( _c, _d, _e, _a, _b, F3, 43,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
            W2, W3, W4, W5, W6, W7, W0, W1);

#undef curK
#define curK qK4
        _R( _b, _c, _d, _e, _a, F3, 44,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _a, _b, _c, _d, _e, F3, 45,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _e, _a, _b, _c, _d, F3, 46,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);
        _R( _d, _e, _a, _b, _c, F3, 47,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
            W1, W2, W3, W4, W5, W6, W7, W0);

        _R( _c, _d, _e, _a, _b, F3, 48,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _b, _c, _d, _e, _a, F3, 49,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _a, _b, _c, _d, _e, F3, 50,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);
        _R( _e, _a, _b, _c, _d, F3, 51,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
            W0, W1, W2, W3, W4, W5, W6, W7);

        _R( _d, _e, _a, _b, _c, F3, 52,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _c, _d, _e, _a, _b, F3, 53,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _b, _c, _d, _e, _a, F3, 54,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);
        _R( _a, _b, _c, _d, _e, F3, 55,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
            W7, W0, W1, W2, W3, W4, W5, W6);

        _R( _e, _a, _b, _c, _d, F3, 56,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _d, _e, _a, _b, _c, F3, 57,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _c, _d, _e, _a, _b, F3, 58,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);
        _R( _b, _c, _d, _e, _a, F3, 59,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
            W6, W7, W0, W1, W2, W3, W4, W5);

        subs RNBLKS, #1;

        _R( _a, _b, _c, _d, _e, F4, 60,
            WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _e, _a, _b, _c, _d, F4, 61,
            WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _d, _e, _a, _b, _c, F4, 62,
            WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);
        _R( _c, _d, _e, _a, _b, F4, 63,
            WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
            W5, W6, W7, W0, W1, W2, W3, W4);

        beq .Lend;

        /* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
        _R( _b, _c, _d, _e, _a, F4, 64,
            WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 65,
            WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _e, _a, _b, _c, _d, F4, 66,
            WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _d, _e, _a, _b, _c, F4, 67,
            WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _c, _d, _e, _a, _b, F4, 68,
            dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 69,
            dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 70,
            WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _e, _a, _b, _c, _d, F4, 71,
            WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _d, _e, _a, _b, _c, F4, 72,
            dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _c, _d, _e, _a, _b, F4, 73,
            dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 74,
            WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _a, _b, _c, _d, _e, F4, 75,
            WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

        _R( _e, _a, _b, _c, _d, F4, 76,
            WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _d, _e, _a, _b, _c, F4, 77,
            WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _c, _d, _e, _a, _b, F4, 78,
            WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
        _R( _b, _c, _d, _e, _a, F4, 79,
            WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
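        /*
         * Feed the block result back into the chaining value h0..h4
         * held in the context; the W[0..15] + K1 values for the next
         * block are already sitting on the stack.
         */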
        /* Update the chaining variables. */
        ldm RSTATE, {RT0-RT3};
        add _a, RT0;
        ldr RT0, [RSTATE, #state_h4];
        add _b, RT1;
        add _c, RT2;
        add _d, RT3;
        add _e, RT0;
        stm RSTATE, {_a-_e};

        b .Loop;

.Lend:
        /* Transform 64-79 */
        R( _b, _c, _d, _e, _a, F4, 64 );
        R( _a, _b, _c, _d, _e, F4, 65 );
        R( _e, _a, _b, _c, _d, F4, 66 );
        R( _d, _e, _a, _b, _c, F4, 67 );
        R( _c, _d, _e, _a, _b, F4, 68 );
        R( _b, _c, _d, _e, _a, F4, 69 );
        R( _a, _b, _c, _d, _e, F4, 70 );
        R( _e, _a, _b, _c, _d, F4, 71 );
        R( _d, _e, _a, _b, _c, F4, 72 );
        R( _c, _d, _e, _a, _b, F4, 73 );
        R( _b, _c, _d, _e, _a, F4, 74 );
        R( _a, _b, _c, _d, _e, F4, 75 );
        R( _e, _a, _b, _c, _d, F4, 76 );
        R( _d, _e, _a, _b, _c, F4, 77 );
        R( _c, _d, _e, _a, _b, F4, 78 );
        R( _b, _c, _d, _e, _a, F4, 79 );

        mov sp, ROLDSTACK;

        /* Update the chaining variables. */
        ldm RSTATE, {RT0-RT3};
        add _a, RT0;
        ldr RT0, [RSTATE, #state_h4];
        add _b, RT1;
        add _c, RT2;
        add _d, RT3;
        /*vpop {q4-q7};*/
        add _e, RT0;
        stm RSTATE, {_a-_e};

        pop {r4-r12, pc};

.Ldo_nothing:
        bx lr
ENDPROC(sha1_transform_neon)