#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl          size in bytes   comp cycles[*]  measured performance
@ ====================================================================
@ thumb         304             3212            4420
@ armv4-small   392/+29%        1958/+64%       2250/+96%
@ armv4-compact 740/+89%        1552/+26%       1840/+22%
@ armv4-large   1420/+92%       1307/+19%       1370/+34%[***]
@ full unroll   ~5100/+260%     ~1260/+4%       ~1300/+5%
@ ====================================================================
@ thumb         = same as 'small' but in Thumb instructions[**] and
@                 with recurring code in two private functions;
@ small         = detached Xload/update, loops are folded;
@ compact       = detached Xload/update, 5x unroll;
@ large         = interleaved Xload/update, 5x unroll;
@ full unroll   = interleaved Xload/update, full unroll, estimated[!];
@
@ [*]   Manually counted instructions in "grand" loop body. Measured
@       performance is affected by prologue and epilogue overhead,
@       i-cache availability, branch penalties, etc.
@ [**]  While each Thumb instruction is half the size, the Thumb set is
@       not as diverse as the ARM one: e.g., there are only two
@       arithmetic instructions with 3 arguments, no [fixed] rotate, and
@       addressing modes are limited. As a result it takes more
@       instructions to do the same job in Thumb, so the code is never
@       half the size and is always slower.
@ [***] which is also ~35% better than compiler-generated code. A dual-
@       issue Cortex A8 core was measured to process an input block in
@       ~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13% improvement
@ on the Cortex A8 core, or ~870 cycles per input block in absolute
@ terms [13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a 10%
@ improvement on the Cortex A8 core, i.e. 12.2 cycles per byte.

.text

.global sha1_block_data_order
.type sha1_block_data_order,%function

.align 2
sha1_block_data_order:
    stmdb sp!,{r4-r12,lr}
    add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
    ldmia r0,{r3,r4,r5,r6,r7}
.Lloop:
    ldr r8,.LK_00_19
    mov r14,sp
    sub sp,sp,#15*4
    mov r5,r5,ror#30
    mov r6,r6,ror#30
    mov r7,r7,ror#30 @ [6]
.L_00_15:
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r7,r8,r7,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r5,r6 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r7,r8,r7,ror#2 @ E+=K_00_19
    eor r10,r5,r6 @ F_xx_xx
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r4,r10,ror#2
    add r7,r7,r9 @ E+=X[i]
    eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r7,r7,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r6,r8,r6,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r4,r5 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r6,r6,r7,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r6,r8,r6,ror#2 @ E+=K_00_19
    eor r10,r4,r5 @ F_xx_xx
    add r6,r6,r7,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r3,r10,ror#2
    add r6,r6,r9 @ E+=X[i]
    eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r6,r6,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r5,r8,r5,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r3,r4 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r5,r5,r6,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r5,r8,r5,ror#2 @ E+=K_00_19
    eor r10,r3,r4 @ F_xx_xx
    add r5,r5,r6,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r7,r10,ror#2
    add r5,r5,r9 @ E+=X[i]
    eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r5,r5,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r4,r8,r4,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r7,r3 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r4,r4,r5,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r4,r8,r4,ror#2 @ E+=K_00_19
    eor r10,r7,r3 @ F_xx_xx
    add r4,r4,r5,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r6,r10,ror#2
    add r4,r4,r9 @ E+=X[i]
    eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r4,r4,r10 @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r3,r8,r3,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r6,r7 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r3,r3,r4,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r3,r8,r3,ror#2 @ E+=K_00_19
    eor r10,r6,r7 @ F_xx_xx
    add r3,r3,r4,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r5,r10,ror#2
    add r3,r3,r9 @ E+=X[i]
    eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r3,r3,r10 @ E+=F_00_19(B,C,D)
    teq r14,sp
    bne .L_00_15 @ [((11+4)*5+2)*3]
#if __ARM_ARCH__<7
    ldrb r10,[r1,#2]
    ldrb r9,[r1,#3]
    ldrb r11,[r1,#1]
    add r7,r8,r7,ror#2 @ E+=K_00_19
    ldrb r12,[r1],#4
    orr r9,r9,r10,lsl#8
    eor r10,r5,r6 @ F_xx_xx
    orr r9,r9,r11,lsl#16
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
    orr r9,r9,r12,lsl#24
#else
    ldr r9,[r1],#4 @ handles unaligned
    add r7,r8,r7,ror#2 @ E+=K_00_19
    eor r10,r5,r6 @ F_xx_xx
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
    rev r9,r9 @ byte swap
#endif
#endif
    and r10,r4,r10,ror#2
    add r7,r7,r9 @ E+=X[i]
    eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
    str r9,[r14,#-4]!
    add r7,r7,r10 @ E+=F_00_19(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r6,r8,r6,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r4,r5 @ F_xx_xx
    mov r9,r9,ror#31
    add r6,r6,r7,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r3,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r6,r6,r9 @ E+=X[i]
    eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
    add r6,r6,r10 @ E+=F_00_19(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r5,r8,r5,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r3,r4 @ F_xx_xx
    mov r9,r9,ror#31
    add r5,r5,r6,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r7,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r5,r5,r9 @ E+=X[i]
    eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
    add r5,r5,r10 @ E+=F_00_19(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r4,r8,r4,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r7,r3 @ F_xx_xx
    mov r9,r9,ror#31
    add r4,r4,r5,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r6,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r4,r4,r9 @ E+=X[i]
    eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
    add r4,r4,r10 @ E+=F_00_19(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r3,r8,r3,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r6,r7 @ F_xx_xx
    mov r9,r9,ror#31
    add r3,r3,r4,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r5,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r3,r3,r9 @ E+=X[i]
    eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
    add r3,r3,r10 @ E+=F_00_19(B,C,D)

    ldr r8,.LK_20_39 @ [+15+16*4]
    sub sp,sp,#25*4
    cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r7,r8,r7,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r5,r6 @ F_xx_xx
    mov r9,r9,ror#31
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    eor r10,r4,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r7,r7,r9 @ E+=X[i]
    add r7,r7,r10 @ E+=F_20_39(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r6,r8,r6,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r4,r5 @ F_xx_xx
    mov r9,r9,ror#31
    add r6,r6,r7,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    eor r10,r3,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r6,r6,r9 @ E+=X[i]
    add r6,r6,r10 @ E+=F_20_39(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r5,r8,r5,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r3,r4 @ F_xx_xx
    mov r9,r9,ror#31
    add r5,r5,r6,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    eor r10,r7,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r5,r5,r9 @ E+=X[i]
    add r5,r5,r10 @ E+=F_20_39(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r4,r8,r4,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r7,r3 @ F_xx_xx
    mov r9,r9,ror#31
    add r4,r4,r5,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    eor r10,r6,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r4,r4,r9 @ E+=X[i]
    add r4,r4,r10 @ E+=F_20_39(B,C,D)
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r3,r8,r3,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r6,r7 @ F_xx_xx
    mov r9,r9,ror#31
    add r3,r3,r4,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    eor r10,r5,r10,ror#2 @ F_xx_xx
    @ F_xx_xx
    add r3,r3,r9 @ E+=X[i]
    add r3,r3,r10 @ E+=F_20_39(B,C,D)
    teq r14,sp @ preserve carry
    bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
    bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes

    ldr r8,.LK_40_59
    sub sp,sp,#20*4 @ [+2]
.L_40_59:
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r7,r8,r7,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r5,r6 @ F_xx_xx
    mov r9,r9,ror#31
    add r7,r7,r3,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r4,r10,ror#2 @ F_xx_xx
    and r11,r5,r6 @ F_xx_xx
    add r7,r7,r9 @ E+=X[i]
    add r7,r7,r10 @ E+=F_40_59(B,C,D)
    add r7,r7,r11,ror#2
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r6,r8,r6,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r4,r5 @ F_xx_xx
    mov r9,r9,ror#31
    add r6,r6,r7,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r3,r10,ror#2 @ F_xx_xx
    and r11,r4,r5 @ F_xx_xx
    add r6,r6,r9 @ E+=X[i]
    add r6,r6,r10 @ E+=F_40_59(B,C,D)
    add r6,r6,r11,ror#2
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r5,r8,r5,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r3,r4 @ F_xx_xx
    mov r9,r9,ror#31
    add r5,r5,r6,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r7,r10,ror#2 @ F_xx_xx
    and r11,r3,r4 @ F_xx_xx
    add r5,r5,r9 @ E+=X[i]
    add r5,r5,r10 @ E+=F_40_59(B,C,D)
    add r5,r5,r11,ror#2
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r4,r8,r4,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r7,r3 @ F_xx_xx
    mov r9,r9,ror#31
    add r4,r4,r5,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r6,r10,ror#2 @ F_xx_xx
    and r11,r7,r3 @ F_xx_xx
    add r4,r4,r9 @ E+=X[i]
    add r4,r4,r10 @ E+=F_40_59(B,C,D)
    add r4,r4,r11,ror#2
    ldr r9,[r14,#15*4]
    ldr r10,[r14,#13*4]
    ldr r11,[r14,#7*4]
    add r3,r8,r3,ror#2 @ E+=K_xx_xx
    ldr r12,[r14,#2*4]
    eor r9,r9,r10
    eor r11,r11,r12 @ 1 cycle stall
    eor r10,r6,r7 @ F_xx_xx
    mov r9,r9,ror#31
    add r3,r3,r4,ror#27 @ E+=ROR(A,27)
    eor r9,r9,r11,ror#31
    str r9,[r14,#-4]!
    and r10,r5,r10,ror#2 @ F_xx_xx
    and r11,r6,r7 @ F_xx_xx
    add r3,r3,r9 @ E+=X[i]
    add r3,r3,r10 @ E+=F_40_59(B,C,D)
    add r3,r3,r11,ror#2
    teq r14,sp
    bne .L_40_59 @ [+((12+5)*5+2)*4]

    ldr r8,.LK_60_79
    sub sp,sp,#20*4
    cmp sp,#0 @ set carry to denote 60_79
    b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
    add sp,sp,#80*4 @ "deallocate" stack frame
    ldmia r0,{r8,r9,r10,r11,r12}
    add r3,r8,r3
    add r4,r9,r4
    add r5,r10,r5,ror#2
    add r6,r11,r6,ror#2
    add r7,r12,r7,ror#2
    stmia r0,{r3,r4,r5,r6,r7}
    teq r1,r2
    bne .Lloop @ [+18], total 1307

#if __ARM_ARCH__>=5
    ldmia sp!,{r4-r12,pc}
#else
    ldmia sp!,{r4-r12,lr}
    tst lr,#1
    moveq pc,lr @ be binary compatible with V4, yet
    .word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align 2
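
@ Calling convention, a hedged sketch inferred from the code above (the
@ prologue loads five 32-bit state words via ldmia r0, advances r1 by
@ r2,lsl#6 bytes, i.e. r2 64-byte blocks, and stores the updated state
@ back with stmia r0). The exact C types below are assumptions, not a
@ verified interface:
@
@   void sha1_block_data_order(unsigned int state[5],
@                              const void *data, unsigned int num);
@
@   /* hypothetical usage: hash "num" 64-byte blocks at "buf" into
@      the caller's five-word SHA-1 state */
@   sha1_block_data_order(state, buf, num);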