#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size, they are not as
@	diverse as ARM ones: e.g., there are only two arithmetic
@	instructions with 3 arguments, no [fixed] rotate, and addressing
@	modes are limited. As a result it takes more instructions to do
@	the same job in Thumb, so the code is never half the size and is
@	always slower.
@ [***]	which is also ~35% better than compiler-generated code. A dual-
@	issue Cortex-A8 core was measured to process an input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13% improvement
@ on a Cortex-A8 core, or in absolute terms ~870 cycles per input block
@ [13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a 10%
@ improvement on a Cortex-A8 core and 12.2 cycles per byte.

#include <linux/linkage.h>

.text

.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
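@ Note on the layout below: r0 points at the five-word hash state, r1 at
@ the input and r2 (after the shift above) at the end of the input; r8
@ holds the current round constant and r14 walks an 80-word X[] schedule
@ kept on the stack.  The working variables A-E live in r3-r7 and rotate
@ roles from one unrolled round to the next, and the reference
@ algorithm's ROL(B,30) is folded into the ror#2/ror#30 operands rather
@ than performed separately.  Each round computes
@	E += K + ROL(A,5) + X[i] + F(B,C,D)
@ with ROL(A,5) written as ror#27.  For rounds 0-19,
@	F_00_19(B,C,D) = (B&C)|(~B&D) = D^(B&(C^D)),
@ which is what the eor/and/eor triplet in each round implements.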
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
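@ From here on the message schedule is expanded on the fly:
@	X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
@ The four ldr offsets #15*4, #13*4, #7*4 and #2*4 pick X[i-16],
@ X[i-14], X[i-8] and X[i-3] out of the window that r14 walks down the
@ stack, and the rotate-left-by-one is written as ror#31.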
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	ldr	r8,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
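@ Rounds 20-39 and 60-79 share the same F function,
@	F_20_39(B,C,D) = F_60_79(B,C,D) = B^C^D,
@ so one 5x-unrolled loop body serves both ranges.  The carry flag
@ tells them apart: cmn sp,#0 above clears it for 20_39, cmp sp,#0
@ further down sets it for 60_79, teq preserves it, and bcs exits to
@ .L_done once the 60_79 pass has completed.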
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
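@ Rounds 40-59 use the majority function
@	F_40_59(B,C,D) = (B&C)|(B&D)|(C&D) = (B&(C^D)) + (C&D);
@ the two terms are bit-wise disjoint, so each round below adds them
@ into E one after the other (the and/and pair followed by the extra
@ add) instead of ORing them first.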
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,pc}
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2