#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in the "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size of an ARM one,
@	the Thumb set is not as diverse: e.g., there are only two
@	arithmetic instructions with 3 arguments, no [fixed] rotate, and
@	addressing modes are limited. As a result it takes more
@	instructions to do the same job in Thumb, so the code is never
@	half the size and is always slower.
@ [***]	This is also ~35% better than compiler-generated code. A dual-
@	issue Cortex A8 core was measured to process an input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13% improvement
@ on a Cortex A8 core, in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a 10%
@ improvement on a Cortex A8 core, i.e. 12.2 cycles per byte.

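@ Calling convention, in effect (the parameter names below are
@ descriptive only; they do not appear in the original source):
@
@	sha1_block_data_order(u32 *digest, const u8 *data, size_t nblocks)
@
@ r0 points at the five 32-bit chaining words A..E, r1 at the input,
@ and r2 holds the number of 64-byte blocks (converted on entry into
@ an end-of-input pointer).  Inside the loop r3-r7 carry the working
@ variables A..E in a pre-rotated representation (set up by the
@ ror#30 moves at .Lloop and compensated by the ror#2/ror#27 operands
@ in the rounds), r8 holds the round constant, r9-r12 are scratch,
@ and r14 walks the on-stack schedule buffer X[].
@
@ Round functions computed below, for reference:
@	F_00_19(B,C,D) = D ^ (B & (C ^ D))		@ == (B&C)|(~B&D)
@	F_20_39(B,C,D) = B ^ C ^ D
@	F_40_59(B,C,D) = (B & (C ^ D)) + (C & D)	@ == Maj(B,C,D)
@	F_60_79(B,C,D) = B ^ C ^ D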
.text

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	2
sha1_block_data_order:
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
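@ Rounds 0..14, processed as three passes over five unrolled rounds.
@ Each round loads one big-endian input word: byte by byte on pre-v7
@ cores, which must not issue a possibly unaligned ldr, or with a
@ single ldr (plus rev on little-endian) on ARMv7.  The word is
@ pushed onto the X[] frame with str r9,[r14,#-4]! for the schedule
@ rounds later, while E += K_00_19 + ROR(A,27) + F(B,C,D) + X[i]
@ accumulates in steps interleaved with the load to hide latency.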
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	teq	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
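@ One more load round brings the count to 16 input words (round 15).
@ Rounds 16..19 then switch to the message schedule: relative to the
@ sliding r14, offsets #15*4, #13*4, #7*4 and #2*4 address X[i-16],
@ X[i-14], X[i-8] and X[i-3], and
@	X[i] = ROL(X[i-16] ^ X[i-14] ^ X[i-8] ^ X[i-3], 1)
@ (ror#31 is ROL #1) is stored back with str r9,[r14,#-4]! and fed
@ through the same F_00_19 round as above.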
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

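@ Rounds 20..39 and 60..79 share the loop body below, as both use
@ F = B ^ C ^ D; the carry flag tells the two exits apart.  cmn sp,#0
@ clears carry (adding 0 cannot carry out), so after rounds 20..39
@ the bcs falls through to the 40..59 code; the 60..79 entry sets
@ carry instead.  teq inside the loop leaves carry untouched, and
@ sub sp,sp,#25*4 drops sp to 20 words below the current r14 so the
@ pass again ends when r14 meets sp.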
	ldr	r8,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
	teq	r14,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

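@ Rounds 40..59 use the majority function, computed as
@	F_40_59(B,C,D) = (B & (C ^ D)) + (C & D)
@ which equals (B&C)|(B&D)|(C&D): the two terms are never both set
@ in the same bit position, so they can simply be added into E one
@ after the other.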
	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	teq	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

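@ Rounds 60..79 re-enter the shared 20..39 loop body with K_60_79;
@ cmp sp,#0 sets carry (subtracting 0 never borrows), so this time
@ the bcs after the loop exits to .L_done.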
	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
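@ End of block: release the 80-word scratch frame, reload the previous
@ chaining values and add them back in (C, D and E are still in the
@ rotated representation, hence the ror#2), then loop until the input
@ pointer r1 reaches the end marker r2.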
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

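@ Return: ARMv5+ can pop the saved lr straight into pc.  The pre-v5
@ path pops into lr and tests bit 0: an ARM-state caller returns via
@ mov pc,lr, a Thumb caller via the raw word below, which is the
@ encoding of bx lr (spelled as .word so the file still assembles
@ for targets whose assembler lacks bx).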
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
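@ The standard SHA-1 round constants, floor(2^30 * sqrt(n)) for
@ n = 2, 3, 5 and 10 respectively.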
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
.size	sha1_block_data_order,.-sha1_block_data_order
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2