#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size, the Thumb set is
@	not as diverse as the ARM one: e.g., there are only two
@	arithmetic instructions with 3 arguments, no [fixed] rotate, and
@	addressing modes are limited. As a result it takes more
@	instructions to do the same job in Thumb, so the code is never
@	half as large and is always slower.
@ [***]	This is also ~35% better than compiler-generated code. A dual-
@	issue Cortex A8 core was measured to process an input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13% improvement
@ on a Cortex A8 core, or in absolute terms ~870 cycles per input block
@ [i.e. 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a 10%
@ improvement on a Cortex A8 core, or 12.2 cycles per byte.
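@ For orientation, the C-level contract implemented below is roughly
@ the following (a sketch only; the kernel glue code declares the
@ actual prototype, and "ctx"/"num" are illustrative names):
@
@	void sha1_block_data_order(u32 *ctx, const u8 *data, size_t num)
@	{
@		while (num--) {
@			/* load 16 big-endian words, expand to X[0..79],
@			 * run 80 rounds, add result back into ctx */
@			data += 64;
@		}
@	}
@
@ Register allocation, as used throughout this file:
@	r0	ctx pointer (five 32-bit chaining words)
@	r1	data pointer, advanced as each block is consumed
@	r2	block count on entry, then the end-of-input pointer
@	r3-r7	working variables A,B,C,D,E
@	r8	round constant K_xx_xx
@	r9-r12	scratch / message word being processed
@	r14	downward-moving pointer into the X[] area on the stack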
#include <linux/linkage.h>

.text

.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
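@ Note that C, D and E are kept pre-rotated right by 30 bits (the
@ ror#30 above, matched by ror#2 corrections in every round and in the
@ .L_done epilogue); this effectively folds SHA-1's "rotate B left by
@ 30" step into the operands. The expanded message schedule X[] lives
@ on the stack: sp is lowered in 15-, 25-, 20- and 20-word steps (80
@ words in total, released at .L_done) while r14 walks down through
@ the freshly stored words.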
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
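@ Three passes of the five-round block above cover rounds 0-14. Round
@ 15 is emitted once more inline below, and from round 16 onward each
@ message word is derived rather than loaded, per the SHA-1 recurrence
@	X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
@ which the code implements as four ldr's at fixed offsets from the
@ moving r14 pointer, two eor's, and ror#31 (a rotate left by 1).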
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	ldr	r8,.LK_20_39		@ [+15+16*4]
	sub	sp,sp,#25*4
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
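@ Rounds 20-39 and 60-79 differ only in the K constant, so they share
@ the loop below and are told apart by the carry flag: cmn sp,#0 (an
@ addition) clears carry before rounds 20-39, while cmp sp,#0 (a
@ subtraction of 0 from a non-zero sp) sets it before rounds 60-79.
@ The loop-end comparison uses teq precisely because it leaves the
@ carry flag untouched, so bcs can still dispatch on it afterwards.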
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
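@ In this shared loop F is the parity function B^C^D, computed by the
@ eor pair above with the usual ror#2 correction for the pre-rotated
@ registers. The ARM()/THUMB() macros below emit per-build variants of
@ the loop-end test: Thumb-2 restricts sp as a data-processing operand,
@ so that build copies sp into r11 first. Either way teq is used so
@ that the 20_39-vs-60_79 carry flag survives the comparison.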
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
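@ Rounds 40-59 use the majority function MAJ(B,C,D). The two terms
@ B&(C^D) and C&D can never have overlapping set bits, so the code
@ may sum them with two adds rather than or-ing them together:
@	E += (B & (C ^ D)) + (C & D)	@ == MAJ(B,C,D)
@ which is why each round below ends with a pair of and's and an
@ extra add ...,r11,ror#2.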
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
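@ .L_done releases the 80-word X[] frame in a single step, then folds
@ the block's result into the chaining state; C, D and E (r5-r7) pick
@ up a final ror#2 to undo the pre-rotation applied at .Lloop entry.
@ r1 was advanced past the block as it was read, so comparing it with
@ the end pointer in r2 decides whether another block follows.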
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,pc}
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2