#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0

@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.

@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is twice smaller, they are not as
@	diverse as ARM ones: e.g., there are only two arithmetic
@	instructions with 3 arguments, no [fixed] rotate, addressing
@	modes are limited. As result it takes more instructions to do
@	the same job in Thumb, therefore the code is never twice as
@	small and always slower.
@ [***]	which is also ~35% better than compiler generated code. Dual-
@	issue Cortex A8 core was measured to process input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.

#include <linux/linkage.h>

.text

@ void sha1_block_data_order(u32 *state, const u8 *data, int blocks)
@ In:	r0 = pointer to five 32-bit chain values (A..E, loaded below)
@	r1 = input data (unaligned input handled: byte loads on
@	     __ARM_ARCH__<7, ldr+rev otherwise)
@	r2 = number of 64-byte blocks to process
@ Saves/restores r4-r12,lr; builds an 80-word X[] schedule on the
@ stack, fully released before return. r14 is repurposed as the X[]
@ frame pointer inside the loop (lr is already saved on the stack).
.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

	ldr	r8,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307

	ldmia	sp!,{r4-r12,pc}
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2