#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
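	@ 0xc67178f2 is the last K256 word, so a low byte of 0xf2 marks the
	@ final 16-round batch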
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
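	@ clear the Maj accumulator: round 0 has no Maj(a,b,c) "from the past" to fold in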
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
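	# the vshr_u32/vsli_32 pairs above emulate 32-bit rotates, since NEON has
	# no vector rotate instruction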
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

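	@ load the first 64-byte block and the first 16 K256 words, then stash
	@ X[i]+K256[i] for rounds 0..15 in the aligned stack frame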
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!			@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
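# each of these 12 iterations retires four rounds (one batch of K256 words via
# SHA256H/SHA256H2) and extends the message schedule; the final 16 rounds,
# which need no schedule update, follow unrolled below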
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;			# enforce flush
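
# A minimal scalar reference of the SHA-256 sigma functions implemented by the
# code above, kept purely as documentation: the helper below is illustrative
# only and is never called by this generator. The rotation/shift amounts
# correspond to @Sigma0/@Sigma1 and @sigma0/@sigma1 defined near the top of
# this file; 32-bit wrap-around is emulated with an explicit mask.
sub _ref_sigma {
	my ($x,$r1,$r2,$s,$upper) = @_;	# e.g. (x,2,13,22,1) => Sigma0, (x,7,18,3,0) => sigma0
	my $ror = sub { my ($v,$n) = @_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff; };
	my $last = $upper ? $ror->($x,$s) : ($x>>$s);	# upper-case Sigma rotates, lower-case sigma shifts
	return ($ror->($x,$r1) ^ $ror->($x,$r2) ^ $last) & 0xffffffff;
}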