#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Take the first command-line argument that looks like a plain output
# filename (word characters, a dash, one dot-suffix) and redirect STDOUT
# to it; everything the script print()s from here on is generated
# assembly.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

# Register allocation for the integer-only (scalar) code path.  The
# eight-word working state a..h lives in r4-r11 (@V); r0-r3 double as
# argument registers ($ctx,$inp,$len) and round temporaries
# ($t0,$t1,$t3,$t4), which is why each gets two names.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# SHA-256 rotation/shift amounts (FIPS 180-4): Sigma0/Sigma1 are the
# big-sigma round functions, sigma0/sigma1 the message-schedule ones
# (third element of the lower-case pairs is a logical shift, not a
# rotate).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Emit one scalar SHA-256 round.  For rounds 0..15 the first heredoc
# additionally loads (and on little-endian byte-reverses) the next input
# word X[i]; the ARMv4 fallback assembles it from four ldrb's.  Note the
# "h+=Maj(a,b,c) from the past" lines: the Maj() addition of the
# *previous* round is deferred into this one for scheduling, which is
# also why ($t2,$t3) are swapped at the end.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
# Round proper: h += X[i] + K256[i] + Sigma1(e) + Ch(e,f,g); d += h;
# h += Sigma0(a) (+ Maj, deferred to the next round).  At i==31 the K256
# terminator word is tested to detect the end of the 48 expansion rounds.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);	# swap temporaries: Maj result is consumed next round
}

# Emit one scalar round for i>=16: first expand the message schedule,
# X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]) (indices
# here are mod 16 slots on the stack), then fall through to the common
# round body in BODY_00_15.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

# Static assembly preamble: arch configuration, the K256 round-constant
# table (with a zero terminator word used by the i==31 check above), and
# the ARMv4/ARMv7 integer entry point sha256_block_data_order, which
# dispatches to the NEON or ARMv8 bodies at run time when
# OPENSSL_armcap_P advertises them (userspace builds only).
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll 16 load rounds, then the shared 16-round loop body used for
# rounds 16..63 (the K256 terminator check decides when to leave it).
# unshift(@V,pop(@V)) rotates the a..h register roles each round.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables back into the context, advance to
# the next 64-byte block, loop until inp reaches inp+len.
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Map a NEON quad register name ("qN") to its low/high double-register
# half ("d2N" / "d2N+1"); returns "" for anything that doesn't match.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# Any undefined sub call (e.g. &vext_8, &vadd_i32) is turned into an
# emitted instruction: underscores become dots in the mnemonic, and a
# purely numeric final argument gets a "#" immediate prefix.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Emit the NEON message-schedule update for four rounds (four new X
# words) while interleaving the scalar round instructions produced by
# four calls to $body — the eval(shift(@insns)) lines thread those
# scalar instructions between the vector ones for dual-issue scheduling.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Emit four rounds for the *next* block's first 16 words: no schedule
# expansion needed, just byte-reverse the freshly loaded X, add the next
# K256 quad and stash X+K for the scalar rounds, again interleaving the
# scalar instructions from $body.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# Return the list of code strings that make up one scalar round for the
# NEON path; each string is eval'ed later (via Xupdate/Xpreload) so the
# instructions can be interleaved with the vector ones.  Same deferred
# Maj() / ($t2,$t3) swap trick as the integer BODY_00_15; $j counts
# rounds so X[i]/K lookups pick the right stack slot.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

# NEON entry point: build an aligned 16-word X[] scratch area below sp
# (original sp saved at [sp,#76]), load/byte-swap the first block, then
# alternate Xupdate (expansion rounds) and Xpreload (next block's first
# 16 rounds) until all input is consumed.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

# ARMv8 crypto-extension path: uses sha256h/sha256h2/sha256su0/sha256su1
# instructions, encoded as raw INST() bytes by unsha256() below so the
# file still assembles with pre-ARMv8 toolchains.  Userspace only
# (guarded by !defined(__KERNEL__)).
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# 12 quad-rounds with message-schedule update (sha256su0/su1), W0/W1
# ping-pong between iterations and @MSG rotates.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# Final 4 quad-rounds need no further schedule expansion.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Copy this script's own leading comment block (minus the shebang) into
# the output as "@"-style assembler comments; stops at the first line
# that is neither a "#" comment nor blank.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # Translate a "sha256* qD,[qN,]qM" mnemonic into an INST(...) byte
    # sequence: the q-register numbers are split into their 3+1 encoding
    # bit fields and OR'ed into the base opcode, then emitted
    # little-endian byte by byte.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Final output pass: evaluate `...` constant expressions embedded in the
# assembly, rewrite sha256* mnemonics via unsha256(), and lower
# ret/bx lr for pre-Thumb targets.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush