#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
#
# Usage: <this script> <flavour> <output>
#
# <flavour> must contain "64" or "32" (anything else is fatal) and may
# end in "le" to select the little-endian code paths. Both arguments
# are handed on verbatim to ppc-xlate.pl, which receives the generated
# assembly on its standard input.

$flavour=shift;
$output =shift;

# Word-size dependent parameters. Nothing in the assembly template
# below references them; they are set for both flavours regardless.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# Low-precedence "or" is required: with "||" the die would bind to the
# (always true) command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my $vrsave="r12";

# Assembly template. Three entry points are emitted:
#   .gcm_init_p8  - loads H from r4, derives the "twisted" H plus the
#                   0xc2... constant and stores the four-entry table
#                   at r3;
#   .gcm_gmult_p8 - loads Xi from $Xip, multiplies it using the table
#                   at $Htbl and writes Xi back;
#   .gcm_ghash_p8 - same multiplication, looped over 16-byte blocks
#                   read from $inp, with $len counting remaining bytes.
# Lines prefixed with "le?" / "be?" are endian-specific and are
# resolved by the filter loop at the bottom of this script.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Resolve the endian-specific prefixes line by line: for a flavour
# ending in "le" the "le?" marker is stripped (enabling the line) and
# "be?" is turned into "#be#"; otherwise it is the other way around.
foreach (split("\n",$code)) {
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

# Closing the pipe flushes the output and waits for the translator;
# close returns false if the flush fails or the child exits non-zero,
# so treat that as a fatal error instead of silently succeeding.
close STDOUT or die "error closing STDOUT: $!";