/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit constant in non volatile register	*/

/*
 * LOAD_K<n>1 is selected by the "k" argument of the round macros:
 * k = 0 keeps the current constant in rK, k = 1..4 splats the n-th
 * word of the table at rKP into both halves of rK.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);

/*
 * Open a 128 byte stack frame and store all 64 bits of r14-r23 at
 * offsets 8..87 of it.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1); \
	evstdw		r18,40(r1); \
	evstdw		r19,48(r1); \
	evstdw		r20,56(r1); \
	evstdw		r21,64(r1); \
	evstdw		r22,72(r1); \
	evstdw		r23,80(r1);

/*
 * Reload r14-r23 from the frame, overwrite the first word of every
 * save slot with zero and release the frame. Leaves r0 = 0.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1); \
	evldw		r16,24(r1); \
	evldw		r17,32(r1); \
	evldw		r18,40(r1); \
	evldw		r19,48(r1); \
	evldw		r20,56(r1); \
	evldw		r21,64(r1); \
	evldw		r22,72(r1); \
	evldw		r23,80(r1); \
	xor		r0,r0,r0; \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1); \
	stw		r0,72(r1); \
	stw		r0,80(r1); \
	addi		r1,r1,128;	/* cleanup stack frame		*/

/*
 * Big endian: input words are read at a fixed offset from rWP and rWP
 * advances once per block. Otherwise: words are read byte reversed at
 * rWP, which advances after every word ("off" is unused in that case).
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

/*
 * Two interleaved rounds that take their W values directly from the
 * input: w0 and w1 are loaded via LOAD_DATA and finally merged into w1
 * as one 64 bit pair. The instruction order is hand scheduled.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	LOAD_K##k##1							   \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/* mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/

/*
 * Two interleaved rounds with the same F function as R_00_15. The pair
 * of W values is computed in the 64 bit register w0 from the earlier
 * pairs w1, w4, w6 and w7, and K is added to both halves with evaddw.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/* W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3]		*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/* W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/* W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/* W = W rotl 1			*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/* WK = W + K			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/* WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/

/*
 * Two interleaved rounds with F = B xor C xor D. The W pair is derived
 * in w0 exactly as in R_16_19.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/* W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3]		*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/* W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/* W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/* W = W rotl 1			*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/* WK = W + K			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/* WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/

/*
 * Two interleaved rounds with F = (B and C) or ((B or C) and D). The W
 * pair is derived in w0 exactly as in R_16_19.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/* W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/* W = W[-16] xor W[-3]		*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/* W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/* W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/* W = W rotl 1			*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/* WK = W + K			*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/* WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/

/* Rounds 60-79 use the same round code as rounds 20-39. */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)

/*
 * ppc_spe_sha1_transform
 *
 * In:	r3 = pointer to the five 32 bit hash words. They are loaded once
 *	     at entry and the updated words are stored back after every
 *	     processed block.
 *	r4 = pointer to the input data; one block is 16 words (64 bytes).
 *	r5 = number of blocks to process. The value is moved to CTR and
 *	     r5 is then reused as pointer to PPC_SPE_SHA1_K.
 *
 * A block count of zero returns immediately without touching the hash
 * words. Without that check the bottom tested bdnz loop would process
 * one block and then wrap CTR around instead of terminating.
 *
 * r14-r23 are saved and restored with all 64 bits by INITIALIZE and
 * FINALIZE. r0, r4-r12, CTR and cr0 are modified.
 */
_GLOBAL(ppc_spe_sha1_transform)
	cmpwi		r5,0		/* nothing to do for 0 blocks	*/
	beqlr				/* return before frame setup	*/

	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5		/* CTR = block count		*/
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	/*
	 * The previous hash words are reloaded into registers that the
	 * remaining rounds no longer read, interleaved with those rounds.
	 */
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	add		rH0,rH0,rT3	/* add previous hash value	*/
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main

	FINALIZE
	blr

/* The round constants are only ever read, keep them out of .data */
.section .rodata
.align 4
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6