/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
 * circular buffer on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16
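
/*
 * Note on the look-ahead scheme: WK(t) selects one of the 16 dword slots in
 * the 64-byte stack workspace, wrapping via "t & 15". The vector
 * pre-calculation runs W_PRECALC_AHEAD (16) rounds ahead of the scalar
 * rounds, so each group of four w[i]+K values is stored into slots whose
 * previous contents have already been consumed by the "add WK(...)"
 * instructions in RR below.
 */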

/*
 * This macro implements the SHA-1 function's body for a single 64-byte block
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%rax, %rax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	ret

	ENDPROC(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
	W_PRECALC i
  .set i, (i+1)
  .endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one,
	cmovae	K_BASE, BUFFER_PTR	# use a dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2  b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3  b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4  b, c, d
	F2 \b, \c, \d
.endm
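
/*
 * Note: the F1-F4 macros above compute the standard SHA-1 round functions,
 * just in a register-renaming friendly form:
 *   F1: ((c ^ d) & b) ^ d       == Ch(b, c, d),     rounds  0-19 (with K1)
 *   F2: b ^ c ^ d               == Parity(b, c, d), rounds 20-39 (with K2)
 *   F3: (b & c) | ((b | c) & d) == Maj(b, c, d),    rounds 40-59 (with K3)
 *   F4: same as F2 (Parity),                        rounds 60-79 (with K4)
 * The constants are not added here; they are folded into the pre-computed
 * w[i]+K values. Each macro leaves its result in T1 for RR to consume.
 */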

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
	add	WK(\round), \e
	\F	\b, \c, \d	# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F	\a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)		# pre-compute for the next iteration
    .if (i == 0)
	W_PRECALC_RESET
    .endif
	W_PRECALC_00_15
  .elseif (i < 32)
	W_PRECALC_16_31
  .elseif (i < 80)		// rounds 32-79
	W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W, W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1
	movdqa	W_TMP1, W
  .elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *   instruction
 *
 * the w[i]->w[i-3] dependency requires some "heavy-lifting" vectorization
 * for rounds 16-31, but it improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one
	# 4-wide vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
  .elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
  .elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification defines: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent:   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i]=>w[i-3] dependency
 * is broken
 */
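/*
 * Why the two forms are equivalent (a short derivation, valid for i >= 32):
 * expanding each term of the original recurrence once more gives
 *   w[i-3]  = (w[i-6]  ^ w[i-11] ^ w[i-17] ^ w[i-19]) rol 1
 *   w[i-8]  = (w[i-11] ^ w[i-16] ^ w[i-22] ^ w[i-24]) rol 1
 *   w[i-14] = (w[i-17] ^ w[i-22] ^ w[i-28] ^ w[i-30]) rol 1
 *   w[i-16] = (w[i-19] ^ w[i-24] ^ w[i-30] ^ w[i-32]) rol 1
 * XORing the four right-hand sides cancels every word that appears twice and
 * leaves (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 1; applying the outer
 * "rol 1" of the original recurrence then yields the "rol 2" form used below.
 */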
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
  .elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
  .elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
 *					 unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
  .elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
  .elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
  .elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
  .endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
 *				       unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_avx

#endif