1/* SPDX-License-Identifier: GPL-2.0-or-later */ 2/* 3 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental 4 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture 5 * processors. CPUs supporting Intel(R) AVX extensions will get an additional 6 * boost. 7 * 8 * This work was inspired by the vectorized implementation of Dean Gaudet. 9 * Additional information on it can be found at: 10 * http://www.arctic.org/~dean/crypto/sha1.html 11 * 12 * It was improved upon with more efficient vectorization of the message 13 * scheduling. This implementation has also been optimized for all current and 14 * several future generations of Intel CPUs. 15 * 16 * See this article for more information about the implementation details: 17 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ 18 * 19 * Copyright (C) 2010, Intel Corp. 20 * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> 21 * Ronen Zohar <ronen.zohar@intel.com> 22 * 23 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: 24 * Author: Mathias Krause <minipli@googlemail.com> 25 */ 26 27#include <linux/linkage.h> 28#include <linux/cfi_types.h> 29 30#define CTX %rdi // arg1 31#define BUF %rsi // arg2 32#define CNT %rdx // arg3 33 34#define REG_A %ecx 35#define REG_B %esi 36#define REG_C %edi 37#define REG_D %r12d 38#define REG_E %edx 39 40#define REG_T1 %eax 41#define REG_T2 %ebx 42 43#define K_BASE %r8 44#define HASH_PTR %r9 45#define BUFFER_PTR %r10 46#define BUFFER_END %r11 47 48#define W_TMP1 %xmm0 49#define W_TMP2 %xmm9 50 51#define W0 %xmm1 52#define W4 %xmm2 53#define W8 %xmm3 54#define W12 %xmm4 55#define W16 %xmm5 56#define W20 %xmm6 57#define W24 %xmm7 58#define W28 %xmm8 59 60#define XMM_SHUFB_BSWAP %xmm10 61 62/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ 63#define WK(t) (((t) & 15) * 4)(%rsp) 64#define W_PRECALC_AHEAD 16 65 66/* 67 * This macro implements the SHA-1 function's body for single 64-byte block 68 * param: function's name 69 */ 70.macro SHA1_VECTOR_ASM name 71 SYM_TYPED_FUNC_START(\name) 72 73 push %rbx 74 push %r12 75 push %rbp 76 mov %rsp, %rbp 77 78 sub $64, %rsp # allocate workspace 79 and $~15, %rsp # align stack 80 81 mov CTX, HASH_PTR 82 mov BUF, BUFFER_PTR 83 84 shl $6, CNT # multiply by 64 85 add BUF, CNT 86 mov CNT, BUFFER_END 87 88 lea K_XMM_AR(%rip), K_BASE 89 xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP 90 91 SHA1_PIPELINED_MAIN_BODY 92 93 # cleanup workspace 94 mov $8, %ecx 95 mov %rsp, %rdi 96 xor %eax, %eax 97 rep stosq 98 99 mov %rbp, %rsp # deallocate workspace 100 pop %rbp 101 pop %r12 102 pop %rbx 103 RET 104 105 SYM_FUNC_END(\name) 106.endm 107 108/* 109 * This macro implements 80 rounds of SHA-1 for one 64-byte block 110 */ 111.macro SHA1_PIPELINED_MAIN_BODY 112 INIT_REGALLOC 113 114 mov (HASH_PTR), A 115 mov 4(HASH_PTR), B 116 mov 8(HASH_PTR), C 117 mov 12(HASH_PTR), D 118 mov 16(HASH_PTR), E 119 120 .set i, 0 121 .rept W_PRECALC_AHEAD 122 W_PRECALC i 123 .set i, (i+1) 124 .endr 125 126.align 4 1271: 128 RR F1,A,B,C,D,E,0 129 RR F1,D,E,A,B,C,2 130 RR F1,B,C,D,E,A,4 131 RR F1,E,A,B,C,D,6 132 RR F1,C,D,E,A,B,8 133 134 RR F1,A,B,C,D,E,10 135 RR F1,D,E,A,B,C,12 136 RR F1,B,C,D,E,A,14 137 RR F1,E,A,B,C,D,16 138 RR F1,C,D,E,A,B,18 139 140 RR F2,A,B,C,D,E,20 141 RR F2,D,E,A,B,C,22 142 RR F2,B,C,D,E,A,24 143 RR F2,E,A,B,C,D,26 144 RR F2,C,D,E,A,B,28 145 146 RR F2,A,B,C,D,E,30 147 RR F2,D,E,A,B,C,32 148 RR F2,B,C,D,E,A,34 149 RR F2,E,A,B,C,D,36 150 RR F2,C,D,E,A,B,38 151 152 RR F3,A,B,C,D,E,40 153 RR F3,D,E,A,B,C,42 154 RR F3,B,C,D,E,A,44 155 RR F3,E,A,B,C,D,46 156 RR F3,C,D,E,A,B,48 157 158 RR F3,A,B,C,D,E,50 159 RR F3,D,E,A,B,C,52 160 RR F3,B,C,D,E,A,54 161 RR F3,E,A,B,C,D,56 162 RR F3,C,D,E,A,B,58 163 164 add $64, BUFFER_PTR # move to the next 64-byte block 165 cmp BUFFER_END, BUFFER_PTR # if the current is the last one use 166 cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun 167 168 RR F4,A,B,C,D,E,60 169 RR F4,D,E,A,B,C,62 170 RR F4,B,C,D,E,A,64 171 RR F4,E,A,B,C,D,66 172 RR F4,C,D,E,A,B,68 173 174 RR F4,A,B,C,D,E,70 175 RR F4,D,E,A,B,C,72 176 RR F4,B,C,D,E,A,74 177 RR F4,E,A,B,C,D,76 178 RR F4,C,D,E,A,B,78 179 180 UPDATE_HASH (HASH_PTR), A 181 UPDATE_HASH 4(HASH_PTR), B 182 UPDATE_HASH 8(HASH_PTR), C 183 UPDATE_HASH 12(HASH_PTR), D 184 UPDATE_HASH 16(HASH_PTR), E 185 186 RESTORE_RENAMED_REGS 187 cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end 188 jne 1b 189.endm 190 191.macro INIT_REGALLOC 192 .set A, REG_A 193 .set B, REG_B 194 .set C, REG_C 195 .set D, REG_D 196 .set E, REG_E 197 .set T1, REG_T1 198 .set T2, REG_T2 199.endm 200 201.macro RESTORE_RENAMED_REGS 202 # order is important (REG_C is where it should be) 203 mov B, REG_B 204 mov D, REG_D 205 mov A, REG_A 206 mov E, REG_E 207.endm 208 209.macro SWAP_REG_NAMES a, b 210 .set _T, \a 211 .set \a, \b 212 .set \b, _T 213.endm 214 215.macro F1 b, c, d 216 mov \c, T1 217 SWAP_REG_NAMES \c, T1 218 xor \d, T1 219 and \b, T1 220 xor \d, T1 221.endm 222 223.macro F2 b, c, d 224 mov \d, T1 225 SWAP_REG_NAMES \d, T1 226 xor \c, T1 227 xor \b, T1 228.endm 229 230.macro F3 b, c ,d 231 mov \c, T1 232 SWAP_REG_NAMES \c, T1 233 mov \b, T2 234 or \b, T1 235 and \c, T2 236 and \d, T1 237 or T2, T1 238.endm 239 240.macro F4 b, c, d 241 F2 \b, \c, \d 242.endm 243 244.macro UPDATE_HASH hash, val 245 add \hash, \val 246 mov \val, \hash 247.endm 248 249/* 250 * RR does two rounds of SHA-1 back to back with W[] pre-calc 251 * t1 = F(b, c, d); e += w(i) 252 * e += t1; b <<= 30; d += w(i+1); 253 * t1 = F(a, b, c); 254 * d += t1; a <<= 5; 255 * e += a; 256 * t1 = e; a >>= 7; 257 * t1 <<= 5; 258 * d += t1; 259 */ 260.macro RR F, a, b, c, d, e, round 261 add WK(\round), \e 262 \F \b, \c, \d # t1 = F(b, c, d); 263 W_PRECALC (\round + W_PRECALC_AHEAD) 264 rol $30, \b 265 add T1, \e 266 add WK(\round + 1), \d 267 268 \F \a, \b, \c 269 W_PRECALC (\round + W_PRECALC_AHEAD + 1) 270 rol $5, \a 271 add \a, \e 272 add T1, \d 273 ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) 274 275 mov \e, T1 276 SWAP_REG_NAMES \e, T1 277 278 rol $5, T1 279 add T1, \d 280 281 # write: \a, \b 282 # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c 283.endm 284 285.macro W_PRECALC r 286 .set i, \r 287 288 .if (i < 20) 289 .set K_XMM, 0 290 .elseif (i < 40) 291 .set K_XMM, 16 292 .elseif (i < 60) 293 .set K_XMM, 32 294 .elseif (i < 80) 295 .set K_XMM, 48 296 .endif 297 298 .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) 299 .set i, ((\r) % 80) # pre-compute for the next iteration 300 .if (i == 0) 301 W_PRECALC_RESET 302 .endif 303 W_PRECALC_00_15 304 .elseif (i<32) 305 W_PRECALC_16_31 306 .elseif (i < 80) // rounds 32-79 307 W_PRECALC_32_79 308 .endif 309.endm 310 311.macro W_PRECALC_RESET 312 .set W, W0 313 .set W_minus_04, W4 314 .set W_minus_08, W8 315 .set W_minus_12, W12 316 .set W_minus_16, W16 317 .set W_minus_20, W20 318 .set W_minus_24, W24 319 .set W_minus_28, W28 320 .set W_minus_32, W 321.endm 322 323.macro W_PRECALC_ROTATE 324 .set W_minus_32, W_minus_28 325 .set W_minus_28, W_minus_24 326 .set W_minus_24, W_minus_20 327 .set W_minus_20, W_minus_16 328 .set W_minus_16, W_minus_12 329 .set W_minus_12, W_minus_08 330 .set W_minus_08, W_minus_04 331 .set W_minus_04, W 332 .set W, W_minus_32 333.endm 334 335.macro W_PRECALC_SSSE3 336 337.macro W_PRECALC_00_15 338 W_PRECALC_00_15_SSSE3 339.endm 340.macro W_PRECALC_16_31 341 W_PRECALC_16_31_SSSE3 342.endm 343.macro W_PRECALC_32_79 344 W_PRECALC_32_79_SSSE3 345.endm 346 347/* message scheduling pre-compute for rounds 0-15 */ 348.macro W_PRECALC_00_15_SSSE3 349 .if ((i & 3) == 0) 350 movdqu (i*4)(BUFFER_PTR), W_TMP1 351 .elseif ((i & 3) == 1) 352 pshufb XMM_SHUFB_BSWAP, W_TMP1 353 movdqa W_TMP1, W 354 .elseif ((i & 3) == 2) 355 paddd (K_BASE), W_TMP1 356 .elseif ((i & 3) == 3) 357 movdqa W_TMP1, WK(i&~3) 358 W_PRECALC_ROTATE 359 .endif 360.endm 361 362/* message scheduling pre-compute for rounds 16-31 363 * 364 * - calculating last 32 w[i] values in 8 XMM registers 365 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add 366 * instruction 367 * 368 * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] 369 * dependency, but improves for 32-79 370 */ 371.macro W_PRECALC_16_31_SSSE3 372 # blended scheduling of vector and scalar instruction streams, one 4-wide 373 # vector iteration / 4 scalar rounds 374 .if ((i & 3) == 0) 375 movdqa W_minus_12, W 376 palignr $8, W_minus_16, W # w[i-14] 377 movdqa W_minus_04, W_TMP1 378 psrldq $4, W_TMP1 # w[i-3] 379 pxor W_minus_08, W 380 .elseif ((i & 3) == 1) 381 pxor W_minus_16, W_TMP1 382 pxor W_TMP1, W 383 movdqa W, W_TMP2 384 movdqa W, W_TMP1 385 pslldq $12, W_TMP2 386 .elseif ((i & 3) == 2) 387 psrld $31, W 388 pslld $1, W_TMP1 389 por W, W_TMP1 390 movdqa W_TMP2, W 391 psrld $30, W_TMP2 392 pslld $2, W 393 .elseif ((i & 3) == 3) 394 pxor W, W_TMP1 395 pxor W_TMP2, W_TMP1 396 movdqa W_TMP1, W 397 paddd K_XMM(K_BASE), W_TMP1 398 movdqa W_TMP1, WK(i&~3) 399 W_PRECALC_ROTATE 400 .endif 401.endm 402 403/* message scheduling pre-compute for rounds 32-79 404 * 405 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 406 * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 407 * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken 408 */ 409.macro W_PRECALC_32_79_SSSE3 410 .if ((i & 3) == 0) 411 movdqa W_minus_04, W_TMP1 412 pxor W_minus_28, W # W is W_minus_32 before xor 413 palignr $8, W_minus_08, W_TMP1 414 .elseif ((i & 3) == 1) 415 pxor W_minus_16, W 416 pxor W_TMP1, W 417 movdqa W, W_TMP1 418 .elseif ((i & 3) == 2) 419 psrld $30, W 420 pslld $2, W_TMP1 421 por W, W_TMP1 422 .elseif ((i & 3) == 3) 423 movdqa W_TMP1, W 424 paddd K_XMM(K_BASE), W_TMP1 425 movdqa W_TMP1, WK(i&~3) 426 W_PRECALC_ROTATE 427 .endif 428.endm 429 430.endm // W_PRECALC_SSSE3 431 432 433#define K1 0x5a827999 434#define K2 0x6ed9eba1 435#define K3 0x8f1bbcdc 436#define K4 0xca62c1d6 437 438.section .rodata 439.align 16 440 441K_XMM_AR: 442 .long K1, K1, K1, K1 443 .long K2, K2, K2, K2 444 .long K3, K3, K3, K3 445 .long K4, K4, K4, K4 446 447BSWAP_SHUFB_CTL: 448 .long 0x00010203 449 .long 0x04050607 450 .long 0x08090a0b 451 .long 0x0c0d0e0f 452 453 454.section .text 455 456W_PRECALC_SSSE3 457.macro xmm_mov a, b 458 movdqu \a,\b 459.endm 460 461/* 462 * SSSE3 optimized implementation: 463 * 464 * extern "C" void sha1_transform_ssse3(struct sha1_state *state, 465 * const u8 *data, int blocks); 466 * 467 * Note that struct sha1_state is assumed to begin with u32 state[5]. 468 */ 469SHA1_VECTOR_ASM sha1_transform_ssse3 470 471.macro W_PRECALC_AVX 472 473.purgem W_PRECALC_00_15 474.macro W_PRECALC_00_15 475 W_PRECALC_00_15_AVX 476.endm 477.purgem W_PRECALC_16_31 478.macro W_PRECALC_16_31 479 W_PRECALC_16_31_AVX 480.endm 481.purgem W_PRECALC_32_79 482.macro W_PRECALC_32_79 483 W_PRECALC_32_79_AVX 484.endm 485 486.macro W_PRECALC_00_15_AVX 487 .if ((i & 3) == 0) 488 vmovdqu (i*4)(BUFFER_PTR), W_TMP1 489 .elseif ((i & 3) == 1) 490 vpshufb XMM_SHUFB_BSWAP, W_TMP1, W 491 .elseif ((i & 3) == 2) 492 vpaddd (K_BASE), W, W_TMP1 493 .elseif ((i & 3) == 3) 494 vmovdqa W_TMP1, WK(i&~3) 495 W_PRECALC_ROTATE 496 .endif 497.endm 498 499.macro W_PRECALC_16_31_AVX 500 .if ((i & 3) == 0) 501 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] 502 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] 503 vpxor W_minus_08, W, W 504 vpxor W_minus_16, W_TMP1, W_TMP1 505 .elseif ((i & 3) == 1) 506 vpxor W_TMP1, W, W 507 vpslldq $12, W, W_TMP2 508 vpslld $1, W, W_TMP1 509 .elseif ((i & 3) == 2) 510 vpsrld $31, W, W 511 vpor W, W_TMP1, W_TMP1 512 vpslld $2, W_TMP2, W 513 vpsrld $30, W_TMP2, W_TMP2 514 .elseif ((i & 3) == 3) 515 vpxor W, W_TMP1, W_TMP1 516 vpxor W_TMP2, W_TMP1, W 517 vpaddd K_XMM(K_BASE), W, W_TMP1 518 vmovdqu W_TMP1, WK(i&~3) 519 W_PRECALC_ROTATE 520 .endif 521.endm 522 523.macro W_PRECALC_32_79_AVX 524 .if ((i & 3) == 0) 525 vpalignr $8, W_minus_08, W_minus_04, W_TMP1 526 vpxor W_minus_28, W, W # W is W_minus_32 before xor 527 .elseif ((i & 3) == 1) 528 vpxor W_minus_16, W_TMP1, W_TMP1 529 vpxor W_TMP1, W, W 530 .elseif ((i & 3) == 2) 531 vpslld $2, W, W_TMP1 532 vpsrld $30, W, W 533 vpor W, W_TMP1, W 534 .elseif ((i & 3) == 3) 535 vpaddd K_XMM(K_BASE), W, W_TMP1 536 vmovdqu W_TMP1, WK(i&~3) 537 W_PRECALC_ROTATE 538 .endif 539.endm 540 541.endm // W_PRECALC_AVX 542 543W_PRECALC_AVX 544.purgem xmm_mov 545.macro xmm_mov a, b 546 vmovdqu \a,\b 547.endm 548 549 550/* AVX optimized implementation: 551 * extern "C" void sha1_transform_avx(struct sha1_state *state, 552 * const u8 *data, int blocks); 553 */ 554SHA1_VECTOR_ASM sha1_transform_avx 555