/*
 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates 20-byte SHA-1 record in 'hash' for even number of
 * 'num_blocks' consecutive 64-byte blocks
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 *
 * Syntax: GNU as, AT&T operand order (op src, dst), preprocessed by cpp.
 * The arguments are taken from %rdi, %rsi and %rdx (see CTX/BUF/CNT).
 */

#include <linux/linkage.h>

/* Incoming arguments */
#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

/*
 * Initial assignment of the SHA-1 working variables to registers.
 * The assembler symbols A..E/TA/TB (and their 64-bit views RA..RE/
 * RTA/RTB) are re-bound by ROTATE_STATE after every round, so these
 * defines only describe the state right after REGALLOC.
 */
#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper

/* Selector values consumed by RND_FUN */
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

/* (Re)bind the working-variable symbols to their initial registers. */
.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

/* Long-lived pointers and counters (fixed for the whole function) */
#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

/* AVX2 variables */
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

/*
 * WK(t):         address of the precomputed w[t]+K dword that the scalar
 *                rounds read, relative to WK_BUF.
 * PRECALC_WK(t): address at which the vector code stores a 32-byte group
 *                of w+K values, relative to PRECALC_BUF.
 */
#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)


/* \val += \hash; \hash = \val  (\hash is a memory operand) */
.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/* Bind the WY_xx schedule symbols to their initial ymm registers. */
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

/*
 * Rotate the WY_xx symbol bindings by one position (assembly-time
 * renaming only, no instructions are emitted) and refresh the
 * WY / WY_minus_xx aliases used by the PRECALC_* macros.
 */
.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

/*
 * One slice of the vector schedule for rounds 0-15; which instruction is
 * emitted depends on (i & 7).  Reads the assembler symbols i and K_XMM.
 */
.macro PRECALC_00_15
	.if (i == 0) /* Initialize and rotate registers */
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		/* upper 128 bits come from the second input pointer */
		vinsertf128	$1, ((i-1) * 2)(BUFFER_PTR2), WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we do equal:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * allows more efficient vectorization
	 * since w[i]=>w[i-3] dependency is broken
	 */

	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm

/*
 * Emit schedule step \r (0..159).  Sets the assembler symbols i and
 * K_XMM (byte offset of the round constant row inside K_XMM_AR) and
 * dispatches to the matching PRECALC_* macro.  Steps >= 160 emit nothing.
 */
.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

/*
 * Rename the working variables for the next round (assembly-time only):
 * new A = old E, B = old TB, C = old B, D = old C, E = old D, TB = old A.
 * The 64-bit views are rotated the same way.
 */
.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

/*
 * Emit two consecutive rounds, \r and \r+1.
 * \r may be an expression such as "j+80"; it is substituted textually,
 * hence the parentheses around it in the modulo below.
 */
.macro RR r
	.set round_id, ((\r) % 80)

	.if (round_id == 0)        /* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B    /* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	/*
	 * Each round computes F for the round after it, so the round
	 * function is switched one pair ahead of rounds 20/40/60.
	 */
	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/* Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */

.endm

/*
 * Conditionally advance a pointer:
 *	\a += (\b >= \c) ? \d : 0
 * \c and \d are immediates, the comparison is signed (cmovge).
 * Uses RTA as a temporary and modifies the flags.
 */
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 *
 * Expects HASH_PTR, BUFFER_PTR, BUFFER_PTR2, BLOCKS_CTR and
 * YMM_SHUFB_BSWAP to be set up and %rsp to point at the scratch area.
 * Labels use the .L prefix so they stay local to the object file; the
 * macro is therefore meant to be expanded once per function.
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	/* Two scratch buffers on the stack, swapped after every block pair */
	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	/* Precalc WK for first 2 blocks */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr

	/* Go to next block if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
.L_loop:
	/*
	 * code loops through more than one block
	 * BLOCKS_CTR holds the number of blocks still to be hashed;
	 * leave the loop once it has reached zero
	 */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	.L_begin
	.align 32
	jmp	.L_end
	.align 32
.L_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp	.L_loop0
.L_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update Counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* No second block left: .L_loop sees zero and exits */
	test	BLOCKS_CTR, BLOCKS_CTR
	jz	.L_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */

	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop1
.L_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	.L_loop2
.L_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* update counter */
	sub	$1, BLOCKS_CTR
	/* Move to the next block only if needed*/
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	.L_loop3
.L_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/*
	 * Reset state for AVX2 reg permutation:
	 * move the values into the registers that REGALLOC (below)
	 * binds to A..E again
	 */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	.L_loop

	.align 32
.L_end:

.endm

/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 *
 * In:    CTX (%rdi), BUF (%rsi), CNT (%rdx)
 * Saves and restores %rbx, %rbp, %r12-%r15.
 * Stack: %rbp keeps the incoming %rsp while %rsp is aligned down to
 *        32 bytes and RESERVE_STACK bytes of scratch space are carved
 *        out below it.  The main body does not use %rbp.
 * vzeroupper is executed on entry and before returning.
 */
.macro SHA1_VECTOR_ASM  name
	SYM_FUNC_START(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	push	%rbp
	mov	%rsp, %rbp
	and	$~(0x20-1), %rsp
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Setup initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	/* Drop the scratch area and undo the alignment in one step */
	mov	%rbp, %rsp
	pop	%rbp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	ret

	SYM_FUNC_END(\name)
.endm

.section .rodata

/* SHA-1 round constants */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* One 32-byte row per constant; rows are selected through K_XMM */
.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

/* vpshufb control: reverses the bytes of every dword in both lanes */
BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2